MDEV-34062: Implement innodb_log_file_mmap on 64-bit systems

When using the default innodb_log_buffer_size=2m, mariadb-backup --backup
would spend a lot of time re-reading and re-parsing the log. For reads,
it would be beneficial to memory-map the entire ib_logfile0 to the
address space (typically 48 bits or 256 TiB) and read it from there,
both during --backup and --prepare.

We will introduce the Boolean read-only parameter innodb_log_file_mmap
that will be OFF by default on most platforms, to avoid aggressive
read-ahead of the entire ib_logfile0 when only a tiny portion would be
accessed. On Linux and FreeBSD the default is innodb_log_file_mmap=ON,
because those platforms define a specific mmap(2) option for enabling
such read-ahead and therefore it can be assumed that the default would
be on-demand paging. This parameter will only have impact on the initial
InnoDB startup and recovery. Any writes to the log will use regular I/O,
except when the ib_logfile0 is stored in a specially configured file system
that is backed by persistent memory (Linux "mount -o dax").

We also experimented with allowing writes of the ib_logfile0 via a
memory mapping and decided against it. A fundamental problem would be
unnecessary read-before-write in case of a major page fault, that is,
when a new, not yet cached, virtual memory page in the circular
ib_logfile0 is being written to. There appears to be no way to tell
the operating system that we do not care about the previous contents of
the page, or that the page fault handler should just zero it out.

Many references to HAVE_PMEM have been replaced with references to
HAVE_INNODB_MMAP.

The predicate log_sys.is_pmem() has been replaced with
log_sys.is_mmap() && !log_sys.is_opened().

Memory-mapped regular files differ from MAP_SYNC (PMEM) mappings in the
way that an open file handle to ib_logfile0 will be retained. In both
code paths, log_sys.is_mmap() will hold. Holding a file handle open will
allow log_t::clear_mmap() to disable the interface with fewer operations.

It should be noted that ever since
commit 685d958e38 (MDEV-14425)
most 64-bit Linux platforms in our CI environment
(s390x a.k.a. IBM System Z being a notable exception) read and write
/dev/shm/*/ib_logfile0 via a memory mapping, pretending that it is
persistent memory (mount -o dax). So, the memory mapping based log
parsing that this change is enabling by default on Linux and FreeBSD
has already been extensively tested on Linux.

::log_mmap(): If a log cannot be opened as PMEM and the desired access
is read-only, try to open a read-only memory mapping.

xtrabackup_copy_mmap_snippet(), xtrabackup_copy_mmap_logfile():
Copy the InnoDB log in mariadb-backup --backup from a memory
mapped file.
This commit is contained in:
Marko Mäkelä 2024-09-26 18:47:12 +03:00
parent 971cf59579
commit 6acada713a
22 changed files with 579 additions and 296 deletions

View file

@ -61,7 +61,6 @@ SET(HAVE_GETIFADDRS CACHE INTERNAL "")
SET(HAVE_GETCWD 1 CACHE INTERNAL "") SET(HAVE_GETCWD 1 CACHE INTERNAL "")
SET(HAVE_GETHOSTBYADDR_R CACHE INTERNAL "") SET(HAVE_GETHOSTBYADDR_R CACHE INTERNAL "")
SET(HAVE_GETHRTIME CACHE INTERNAL "") SET(HAVE_GETHRTIME CACHE INTERNAL "")
SET(HAVE_GETPAGESIZE CACHE INTERNAL "")
SET(HAVE_GETPASS CACHE INTERNAL "") SET(HAVE_GETPASS CACHE INTERNAL "")
SET(HAVE_GETMNTENT CACHE INTERNAL "") SET(HAVE_GETMNTENT CACHE INTERNAL "")
SET(HAVE_GETMNTENT_IN_SYS_MNTAB CACHE INTERNAL "") SET(HAVE_GETMNTENT_IN_SYS_MNTAB CACHE INTERNAL "")

View file

@ -151,7 +151,6 @@
#cmakedefine HAVE_GETCWD 1 #cmakedefine HAVE_GETCWD 1
#cmakedefine HAVE_GETHOSTBYADDR_R 1 #cmakedefine HAVE_GETHOSTBYADDR_R 1
#cmakedefine HAVE_GETHRTIME 1 #cmakedefine HAVE_GETHRTIME 1
#cmakedefine HAVE_GETPAGESIZE 1
#cmakedefine HAVE_GETPAGESIZES 1 #cmakedefine HAVE_GETPAGESIZES 1
#cmakedefine HAVE_GETPASS 1 #cmakedefine HAVE_GETPASS 1
#cmakedefine HAVE_GETPASSPHRASE 1 #cmakedefine HAVE_GETPASSPHRASE 1

View file

@ -463,7 +463,6 @@ CHECK_SYMBOL_EXISTS(madvise "sys/mman.h" HAVE_DECL_MADVISE)
CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES) CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES)
CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME) CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME)
CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48) CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48)
CHECK_SYMBOL_EXISTS(getpagesize "unistd.h" HAVE_GETPAGESIZE)
CHECK_SYMBOL_EXISTS(TIOCGWINSZ "sys/ioctl.h" GWINSZ_IN_SYS_IOCTL) CHECK_SYMBOL_EXISTS(TIOCGWINSZ "sys/ioctl.h" GWINSZ_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(FIONREAD "sys/ioctl.h" FIONREAD_IN_SYS_IOCTL) CHECK_SYMBOL_EXISTS(FIONREAD "sys/ioctl.h" FIONREAD_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(TIOCSTAT "sys/ioctl.h" TIOCSTAT_IN_SYS_IOCTL) CHECK_SYMBOL_EXISTS(TIOCSTAT "sys/ioctl.h" TIOCSTAT_IN_SYS_IOCTL)

View file

@ -205,6 +205,8 @@ lsn_t checkpoint_lsn_start;
lsn_t checkpoint_no_start; lsn_t checkpoint_no_start;
/** whether log_copying_thread() is active; protected by recv_sys.mutex */ /** whether log_copying_thread() is active; protected by recv_sys.mutex */
static bool log_copying_running; static bool log_copying_running;
/** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */
lsn_t metadata_to_lsn;
uint xtrabackup_parallel; uint xtrabackup_parallel;
@ -236,7 +238,6 @@ my_bool opt_encrypted_backup;
#define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints" #define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints"
char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/ char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/
static lsn_t metadata_from_lsn; static lsn_t metadata_from_lsn;
lsn_t metadata_to_lsn;
static lsn_t metadata_last_lsn; static lsn_t metadata_last_lsn;
static ds_file_t* dst_log_file; static ds_file_t* dst_log_file;
@ -282,9 +283,6 @@ my_bool xtrabackup_incremental_force_scan = FALSE;
*/ */
ulong xtrabackup_innodb_force_recovery = 0; ulong xtrabackup_innodb_force_recovery = 0;
/* The flushed lsn which is read from data files */
lsn_t flushed_lsn= 0;
ulong xb_open_files_limit= 0; ulong xb_open_files_limit= 0;
char *xb_plugin_dir; char *xb_plugin_dir;
char *xb_plugin_load; char *xb_plugin_load;
@ -1329,6 +1327,9 @@ enum options_xtrabackup
OPT_INNODB_BUFFER_POOL_FILENAME, OPT_INNODB_BUFFER_POOL_FILENAME,
OPT_INNODB_LOCK_WAIT_TIMEOUT, OPT_INNODB_LOCK_WAIT_TIMEOUT,
OPT_INNODB_LOG_BUFFER_SIZE, OPT_INNODB_LOG_BUFFER_SIZE,
#ifdef HAVE_INNODB_MMAP
OPT_INNODB_LOG_FILE_MMAP,
#endif
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
OPT_INNODB_LOG_FILE_BUFFERING, OPT_INNODB_LOG_FILE_BUFFERING,
#endif #endif
@ -1890,6 +1891,13 @@ struct my_option xb_server_options[] =
(G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0, (G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0,
GET_UINT, REQUIRED_ARG, 2U << 20, GET_UINT, REQUIRED_ARG, 2U << 20,
2U << 20, log_sys.buf_size_max, 0, 4096, 0}, 2U << 20, log_sys.buf_size_max, 0, 4096, 0},
#ifdef HAVE_INNODB_MMAP
  /* Boolean option stored in log_sys.log_mmap; its default is
  log_sys.log_mmap_default. Use the dedicated option ID
  OPT_INNODB_LOG_FILE_MMAP from enum options_xtrabackup rather than
  sharing the ID of innodb_log_file_size. */
  {"innodb_log_file_mmap", OPT_INNODB_LOG_FILE_MMAP,
   "Whether ib_logfile0 should be memory-mapped",
   (G_PTR*) &log_sys.log_mmap,
   (G_PTR*) &log_sys.log_mmap, 0, GET_BOOL, NO_ARG,
   log_sys.log_mmap_default, 0, 0, 0, 0, 0},
#endif
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
{"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING, {"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING,
"Whether the file system cache for ib_logfile0 is enabled during --backup", "Whether the file system cache for ib_logfile0 is enabled during --backup",
@ -3368,8 +3376,108 @@ skip:
return(FALSE); return(FALSE);
} }
#ifdef HAVE_INNODB_MMAP
/** Write a range of the memory-mapped log to a backup data sink.
If start is after end, the range is written in two pieces:
from start up to log_sys.buf + log_sys.file_size, and then from
log_sys.buf + log_sys.START_OFFSET up to end.
@param ds     destination of ds_write()
@param start  first byte to copy (inside log_sys.buf)
@param end    one past the last byte to copy (inside log_sys.buf)
@return the result of ds_write(); a nonzero value aborts the copy */
static int
xtrabackup_copy_mmap_snippet(ds_file_t *ds, const byte *start, const byte *end)
{
  /* Common case: the range is contiguous in the mapping */
  if (!UNIV_UNLIKELY(start > end))
    return ds_write(ds, start, end - start);

  /* Wrapped range: first the tail of the file ... */
  const byte *const file_end= log_sys.buf + log_sys.file_size;
  if (int err= ds_write(ds, start, file_end - start))
    return err;

  /* ... then the part that follows the log file header */
  const byte *const payload_begin= log_sys.buf + log_sys.START_OFFSET;
  return ds_write(ds, payload_begin, end - payload_begin);
}
/** Copy memory-mapped log until the end of the log is reached
or the log_copying_stop signal is received.
The caller must hold recv_sys.mutex. Copying starts at recv_sys.lsn and
the copied bytes are written to dst_log_file. Any zero sequence byte is
replaced by the byte 1 in the output.
@return whether the operation failed */
static bool xtrabackup_copy_mmap_logfile()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
/* Position the parser at recv_sys.lsn within the mapping and let it
see the whole file */
recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn));
recv_sys.len= size_t(log_sys.file_size);
/* Distance from the parser position after a mini-transaction back to its
sequence byte (presumably 8 extra bytes precede the trailer when the log
is encrypted -- confirm against the log format) */
const size_t seq_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
/* Replacement value for a sequence byte that is 0 */
const char one{'\1'};
for (unsigned retry_count{0};;)
{
recv_sys_t::parse_mtr_result r;
/* Start of the not yet written part of the log */
const byte *start= &log_sys.buf[recv_sys.offset];
if (recv_sys.parse_mmap<false>(false) == recv_sys_t::OK)
{
const byte *end;
do
{
/* Set the sequence bit (the backed-up log will not wrap around) */
size_t seqo= recv_sys.offset - seq_offset;
/* The sequence byte may be located before a wrap-around of the
circular file; map the position back to the end of the file */
if (seqo < log_sys.START_OFFSET)
seqo+= log_sys.file_size - log_sys.START_OFFSET;
const byte *seq= &log_sys.buf[seqo];
ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - seq_offset));
if (!*seq)
{
/* Write everything up to the sequence byte, then the byte 1 in its
place, and continue right after it */
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, seq) ||
ds_write(dst_log_file, &one, 1))
goto write_error;
start = seq + 1;
}
}
while ((r= recv_sys.parse_mmap<false>(false)) == recv_sys_t::OK);
/* Write what remains up to the current parser position */
end= &log_sys.buf[recv_sys.offset];
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, end))
{
/* Also the target of the goto in the loop above */
write_error:
msg("Error: write to ib_logfile0 failed");
return true;
}
start= end;
pthread_cond_broadcast(&scanned_lsn_cond);
if (r == recv_sys_t::GOT_EOF)
break;
/* Progress was made; start counting retries afresh */
retry_count= 0;
}
else
{
/* Nothing could be parsed at the current position */
if (metadata_to_lsn)
{
/* A target LSN has been set; finish once it has been reached */
if (metadata_to_lsn <= recv_sys.lsn)
return false;
}
else if (xtrabackup_throttle && io_ticket-- < 0)
mysql_cond_wait(&wait_throttle, &recv_sys.mutex);
/* Report the first retry, give up when the counter reaches 100,
and otherwise wait for up to 1 ms before trying again */
if (!retry_count++)
msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn);
else if (retry_count == 100)
break;
else
{
timespec abstime;
set_timespec_nsec(abstime, 1000000ULL /* 1 ms */);
/* A zero return means that the wait did not time out, that is,
log_copying_stop was presumably signalled; stop copying */
if (!mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex,
&abstime))
return true;
}
}
}
if (verbose)
msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn);
return false;
}
#endif
/** Copy redo log until the current end of the log is reached /** Copy redo log until the current end of the log is reached
@return whether the operation failed */ @return whether the operation failed */
static bool xtrabackup_copy_logfile() static bool xtrabackup_copy_logfile()
{ {
mysql_mutex_assert_owner(&recv_sys.mutex); mysql_mutex_assert_owner(&recv_sys.mutex);
@ -3377,16 +3485,17 @@ static bool xtrabackup_copy_logfile()
ut_a(dst_log_file); ut_a(dst_log_file);
ut_ad(recv_sys.is_initialised()); ut_ad(recv_sys.is_initialised());
#ifdef HAVE_INNODB_MMAP
if (log_sys.is_mmap())
return xtrabackup_copy_mmap_logfile();
#endif
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U}; const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
const size_t block_size_1{log_sys.write_size - 1}; const size_t block_size_1{log_sys.write_size - 1};
ut_ad(!log_sys.is_pmem()); recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
{ recv_sys.len= 0;
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
recv_sys.len= 0;
}
for (unsigned retry_count{0};;) for (unsigned retry_count{0};;)
{ {
@ -5376,9 +5485,8 @@ fail:
goto fail; goto fail;
} }
if (!log_sys.create()) { log_sys.create();
goto fail;
}
/* get current checkpoint_lsn */ /* get current checkpoint_lsn */
{ {
log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.latch.wr_lock(SRW_LOCK_CALL);
@ -6730,9 +6838,7 @@ error:
} }
recv_sys.create(); recv_sys.create();
if (!log_sys.create()) { log_sys.create();
goto error;
}
recv_sys.recovery_on = true; recv_sys.recovery_on = true;
xb_fil_io_init(); xb_fil_io_init();

View file

@ -1017,11 +1017,7 @@ extern int my_win_pclose(FILE*);
#endif #endif
/* my_getpagesize */ /* my_getpagesize */
#ifdef HAVE_GETPAGESIZE
#define my_getpagesize() getpagesize()
#else
int my_getpagesize(void); int my_getpagesize(void);
#endif
int my_msync(int, void *, size_t, int); int my_msync(int, void *, size_t, int);

View file

@ -19,6 +19,12 @@ SHOW VARIABLES LIKE 'innodb_log_file_size';
Variable_name Value Variable_name Value
innodb_log_file_size 4194304 innodb_log_file_size 4194304
FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err
SET @save=@@GLOBAL.innodb_log_file_buffering;
SET GLOBAL innodb_log_file_buffering=OFF;
SET GLOBAL innodb_log_file_buffering=ON;
SET GLOBAL innodb_log_file_buffering=@save;
SET GLOBAL innodb_log_file_mmap=OFF;
Got one of the listed errors
SET GLOBAL innodb_log_file_size=5242880; SET GLOBAL innodb_log_file_size=5242880;
connect con1,localhost,root; connect con1,localhost,root;
UPDATE t SET b='' WHERE a<10; UPDATE t SET b='' WHERE a<10;

View file

@ -25,6 +25,17 @@ SHOW VARIABLES LIKE 'innodb_log_file_size';
let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB; let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB;
--source include/search_pattern_in_file.inc --source include/search_pattern_in_file.inc
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET @save=@@GLOBAL.innodb_log_file_buffering;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=OFF;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=ON;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=@save;
--error ER_INCORRECT_GLOBAL_LOCAL_VAR,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_mmap=OFF;
send SET GLOBAL innodb_log_file_size=5242880; send SET GLOBAL innodb_log_file_size=5242880;
--connect con1,localhost,root --connect con1,localhost,root

View file

@ -4,6 +4,7 @@ variable_name not in (
'innodb_numa_interleave', # only available WITH_NUMA 'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS 'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name; order by variable_name;

View file

@ -11,6 +11,7 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
'innodb_numa_interleave', # only available WITH_NUMA 'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS 'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name; order by variable_name;

View file

@ -16,8 +16,6 @@
#include "mysys_priv.h" #include "mysys_priv.h"
#ifndef HAVE_GETPAGESIZE
#if defined _WIN32 #if defined _WIN32
int my_getpagesize(void) int my_getpagesize(void)
@ -27,6 +25,13 @@ int my_getpagesize(void)
return si.dwPageSize; return si.dwPageSize;
} }
#elif defined _SC_PAGESIZE
/* Return the value of sysconf(_SC_PAGESIZE), converted to int */
int my_getpagesize(void)
{
  const long page_size= sysconf(_SC_PAGESIZE);
  return (int) page_size;
}
#else #else
/* Default implementation */ /* Default implementation */
@ -36,6 +41,3 @@ int my_getpagesize(void)
} }
#endif #endif
#endif

View file

@ -151,9 +151,7 @@ my_bool my_init(void)
my_umask= 0660; /* Default umask for new files */ my_umask= 0660; /* Default umask for new files */
my_umask_dir= 0700; /* Default umask for new directories */ my_umask_dir= 0700; /* Default umask for new directories */
my_global_flags= 0; my_global_flags= 0;
#ifdef _SC_PAGESIZE my_system_page_size= my_getpagesize();
my_system_page_size= sysconf(_SC_PAGESIZE);
#endif
/* Default creation of new files */ /* Default creation of new files */
if ((str= getenv("UMASK")) != 0) if ((str= getenv("UMASK")) != 0)

View file

@ -1766,7 +1766,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency"); static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency");
static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility"); static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility");
byte* c= my_assume_aligned<CPU_LEVEL1_DCACHE_LINESIZE> byte* c= my_assume_aligned<CPU_LEVEL1_DCACHE_LINESIZE>
(is_pmem() ? buf + offset : checkpoint_buf); (is_mmap() ? buf + offset : checkpoint_buf);
memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(c, 0, CPU_LEVEL1_DCACHE_LINESIZE);
mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn);
mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn);
@ -1775,8 +1775,9 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
lsn_t resizing; lsn_t resizing;
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
if (is_pmem()) if (is_mmap())
{ {
ut_ad(!is_opened());
resizing= resize_lsn.load(std::memory_order_relaxed); resizing= resize_lsn.load(std::memory_order_relaxed);
if (resizing > 1 && resizing <= next_checkpoint_lsn) if (resizing > 1 && resizing <= next_checkpoint_lsn)
@ -1790,12 +1791,12 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
else else
#endif #endif
{ {
ut_ad(!is_mmap());
ut_ad(!checkpoint_pending); ut_ad(!checkpoint_pending);
checkpoint_pending= true; checkpoint_pending= true;
latch.wr_unlock(); latch.wr_unlock();
log_write_and_flush_prepare(); log_write_and_flush_prepare();
resizing= resize_lsn.load(std::memory_order_relaxed); resizing= resize_lsn.load(std::memory_order_relaxed);
/* FIXME: issue an asynchronous write */
ut_ad(ut_is_2pow(write_size)); ut_ad(ut_is_2pow(write_size));
ut_ad(write_size >= 512); ut_ad(write_size >= 512);
ut_ad(write_size <= 4096); ut_ad(write_size <= 4096);
@ -1838,9 +1839,9 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
if (resizing > 1 && resizing <= checkpoint_lsn) if (resizing > 1 && resizing <= checkpoint_lsn)
{ {
ut_ad(is_pmem() == !resize_flush_buf); ut_ad(is_mmap() == !resize_flush_buf);
if (!is_pmem()) if (!is_mmap())
{ {
if (srv_file_flush_method != SRV_O_DSYNC) if (srv_file_flush_method != SRV_O_DSYNC)
ut_a(resize_log.flush()); ut_a(resize_log.flush());
@ -1849,13 +1850,17 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
if (resize_rename()) if (resize_rename())
{ {
/* Resizing failed. Discard the log_sys.resize_log. */ /* Resizing failed. Discard the ib_logfile101. */
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
if (is_pmem()) if (is_mmap())
{
ut_ad(!is_opened());
my_munmap(resize_buf, resize_target); my_munmap(resize_buf, resize_target);
}
else else
#endif #endif
{ {
ut_ad(!is_mmap());
ut_free_dodump(resize_buf, buf_size); ut_free_dodump(resize_buf, buf_size);
ut_free_dodump(resize_flush_buf, buf_size); ut_free_dodump(resize_flush_buf, buf_size);
#ifdef _WIN32 #ifdef _WIN32
@ -1873,8 +1878,9 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
{ {
/* Adopt the resized log. */ /* Adopt the resized log. */
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
if (is_pmem()) if (is_mmap())
{ {
ut_ad(!is_opened());
my_munmap(buf, file_size); my_munmap(buf, file_size);
buf= resize_buf; buf= resize_buf;
set_buf_free(START_OFFSET + (get_lsn() - resizing)); set_buf_free(START_OFFSET + (get_lsn() - resizing));
@ -1882,6 +1888,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
else else
#endif #endif
{ {
ut_ad(!is_mmap());
IF_WIN(,log.close()); IF_WIN(,log.close());
std::swap(log, resize_log); std::swap(log, resize_log);
ut_free_dodump(buf, buf_size); ut_free_dodump(buf, buf_size);

View file

@ -18532,7 +18532,10 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
if (high_level_read_only) if (high_level_read_only)
ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_READ_ONLY_MODE); ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_READ_ONLY_MODE);
else if (!log_sys.is_pmem() && else if (
#ifdef HAVE_PMEM
!log_sys.is_mmap() &&
#endif
*static_cast<const ulonglong*>(save) < log_sys.buf_size) *static_cast<const ulonglong*>(save) < log_sys.buf_size)
my_printf_error(ER_WRONG_ARGUMENTS, my_printf_error(ER_WRONG_ARGUMENTS,
"innodb_log_file_size must be at least" "innodb_log_file_size must be at least"
@ -18573,7 +18576,7 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (start > log_sys.get_lsn()) if (start > log_sys.get_lsn())
{ {
ut_ad(!log_sys.is_pmem()); ut_ad(!log_sys.is_mmap());
/* The server is almost idle. Write dummy FILE_CHECKPOINT records /* The server is almost idle. Write dummy FILE_CHECKPOINT records
to ensure that the log resizing will complete. */ to ensure that the log resizing will complete. */
log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.latch.wr_lock(SRW_LOCK_CALL);
@ -19437,6 +19440,19 @@ static MYSQL_SYSVAR_UINT(log_buffer_size, log_sys.buf_size,
"Redo log buffer size in bytes.", "Redo log buffer size in bytes.",
NULL, NULL, 16U << 20, 2U << 20, log_sys.buf_size_max, 4096); NULL, NULL, 16U << 20, 2U << 20, log_sys.buf_size_max, 4096);
#ifdef HAVE_INNODB_MMAP
/* Description of innodb_log_file_mmap, assembled from adjacent string
literals; the persistent memory wording is only included in HAVE_PMEM
builds */
static constexpr const char *innodb_log_file_mmap_description=
"Whether ib_logfile0"
# ifdef HAVE_PMEM
" resides in persistent memory or"
# endif
" should initially be memory-mapped";
/* Read-only Boolean parameter backed by log_sys.log_mmap;
the default is log_sys.log_mmap_default */
static MYSQL_SYSVAR_BOOL(log_file_mmap, log_sys.log_mmap,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
innodb_log_file_mmap_description,
nullptr, nullptr, log_sys.log_mmap_default);
#endif
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
PLUGIN_VAR_OPCMDARG, PLUGIN_VAR_OPCMDARG,
@ -19922,6 +19938,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(deadlock_report), MYSQL_SYSVAR(deadlock_report),
MYSQL_SYSVAR(page_size), MYSQL_SYSVAR(page_size),
MYSQL_SYSVAR(log_buffer_size), MYSQL_SYSVAR(log_buffer_size),
#ifdef HAVE_INNODB_MMAP
MYSQL_SYSVAR(log_file_mmap),
#endif
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
MYSQL_SYSVAR(log_file_buffering), MYSQL_SYSVAR(log_file_buffering),
#endif #endif

View file

@ -118,15 +118,14 @@ public:
@return file size in bytes @return file size in bytes
@retval 0 if not readable */ @retval 0 if not readable */
os_offset_t open(bool read_only) noexcept; os_offset_t open(bool read_only) noexcept;
/** @return whether a handle to the log is open */
bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; } bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; }
dberr_t close() noexcept; dberr_t close() noexcept;
dberr_t read(os_offset_t offset, span<byte> buf) noexcept; dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
void write(os_offset_t offset, span<const byte> buf) noexcept; void write(os_offset_t offset, span<const byte> buf) noexcept;
bool flush() const noexcept { return os_file_flush(m_file); } bool flush() const noexcept { return os_file_flush(m_file); }
#ifdef HAVE_PMEM
byte *mmap(bool read_only, const struct stat &st) noexcept;
#endif
}; };
/** Redo log buffer */ /** Redo log buffer */
@ -189,7 +188,7 @@ private:
public: public:
/** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */ /** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */
size_t waits; size_t waits;
/** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */ /** innodb_log_buffer_size (size of buf,flush_buf if !is_mmap(), in bytes) */
unsigned buf_size; unsigned buf_size;
/** log file size in bytes, including the header */ /** log file size in bytes, including the header */
lsn_t file_size; lsn_t file_size;
@ -231,7 +230,7 @@ public:
/** Last written LSN */ /** Last written LSN */
lsn_t write_lsn; lsn_t write_lsn;
/** buffer for writing data to ib_logfile0, or nullptr if is_pmem() /** Buffer for writing data to ib_logfile0, or nullptr if is_mmap().
In write_buf(), buf and flush_buf may be swapped */ In write_buf(), buf and flush_buf may be swapped */
byte *flush_buf; byte *flush_buf;
@ -280,6 +279,19 @@ public:
uint write_size; uint write_size;
/** format of the redo log: e.g., FORMAT_10_8 */ /** format of the redo log: e.g., FORMAT_10_8 */
uint32_t format; uint32_t format;
#ifdef HAVE_INNODB_MMAP
/** whether the memory-mapped interface is enabled for the log */
my_bool log_mmap;
/** the default value of log_mmap.
The initializer is assembled by the preprocessor: on Linux and FreeBSD
it reads "true || false" (that is, true), and elsewhere just "false". */
static constexpr bool log_mmap_default=
# if defined __linux__ /* MAP_POPULATE would enable read-ahead */
true ||
# elif defined __FreeBSD__ /* MAP_PREFAULT_READ would enable read-ahead */
true ||
# else /* an unnecessary read-ahead of a large ib_logfile0 is a risk */
# endif
false;
#endif
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
/** whether file system caching is enabled for the log */ /** whether file system caching is enabled for the log */
my_bool log_buffered; my_bool log_buffered;
@ -322,7 +334,7 @@ public:
/** whether there is capacity in the log buffer */ /** whether there is capacity in the log buffer */
bool buf_free_ok() const noexcept bool buf_free_ok() const noexcept
{ {
ut_ad(!is_pmem()); ut_ad(!is_mmap());
return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) < return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) <
max_buf_free; max_buf_free;
} }
@ -332,12 +344,14 @@ public:
void set_buf_free(size_t f) noexcept void set_buf_free(size_t f) noexcept
{ ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); } { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); }
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
bool is_pmem() const noexcept { return !flush_buf; } bool is_mmap() const noexcept { return !flush_buf; }
#else #else
static constexpr bool is_pmem() { return false; } static constexpr bool is_mmap() { return false; }
#endif #endif
/** @return whether a handle to the log is open;
is_mmap() && !is_opened() holds for PMEM */
bool is_opened() const noexcept { return log.is_opened(); } bool is_opened() const noexcept { return log.is_opened(); }
/** @return target write LSN to react on !buf_free_ok() */ /** @return target write LSN to react on !buf_free_ok() */
@ -381,40 +395,33 @@ public:
@return whether an error occurred */ @return whether an error occurred */
static bool resize_rename() noexcept; static bool resize_rename() noexcept;
#ifdef HAVE_PMEM
/** @return pointer for writing to resize_buf /** @return pointer for writing to resize_buf
@retval nullptr if no PMEM based resizing is active */ @retval nullptr if no is_mmap() based resizing is active */
inline byte *resize_buf_begin(lsn_t lsn) const noexcept; inline byte *resize_buf_begin(lsn_t lsn) const noexcept;
/** @return end of resize_buf */ /** @return end of resize_buf */
inline const byte *resize_buf_end() const noexcept inline const byte *resize_buf_end() const noexcept
{ return resize_buf + resize_target; } { return resize_buf + resize_target; }
/** Initialise the redo log subsystem. */ /** Initialise the redo log subsystem. */
void create_low(); void create();
/** Initialise the redo log subsystem.
@return whether the initialisation succeeded */
bool create() { create_low(); return true; }
/** Attach a log file. /** Attach a log file.
@return whether the memory allocation succeeded */ @return whether the memory allocation succeeded */
bool attach(log_file_t file, os_offset_t size); bool attach(log_file_t file, os_offset_t size);
#else
/** Initialise the redo log subsystem.
@return whether the initialisation succeeded */
bool create();
/** Attach a log file. */
void attach_low(log_file_t file, os_offset_t size);
bool attach(log_file_t file, os_offset_t size)
{ attach_low(file, size); return true; }
#endif
#ifdef HAVE_INNODB_MMAP
/** Disable memory-mapped access (update log_mmap) */
void clear_mmap();
void close_file(bool really_close= true);
#else
static void clear_mmap() {}
void close_file();
#endif
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
/** Try to enable or disable file system caching (update log_buffered) */ /** Try to enable or disable file system caching (update log_buffered) */
void set_buffered(bool buffered); void set_buffered(bool buffered);
#endif #endif
void close_file();
/** Calculate the checkpoint safety margins. */ /** Calculate the checkpoint safety margins. */
static void set_capacity(); static void set_capacity();
@ -494,11 +501,11 @@ private:
public: public:
/** Reserve space in the log buffer for appending data. /** Reserve space in the log buffer for appending data.
@tparam spin whether to use the spin-only lock_lsn() @tparam spin whether to use the spin-only lock_lsn()
@tparam pmem log_sys.is_pmem() @tparam mmap log_sys.is_mmap()
@param size total length of the data to append(), in bytes @param size total length of the data to append(), in bytes
@param ex whether log_sys.latch is exclusively locked @param ex whether log_sys.latch is exclusively locked
@return the start LSN and the buffer position for append() */ @return the start LSN and the buffer position for append() */
template<bool spin,bool pmem> template<bool spin,bool mmap>
std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept; std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
/** Append a string of bytes to the redo log. /** Append a string of bytes to the redo log.

View file

@ -408,19 +408,18 @@ private:
ATTRIBUTE_COLD void report_progress() const; ATTRIBUTE_COLD void report_progress() const;
public: public:
/** Parse and register one log_t::FORMAT_10_8 mini-transaction, /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
handling log_sys.is_pmem() buffer wrap-around. without handling any log_sys.is_mmap() buffer wrap-around.
@tparam store whether to store the records @tparam store whether to store the records
@param if_exists if store: whether to check if the tablespace exists */ @param if_exists if store: whether to check if the tablespace exists */
template<bool store> template<bool store>
static parse_mtr_result parse_mtr(bool if_exists) noexcept; static parse_mtr_result parse_mtr(bool if_exists) noexcept;
/** Parse and register one log_t::FORMAT_10_8 mini-transaction, /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
handling log_sys.is_pmem() buffer wrap-around. handling log_sys.is_mmap() buffer wrap-around.
@tparam store whether to store the records @tparam store whether to store the records
@param if_exists if store: whether to check if the tablespace exists */ @param if_exists if store: whether to check if the tablespace exists */
template<bool store> template<bool store>
static parse_mtr_result parse_pmem(bool if_exists) noexcept static parse_mtr_result parse_mmap(bool if_exists) noexcept
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
; ;
#else #else
{ return parse_mtr<store>(if_exists); } { return parse_mtr<store>(if_exists); }

View file

@ -696,7 +696,7 @@ private:
ATTRIBUTE_NOINLINE void encrypt(); ATTRIBUTE_NOINLINE void encrypt();
/** Commit the mini-transaction log. /** Commit the mini-transaction log.
@tparam pmem log_sys.is_pmem() @tparam pmem log_sys.is_mmap()
@param mtr mini-transaction @param mtr mini-transaction
@param lsns {start_lsn,flush_ahead} */ @param lsns {start_lsn,flush_ahead} */
template<bool pmem> template<bool pmem>
@ -708,11 +708,11 @@ private:
/** Append the redo log records to the redo log buffer. /** Append the redo log records to the redo log buffer.
@tparam spin whether to use the spin-only log_sys.lock_lsn() @tparam spin whether to use the spin-only log_sys.lock_lsn()
@tparam pmem log_sys.is_pmem() @tparam mmap log_sys.is_mmap()
@param mtr mini-transaction @param mtr mini-transaction
@param len number of bytes to write @param len number of bytes to write
@return {start_lsn,flush_ahead} */ @return {start_lsn,flush_ahead} */
template<bool spin,bool pmem> static template<bool spin,bool mmap> static
std::pair<lsn_t,page_flush_ahead> finish_writer(mtr_t *mtr, size_t len); std::pair<lsn_t,page_flush_ahead> finish_writer(mtr_t *mtr, size_t len);
/** The applicable variant of commit_log() */ /** The applicable variant of commit_log() */

View file

@ -170,6 +170,9 @@ using the call command. */
#define UNIV_INLINE static inline #define UNIV_INLINE static inline
#define UNIV_WORD_SIZE SIZEOF_SIZE_T #define UNIV_WORD_SIZE SIZEOF_SIZE_T
#if SIZEOF_SIZE_T == 8
# define HAVE_INNODB_MMAP
#endif
/** The following alignment is used in memory allocations in memory heap /** The following alignment is used in memory allocations in memory heap
management to ensure correct alignment for doubles etc. */ management to ensure correct alignment for doubles etc. */
@ -199,7 +202,7 @@ and 2 bits for flags. This limits the uncompressed page size to 16k.
/* Define the Min, Max, Default page sizes. */ /* Define the Min, Max, Default page sizes. */
/** Minimum Page Size Shift (power of 2) */ /** Minimum Page Size Shift (power of 2) */
#define UNIV_PAGE_SIZE_SHIFT_MIN 12U #define UNIV_PAGE_SIZE_SHIFT_MIN 12U
/** log2 of largest page size (1<<16 == 64436 bytes). */ /** log2 of largest page size (1<<16 == 65536 bytes). */
/** Maximum Page Size Shift (power of 2) */ /** Maximum Page Size Shift (power of 2) */
#define UNIV_PAGE_SIZE_SHIFT_MAX 16U #define UNIV_PAGE_SIZE_SHIFT_MAX 16U
/** log2 of default page size (1<<14 == 16384 bytes). */ /** log2 of default page size (1<<14 == 16384 bytes). */

View file

@ -86,11 +86,7 @@ void log_t::set_capacity()
log_sys.max_checkpoint_age = margin; log_sys.max_checkpoint_age = margin;
} }
#ifdef HAVE_PMEM void log_t::create()
void log_t::create_low()
#else
bool log_t::create()
#endif
{ {
ut_ad(this == &log_sys); ut_ad(this == &log_sys);
ut_ad(!is_initialised()); ut_ad(!is_initialised());
@ -101,35 +97,10 @@ bool log_t::create()
need_checkpoint.store(true, std::memory_order_relaxed); need_checkpoint.store(true, std::memory_order_relaxed);
write_lsn= FIRST_LSN; write_lsn= FIRST_LSN;
#ifndef HAVE_PMEM
buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME));
if (!buf)
{
alloc_fail:
sql_print_error("InnoDB: Cannot allocate memory;"
" too large innodb_log_buffer_size?");
return false;
}
flush_buf= static_cast<byte*>(ut_malloc_dontdump(buf_size,
PSI_INSTRUMENT_ME));
if (!flush_buf)
{
ut_free_dodump(buf, buf_size);
buf= nullptr;
goto alloc_fail;
}
TRASH_ALLOC(buf, buf_size);
TRASH_ALLOC(flush_buf, buf_size);
checkpoint_buf= static_cast<byte*>(aligned_malloc(4096, 4096));
memset_aligned<4096>(checkpoint_buf, 0, 4096);
max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
#else
ut_ad(!checkpoint_buf); ut_ad(!checkpoint_buf);
ut_ad(!buf); ut_ad(!buf);
ut_ad(!flush_buf); ut_ad(!flush_buf);
max_buf_free= 1; max_buf_free= 1;
#endif
latch.SRW_LOCK_INIT(log_latch_key); latch.SRW_LOCK_INIT(log_latch_key);
lsn_lock.init(); lsn_lock.init();
@ -144,9 +115,6 @@ bool log_t::create()
set_buf_free(0); set_buf_free(0);
ut_ad(is_initialised()); ut_ad(is_initialised());
#ifndef HAVE_PMEM
return true;
#endif
} }
dberr_t log_file_t::close() noexcept dberr_t log_file_t::close() noexcept
@ -178,22 +146,91 @@ void log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
<< IF_WIN(GetLastError(), errno) << "."; << IF_WIN(GetLastError(), errno) << ".";
} }
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
# include "cache.h" # ifdef HAVE_PMEM
# include "cache.h"
# endif
/** Attempt to memory map a file. /** Attempt to memory map a file.
@param file log file handle @param file log file handle
@param size file size @param size file size
@return pointer to memory mapping @return pointer to memory mapping
@retval MAP_FAILED if the memory cannot be mapped */ @retval MAP_FAILED if the memory cannot be mapped */
static void *log_mmap(os_file_t file, os_offset_t size) static void *log_mmap(os_file_t file,
# ifdef HAVE_PMEM
bool &is_pmem, /*!< whether the file is on pmem */
# endif
os_offset_t size)
{ {
void *ptr= if (my_system_page_size > 4096)
my_mmap(0, size_t(size), return MAP_FAILED;
srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE, # ifndef HAVE_PMEM
MAP_SHARED_VALIDATE | MAP_SYNC, file, 0); if (!log_sys.log_mmap)
#ifdef __linux__ /* If support for persistent memory (Linux: mount -o dax) is enabled,
if (ptr == MAP_FAILED) we always attempt to open a MAP_SYNC memory mapping to ib_logfile0.
This mapping will be read-only during crash recovery, and read-write
during normal operation.
A regular read-only memory mapping may be attempted if
innodb_log_file_mmap=ON. This may benefit mariadb-backup
and crash recovery. */
return MAP_FAILED;
# endif
/* For now, InnoDB does not support memory-mapped writes to
a regular log file.
If PMEM is supported, the initially attempted memory mapping may
be read-write, but the fallback will be read-only.
The mapping will always be read-only if innodb_read_only=ON or
if mariadb-backup is running in any other mode than --prepare --export. */
const bool read_only=
srv_read_only_mode || srv_operation >= SRV_OPERATION_BACKUP;
# ifdef _WIN32
void *ptr= MAP_FAILED;
if (!read_only);
else if (HANDLE h=
CreateFileMappingA(file, nullptr, PAGE_READONLY,
DWORD(size >> 32), DWORD(size), nullptr))
{
if (h != INVALID_HANDLE_VALUE)
{
ptr= MapViewOfFileEx(h, FILE_MAP_READ, 0, 0, size, nullptr);
CloseHandle(h);
if (!ptr)
ptr= MAP_FAILED;
}
}
# else
int flags=
# ifdef HAVE_PMEM
MAP_SHARED_VALIDATE | MAP_SYNC,
# else
MAP_SHARED,
# endif
prot= PROT_READ;
if (!read_only)
# ifdef HAVE_PMEM
prot= PROT_READ | PROT_WRITE;
# else
return MAP_FAILED;
# endif
void *ptr= my_mmap(0, size_t(size), prot, flags, file, 0);
# ifdef HAVE_PMEM
is_pmem= ptr != MAP_FAILED;
# endif
if (ptr != MAP_FAILED)
return ptr;
# ifdef HAVE_PMEM
# ifdef __linux__ /* On Linux, we pretend that /dev/shm is PMEM */
if (srv_operation < SRV_OPERATION_BACKUP)
{ {
struct stat st; struct stat st;
if (!fstat(file, &st)) if (!fstat(file, &st))
@ -203,46 +240,82 @@ static void *log_mmap(os_file_t file, os_offset_t size)
if (!stat("/dev/shm", &st)) if (!stat("/dev/shm", &st))
{ {
MSAN_STAT_WORKAROUND(&st); MSAN_STAT_WORKAROUND(&st);
if (st.st_dev == st_dev) is_pmem= st.st_dev == st_dev;
ptr= my_mmap(0, size_t(size), if (!is_pmem)
srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE, return ptr; /* MAP_FAILED */
MAP_SHARED, file, 0);
} }
} }
} }
#endif /* __linux__ */ # endif /* __linux__ */
if (read_only && log_sys.log_mmap)
ptr= my_mmap(0, size_t(size), PROT_READ, MAP_SHARED, file, 0);
# endif /* HAVE_PMEM */
# endif
return ptr; return ptr;
} }
#endif #endif
#ifdef HAVE_PMEM #if defined __linux__ || defined _WIN32
bool log_t::attach(log_file_t file, os_offset_t size) /** Display a message about opening the log */
ATTRIBUTE_COLD static void log_file_message()
{
  /* Choose the wording from log_sys.log_buffered and, when memory
  mapping is compiled in, from log_sys.log_mmap. */
  const char *description;
# ifdef HAVE_INNODB_MMAP
  if (log_sys.log_mmap)
  {
    if (log_sys.log_buffered)
      description= "Memory-mapped log";
    else
      description= "Memory-mapped unbuffered log";
  }
  else
# endif
  if (log_sys.log_buffered)
    description= "Buffered log writes";
  else
    description= "File system buffers for log disabled";

  sql_print_information("InnoDB: %s (block size=%u bytes)",
                        description, log_sys.write_size);
}
#else #else
void log_t::attach_low(log_file_t file, os_offset_t size) static inline void log_file_message() {}
#endif #endif
bool log_t::attach(log_file_t file, os_offset_t size)
{ {
log= file; log= file;
ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT); ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT);
file_size= size; file_size= size;
#ifdef HAVE_PMEM
ut_ad(!buf); ut_ad(!buf);
ut_ad(!flush_buf); ut_ad(!flush_buf);
if (size && !(size_t(size) & 4095) && srv_operation != SRV_OPERATION_BACKUP) #ifdef HAVE_INNODB_MMAP
if (size)
{ {
void *ptr= log_mmap(log.m_file, size); # ifdef HAVE_PMEM
bool is_pmem;
void *ptr= ::log_mmap(log.m_file, is_pmem, size);
# else
void *ptr= ::log_mmap(log.m_file, size);
# endif
if (ptr != MAP_FAILED) if (ptr != MAP_FAILED)
{ {
log.close(); # ifdef HAVE_PMEM
mprotect(ptr, size_t(size), PROT_READ); if (is_pmem)
{
log.close();
log_buffered= false;
log_maybe_unbuffered= true;
IF_WIN(,mprotect(ptr, size_t(size), PROT_READ));
}
# endif
buf= static_cast<byte*>(ptr); buf= static_cast<byte*>(ptr);
max_buf_free= 1; max_buf_free= 1;
log_maybe_unbuffered= true;
log_buffered= false;
mtr_t::finisher_update(); mtr_t::finisher_update();
return true; # ifdef HAVE_PMEM
if (is_pmem)
return true;
# endif
goto func_exit;
} }
} }
log_mmap= false;
#endif
buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME));
if (!buf) if (!buf)
{ {
@ -256,33 +329,34 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
PSI_INSTRUMENT_ME)); PSI_INSTRUMENT_ME));
if (!flush_buf) if (!flush_buf)
{ {
alloc_fail2:
ut_free_dodump(buf, buf_size); ut_free_dodump(buf, buf_size);
buf= nullptr; buf= nullptr;
goto alloc_fail; goto alloc_fail;
} }
TRASH_ALLOC(buf, buf_size);
TRASH_ALLOC(flush_buf, buf_size);
max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
#endif
#if defined __linux__ || defined _WIN32
sql_print_information("InnoDB: %s (block size=%u bytes)",
log_buffered
? "Buffered log writes"
: "File system buffers for log disabled",
write_size);
#endif
mtr_t::finisher_update();
#ifdef HAVE_PMEM
ut_ad(ut_is_2pow(write_size)); ut_ad(ut_is_2pow(write_size));
ut_ad(write_size >= 512); ut_ad(write_size >= 512);
ut_ad(write_size <= 4096); ut_ad(write_size <= 4096);
checkpoint_buf= static_cast<byte*>(aligned_malloc(write_size, write_size)); checkpoint_buf= static_cast<byte*>(aligned_malloc(write_size, write_size));
if (!checkpoint_buf)
{
ut_free_dodump(flush_buf, buf_size);
flush_buf= nullptr;
goto alloc_fail2;
}
TRASH_ALLOC(buf, buf_size);
TRASH_ALLOC(flush_buf, buf_size);
max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
mtr_t::finisher_update();
memset_aligned<512>(checkpoint_buf, 0, write_size); memset_aligned<512>(checkpoint_buf, 0, write_size);
return true;
#ifdef HAVE_INNODB_MMAP
func_exit:
#endif #endif
log_file_message();
return true;
} }
/** Write a log file header. /** Write a log file header.
@ -325,66 +399,83 @@ void log_t::create(lsn_t lsn) noexcept
last_checkpoint_lsn= 0; last_checkpoint_lsn= 0;
#ifdef HAVE_PMEM
if (is_pmem())
{
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
memset_aligned<4096>(buf, 0, 4096);
set_buf_free(START_OFFSET);
}
else
#endif
{
set_buf_free(0);
memset_aligned<4096>(flush_buf, 0, buf_size);
memset_aligned<4096>(buf, 0, buf_size);
}
log_sys.header_write(buf, lsn, is_encrypted());
DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn)); DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn));
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
if (is_pmem()) if (is_mmap())
{
ut_ad(!is_opened());
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
memset_aligned<4096>(buf, 0, 4096);
log_sys.header_write(buf, lsn, is_encrypted());
set_buf_free(START_OFFSET);
pmem_persist(buf, 512); pmem_persist(buf, 512);
}
else else
#endif #endif
{ {
ut_ad(!is_mmap());
set_buf_free(0);
memset_aligned<4096>(flush_buf, 0, buf_size);
memset_aligned<4096>(buf, 0, buf_size);
log_sys.header_write(buf, lsn, is_encrypted());
log.write(0, {buf, 4096}); log.write(0, {buf, 4096});
memset_aligned<512>(buf, 0, 512); memset_aligned<512>(buf, 0, 512);
} }
} }
void log_t::close_file() ATTRIBUTE_COLD static void log_close_failed(dberr_t err)
{ {
#ifdef HAVE_PMEM ib::fatal() << "closing ib_logfile0 failed: " << err;
if (is_pmem()) }
#ifdef HAVE_INNODB_MMAP
void log_t::close_file(bool really_close)
#else
void log_t::close_file()
#endif
{
#ifdef HAVE_INNODB_MMAP
if (is_mmap())
{ {
ut_ad(!is_opened());
ut_ad(!checkpoint_buf); ut_ad(!checkpoint_buf);
ut_ad(!flush_buf);
if (buf) if (buf)
{ {
my_munmap(buf, file_size); my_munmap(buf, file_size);
buf= nullptr; buf= nullptr;
} }
return; }
else
#endif
{
ut_ad(!buf == !flush_buf);
ut_ad(!buf == !checkpoint_buf);
if (buf)
{
ut_free_dodump(buf, buf_size);
buf= nullptr;
ut_free_dodump(flush_buf, buf_size);
flush_buf= nullptr;
}
aligned_free(checkpoint_buf);
checkpoint_buf= nullptr;
} }
ut_free_dodump(buf, buf_size); #ifdef HAVE_INNODB_MMAP
buf= nullptr; if (really_close)
ut_free_dodump(flush_buf, buf_size);
flush_buf= nullptr;
aligned_free(checkpoint_buf);
checkpoint_buf= nullptr;
#endif #endif
if (is_opened()) if (is_opened())
if (const dberr_t err= log.close()) if (const dberr_t err= log.close())
ib::fatal() << "closing ib_logfile0 failed: " << err; log_close_failed(err);
} }
/** Acquire all latches that protect the log. */ /** Acquire all latches that protect the log. */
static void log_resize_acquire() static void log_resize_acquire()
{ {
if (!log_sys.is_pmem()) #ifdef HAVE_PMEM
if (!log_sys.is_mmap())
#endif
{ {
while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
group_commit_lock::ACQUIRED); group_commit_lock::ACQUIRED);
@ -400,7 +491,9 @@ void log_resize_release()
{ {
log_sys.latch.wr_unlock(); log_sys.latch.wr_unlock();
if (!log_sys.is_pmem()) #ifdef HAVE_PMEM
if (!log_sys.is_mmap())
#endif
{ {
lsn_t lsn1= write_lock.release(write_lock.value()); lsn_t lsn1= write_lock.release(write_lock.value());
lsn_t lsn2= flush_lock.release(flush_lock.value()); lsn_t lsn2= flush_lock.release(flush_lock.value());
@ -413,13 +506,17 @@ void log_resize_release()
/** Try to enable or disable file system caching (update log_buffered) */ /** Try to enable or disable file system caching (update log_buffered) */
void log_t::set_buffered(bool buffered) void log_t::set_buffered(bool buffered)
{ {
if (!log_maybe_unbuffered || is_pmem() || high_level_read_only) if (!log_maybe_unbuffered ||
#ifdef HAVE_PMEM
is_mmap() ||
#endif
high_level_read_only)
return; return;
log_resize_acquire(); log_resize_acquire();
if (!resize_in_progress() && is_opened() && bool(log_buffered) != buffered) if (!resize_in_progress() && is_opened() && bool(log_buffered) != buffered)
{ {
os_file_close_func(log.m_file); if (const dberr_t err= log.close())
log.m_file= OS_FILE_CLOSED; log_close_failed(err);
std::string path{get_log_file_path()}; std::string path{get_log_file_path()};
log_buffered= buffered; log_buffered= buffered;
bool success; bool success;
@ -427,11 +524,7 @@ void log_t::set_buffered(bool buffered)
OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE, OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE,
false, &success); false, &success);
ut_a(log.m_file != OS_FILE_CLOSED); ut_a(log.m_file != OS_FILE_CLOSED);
sql_print_information("InnoDB: %s (block size=%u bytes)", log_file_message();
log_buffered
? "Buffered log writes"
: "File system buffers for log disabled",
write_size);
} }
log_resize_release(); log_resize_release();
} }
@ -450,6 +543,9 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
resize_start_status status= RESIZE_NO_CHANGE; resize_start_status status= RESIZE_NO_CHANGE;
lsn_t start_lsn{0}; lsn_t start_lsn{0};
#ifdef HAVE_PMEM
bool is_pmem{false};
#endif
if (resize_in_progress()) if (resize_in_progress())
status= RESIZE_IN_PROGRESS; status= RESIZE_IN_PROGRESS;
@ -475,10 +571,15 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
void *ptr= nullptr, *ptr2= nullptr; void *ptr= nullptr, *ptr2= nullptr;
success= os_file_set_size(path.c_str(), resize_log.m_file, size); success= os_file_set_size(path.c_str(), resize_log.m_file, size);
if (!success); if (!success);
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
else if (is_pmem()) else if (is_mmap())
{ {
ptr= log_mmap(resize_log.m_file, size); ptr= ::log_mmap(resize_log.m_file,
#ifdef HAVE_PMEM
is_pmem,
#endif
size);
if (ptr == MAP_FAILED) if (ptr == MAP_FAILED)
goto alloc_fail; goto alloc_fail;
} }
@ -518,12 +619,12 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
resize_flush_buf= static_cast<byte*>(ptr2); resize_flush_buf= static_cast<byte*>(ptr2);
start_lsn= get_lsn(); start_lsn= get_lsn();
if (is_pmem()) if (!is_mmap())
resize_log.close();
else
start_lsn= first_lsn + start_lsn= first_lsn +
(~lsn_t{write_size - 1} & (~lsn_t{write_size - 1} &
(lsn_t{write_size - 1} + start_lsn - first_lsn)); (lsn_t{write_size - 1} + start_lsn - first_lsn));
else if (!is_opened())
resize_log.close();
} }
resize_lsn.store(start_lsn, std::memory_order_relaxed); resize_lsn.store(start_lsn, std::memory_order_relaxed);
status= success ? RESIZE_STARTED : RESIZE_FAILED; status= success ? RESIZE_STARTED : RESIZE_FAILED;
@ -552,14 +653,13 @@ void log_t::resize_abort() noexcept
if (resize_in_progress() > 1) if (resize_in_progress() > 1)
{ {
if (!is_pmem()) if (!is_mmap())
{ {
resize_log.close();
ut_free_dodump(resize_buf, buf_size); ut_free_dodump(resize_buf, buf_size);
ut_free_dodump(resize_flush_buf, buf_size); ut_free_dodump(resize_flush_buf, buf_size);
resize_flush_buf= nullptr; resize_flush_buf= nullptr;
} }
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
else else
{ {
ut_ad(!resize_log.is_opened()); ut_ad(!resize_log.is_opened());
@ -568,6 +668,8 @@ void log_t::resize_abort() noexcept
my_munmap(resize_buf, resize_target); my_munmap(resize_buf, resize_target);
} }
#endif #endif
if (resize_log.is_opened())
resize_log.close();
resize_buf= nullptr; resize_buf= nullptr;
resize_target= 0; resize_target= 0;
resize_lsn.store(0, std::memory_order_relaxed); resize_lsn.store(0, std::memory_order_relaxed);
@ -732,7 +834,7 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra)
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
void log_t::persist(lsn_t lsn, bool holding_latch) noexcept void log_t::persist(lsn_t lsn, bool holding_latch) noexcept
{ {
ut_ad(is_pmem()); ut_ad(!is_opened());
ut_ad(!write_lock.is_owner()); ut_ad(!write_lock.is_owner());
ut_ad(!flush_lock.is_owner()); ut_ad(!flush_lock.is_owner());
#ifdef LOG_LATCH_DEBUG #ifdef LOG_LATCH_DEBUG
@ -752,12 +854,11 @@ void log_t::persist(lsn_t lsn, bool holding_latch) noexcept
if (UNIV_UNLIKELY(end < start)) if (UNIV_UNLIKELY(end < start))
{ {
pmem_persist(log_sys.buf + start, log_sys.file_size - start); pmem_persist(buf + start, file_size - start);
pmem_persist(log_sys.buf + log_sys.START_OFFSET, pmem_persist(buf + START_OFFSET, end - START_OFFSET);
end - log_sys.START_OFFSET);
} }
else else
pmem_persist(log_sys.buf + start, end - start); pmem_persist(buf + start, end - start);
old= flushed_to_disk_lsn.load(std::memory_order_relaxed); old= flushed_to_disk_lsn.load(std::memory_order_relaxed);
@ -818,7 +919,7 @@ void log_t::resize_write_buf(const byte *b, size_t length) noexcept
template<bool release_latch> inline lsn_t log_t::write_buf() noexcept template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
{ {
ut_ad(latch_have_wr()); ut_ad(latch_have_wr());
ut_ad(!is_pmem()); ut_ad(!is_mmap());
ut_ad(!srv_read_only_mode); ut_ad(!srv_read_only_mode);
const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; const lsn_t lsn{get_lsn(std::memory_order_relaxed)};
@ -941,7 +1042,7 @@ bool log_t::flush(lsn_t lsn) noexcept
*/ */
static lsn_t log_flush(lsn_t lsn) static lsn_t log_flush(lsn_t lsn)
{ {
ut_ad(!log_sys.is_pmem()); ut_ad(!log_sys.is_mmap());
ut_a(log_sys.flush(lsn)); ut_a(log_sys.flush(lsn));
DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE(););
return flush_lock.release(lsn); return flush_lock.release(lsn);
@ -961,6 +1062,7 @@ void log_write_up_to(lsn_t lsn, bool durable,
ut_ad(!srv_read_only_mode || log_sys.buf_free_ok()); ut_ad(!srv_read_only_mode || log_sys.buf_free_ok());
ut_ad(lsn != LSN_MAX); ut_ad(lsn != LSN_MAX);
ut_ad(lsn != 0); ut_ad(lsn != 0);
ut_ad(!log_sys.is_mmap() || !callback || durable);
if (UNIV_UNLIKELY(recv_no_ibuf_operations)) if (UNIV_UNLIKELY(recv_no_ibuf_operations))
{ {
@ -973,21 +1075,25 @@ void log_write_up_to(lsn_t lsn, bool durable,
ut_ad(lsn <= log_sys.get_lsn()); ut_ad(lsn <= log_sys.get_lsn());
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
if (log_sys.is_pmem()) if (log_sys.is_mmap())
{ {
ut_ad(!callback);
if (durable) if (durable)
log_sys.persist(lsn, false); log_sys.persist(lsn, false);
return; return;
} }
#endif #endif
ut_ad(!log_sys.is_mmap());
repeat: repeat:
if (durable) if (durable)
{ {
if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED)
return; return;
flush_lock.set_pending(log_sys.get_lsn()); /* Promise to other concurrent flush_lock.acquire() that we
will be durable at least up to the current LSN. The LSN may still
advance until we acquire log_sys.latch below. */
lsn= log_sys.get_lsn();
flush_lock.set_pending(lsn);
} }
lsn_t pending_write_lsn= 0, pending_flush_lsn= 0; lsn_t pending_write_lsn= 0, pending_flush_lsn= 0;
@ -1023,8 +1129,10 @@ void log_buffer_flush_to_disk(bool durable)
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */ /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
ATTRIBUTE_COLD void log_write_and_flush_prepare() ATTRIBUTE_COLD void log_write_and_flush_prepare()
{ {
if (log_sys.is_pmem()) #ifdef HAVE_PMEM
if (log_sys.is_mmap())
return; return;
#endif
while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
group_commit_lock::ACQUIRED); group_commit_lock::ACQUIRED);
@ -1032,20 +1140,56 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare()
group_commit_lock::ACQUIRED); group_commit_lock::ACQUIRED);
} }
#ifdef HAVE_INNODB_MMAP
/** Stop using a memory-mapped log buffer: unmap it and attach again
with log_mmap=false, carrying over the log block that contains the
current buf_free position.
Does nothing unless is_mmap() holds and high_level_read_only is unset.
When HAVE_PMEM is defined, also does nothing if the log file handle is
not open.
NOTE(review): is_mmap() && !is_opened() presumably identifies a
persistent-memory mapping, which is intentionally retained -- confirm. */
void log_t::clear_mmap()
{
if (!is_mmap() ||
#ifdef HAVE_PMEM
!is_opened() ||
#endif
high_level_read_only)
return;
/* Paired with log_resize_release() at the end of this function. */
log_resize_acquire();
/* Debug checks: no log resizing is in progress, and write_lsn
equals both the current LSN and the flushed LSN. */
ut_ad(!resize_in_progress());
ut_ad(write_lsn == get_lsn());
ut_ad(write_lsn == get_flushed_lsn(std::memory_order_relaxed));
if (buf) /* this may be invoked while creating a new database */
{
/* Temporary copy of one write_size block.
NOTE(review): assumes write_size <= 4096 (the array size) -- confirm. */
alignas(16) byte log_block[4096];
const size_t bs{write_size};
const size_t bf{buf_free.load(std::memory_order_relaxed)};
{
/* Save the block that contains offset bf (bf rounded down to a
multiple of bs) before buf is replaced below. */
byte *const b= buf;
memcpy_aligned<16>(log_block, b + (bf & ~(bs - 1)), bs);
}
/* Release the current buffer; the argument is false, unlike the
default close.
NOTE(review): presumably this keeps the file handle open so that
"log" can be passed to attach() below -- confirm. */
close_file(false);
log_mmap= false;
/* Attach again with log_mmap=false; this must succeed and is expected
to leave the log in non-mmap mode. */
ut_a(attach(log, file_size));
ut_ad(!is_mmap());
/* buf_free becomes the offset within the saved block, and the saved
block is copied to the start of the new buffer. */
set_buf_free(bf & (bs - 1));
memcpy_aligned<16>(log_sys.buf, log_block, bs);
}
log_resize_release();
}
#endif
/** Durably write the log up to log_sys.get_lsn(). */ /** Durably write the log up to log_sys.get_lsn(). */
ATTRIBUTE_COLD void log_write_and_flush() ATTRIBUTE_COLD void log_write_and_flush()
{ {
ut_ad(!srv_read_only_mode); ut_ad(!srv_read_only_mode);
if (!log_sys.is_pmem()) #ifdef HAVE_PMEM
if (log_sys.is_mmap())
log_sys.persist(log_sys.get_lsn(), true);
else
#endif
{ {
const lsn_t lsn{log_sys.write_buf<false>()}; const lsn_t lsn{log_sys.write_buf<false>()};
write_lock.release(lsn); write_lock.release(lsn);
log_flush(lsn); log_flush(lsn);
} }
#ifdef HAVE_PMEM
else
log_sys.persist(log_sys.get_lsn(), true);
#endif
} }
/****************************************************************//** /****************************************************************//**
@ -1323,18 +1467,9 @@ void log_t::close()
if (!is_initialised()) return; if (!is_initialised()) return;
close_file(); close_file();
#ifndef HAVE_PMEM
ut_free_dodump(buf, buf_size);
buf= nullptr;
ut_free_dodump(flush_buf, buf_size);
flush_buf= nullptr;
aligned_free(checkpoint_buf);
checkpoint_buf= nullptr;
#else
ut_ad(!checkpoint_buf); ut_ad(!checkpoint_buf);
ut_ad(!buf); ut_ad(!buf);
ut_ad(!flush_buf); ut_ad(!flush_buf);
#endif
latch.destroy(); latch.destroy();
lsn_lock.destroy(); lsn_lock.destroy();

View file

@ -1480,6 +1480,7 @@ void recv_sys_t::debug_free()
pages_it= pages.end(); pages_it= pages.end();
mysql_mutex_unlock(&mutex); mysql_mutex_unlock(&mutex);
log_sys.clear_mmap();
} }
@ -1632,7 +1633,7 @@ ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2()
byte *buf= const_cast<byte*>(field_ref_zero); byte *buf= const_cast<byte*>(field_ref_zero);
if (source_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) if (source_offset < (log_sys.is_mmap() ? log_sys.file_size : 4096))
memcpy_aligned<512>(buf, &log_sys.buf[source_offset & ~511], 512); memcpy_aligned<512>(buf, &log_sys.buf[source_offset & ~511], 512);
else else
if (dberr_t err= recv_sys.read(source_offset & ~511, {buf, 512})) if (dberr_t err= recv_sys.read(source_offset & ~511, {buf, 512}))
@ -1671,7 +1672,7 @@ static dberr_t recv_log_recover_10_5(lsn_t lsn_offset)
{ {
byte *buf= const_cast<byte*>(field_ref_zero); byte *buf= const_cast<byte*>(field_ref_zero);
if (lsn_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) if (lsn_offset < (log_sys.is_mmap() ? log_sys.file_size : 4096))
memcpy_aligned<512>(buf, &log_sys.buf[lsn_offset & ~511], 512); memcpy_aligned<512>(buf, &log_sys.buf[lsn_offset & ~511], 512);
else else
{ {
@ -1772,7 +1773,7 @@ dberr_t recv_sys_t::find_checkpoint()
log_sys.next_checkpoint_lsn= 0; log_sys.next_checkpoint_lsn= 0;
lsn= 0; lsn= 0;
buf= my_assume_aligned<4096>(log_sys.buf); buf= my_assume_aligned<4096>(log_sys.buf);
if (!log_sys.is_pmem()) if (!log_sys.is_mmap())
if (dberr_t err= log_sys.log.read(0, {buf, log_sys.START_OFFSET})) if (dberr_t err= log_sys.log.read(0, {buf, log_sys.START_OFFSET}))
return err; return err;
/* Check the header page checksum. There was no /* Check the header page checksum. There was no
@ -2210,7 +2211,7 @@ static void store_freed_or_init_rec(page_id_t page_id, bool freed)
/** Wrapper for log_sys.buf[] between recv_sys.offset and recv_sys.len */ /** Wrapper for log_sys.buf[] between recv_sys.offset and recv_sys.len */
struct recv_buf struct recv_buf
{ {
bool is_pmem() const noexcept { return log_sys.is_pmem(); } bool is_mmap() const noexcept { return log_sys.is_mmap(); }
const byte *ptr; const byte *ptr;
@ -2301,11 +2302,11 @@ struct recv_buf
} }
}; };
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
/** Ring buffer wrapper for log_sys.buf[]; recv_sys.len == log_sys.file_size */ /** Ring buffer wrapper for log_sys.buf[]; recv_sys.len == log_sys.file_size */
struct recv_ring : public recv_buf struct recv_ring : public recv_buf
{ {
static constexpr bool is_pmem() { return true; } static constexpr bool is_mmap() { return true; }
constexpr recv_ring(const byte *ptr) : recv_buf(ptr) {} constexpr recv_ring(const byte *ptr) : recv_buf(ptr) {}
@ -2598,7 +2599,7 @@ restart:
ut_d(const source el{l}); ut_d(const source el{l});
lsn+= l - begin; lsn+= l - begin;
offset= l.ptr - log_sys.buf; offset= l.ptr - log_sys.buf;
if (!l.is_pmem()); if (!l.is_mmap());
else if (offset == log_sys.file_size) else if (offset == log_sys.file_size)
offset= log_sys.START_OFFSET; offset= log_sys.START_OFFSET;
else else
@ -3110,12 +3111,12 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool if_exists) noexcept
template template
recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr<false>(bool) noexcept; recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr<false>(bool) noexcept;
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
template<bool store> template<bool store>
recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap(bool if_exists) noexcept
{ {
recv_sys_t::parse_mtr_result r{parse_mtr<store>(if_exists)}; recv_sys_t::parse_mtr_result r{parse_mtr<store>(if_exists)};
if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_pmem()) if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_mmap())
return r; return r;
ut_ad(recv_sys.len == log_sys.file_size); ut_ad(recv_sys.len == log_sys.file_size);
ut_ad(recv_sys.offset >= log_sys.START_OFFSET); ut_ad(recv_sys.offset >= log_sys.START_OFFSET);
@ -3126,6 +3127,10 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept
: &log_sys.buf[recv_sys.offset]}; : &log_sys.buf[recv_sys.offset]};
return recv_sys.parse<recv_ring,store>(s, if_exists); return recv_sys.parse<recv_ring,store>(s, if_exists);
} }
/** for mariadb-backup; @see xtrabackup_copy_mmap_logfile() */
template
recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap<false>(bool) noexcept;
#endif #endif
/** Apply the hashed log records to the page, if the page lsn is less than the /** Apply the hashed log records to the page, if the page lsn is less than the
@ -3996,7 +4001,7 @@ void recv_sys_t::apply(bool last_batch)
log_sort_flush_list(); log_sort_flush_list();
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
if (last_batch && log_sys.is_pmem()) if (last_batch && log_sys.is_mmap() && !log_sys.is_opened())
mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE); mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE);
#endif #endif
@ -4024,15 +4029,13 @@ static bool recv_scan_log(bool last_phase)
bool store{recv_sys.file_checkpoint != 0}; bool store{recv_sys.file_checkpoint != 0};
size_t buf_size= log_sys.buf_size; size_t buf_size= log_sys.buf_size;
#ifdef HAVE_PMEM if (log_sys.is_mmap())
if (log_sys.is_pmem())
{ {
recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn));
buf_size= size_t(log_sys.file_size); buf_size= size_t(log_sys.file_size);
recv_sys.len= size_t(log_sys.file_size); recv_sys.len= size_t(log_sys.file_size);
} }
else else
#endif
{ {
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1; block_size_1;
@ -4094,7 +4097,7 @@ static bool recv_scan_log(bool last_phase)
for (;;) for (;;)
{ {
const byte& b{log_sys.buf[recv_sys.offset]}; const byte& b{log_sys.buf[recv_sys.offset]};
r= recv_sys.parse_pmem<false>(false); r= recv_sys.parse_mmap<false>(false);
switch (r) { switch (r) {
case recv_sys_t::PREMATURE_EOF: case recv_sys_t::PREMATURE_EOF:
goto read_more; goto read_more;
@ -4124,7 +4127,7 @@ static bool recv_scan_log(bool last_phase)
else else
{ {
ut_ad(recv_sys.file_checkpoint != 0); ut_ad(recv_sys.file_checkpoint != 0);
switch ((r= recv_sys.parse_pmem<true>(false))) { switch ((r= recv_sys.parse_mmap<true>(false))) {
case recv_sys_t::PREMATURE_EOF: case recv_sys_t::PREMATURE_EOF:
goto read_more; goto read_more;
case recv_sys_t::GOT_EOF: case recv_sys_t::GOT_EOF:
@ -4146,11 +4149,11 @@ static bool recv_scan_log(bool last_phase)
if (!store) if (!store)
skip_the_rest: skip_the_rest:
while ((r= recv_sys.parse_pmem<false>(false)) == recv_sys_t::OK); while ((r= recv_sys.parse_mmap<false>(false)) == recv_sys_t::OK);
else else
{ {
uint16_t count= 0; uint16_t count= 0;
while ((r= recv_sys.parse_pmem<true>(last_phase)) == recv_sys_t::OK) while ((r= recv_sys.parse_mmap<true>(last_phase)) == recv_sys_t::OK)
if (!++count && recv_sys.report(time(nullptr))) if (!++count && recv_sys.report(time(nullptr)))
{ {
const size_t n= recv_sys.pages.size(); const size_t n= recv_sys.pages.size();
@ -4189,10 +4192,9 @@ static bool recv_scan_log(bool last_phase)
} }
read_more: read_more:
#ifdef HAVE_PMEM if (log_sys.is_mmap())
if (log_sys.is_pmem())
break; break;
#endif
if (recv_sys.is_corrupt_log()) if (recv_sys.is_corrupt_log())
break; break;
@ -4537,13 +4539,13 @@ inline void log_t::set_recovered() noexcept
ut_ad(get_flushed_lsn() == get_lsn()); ut_ad(get_flushed_lsn() == get_lsn());
ut_ad(recv_sys.lsn == get_lsn()); ut_ad(recv_sys.lsn == get_lsn());
size_t offset{recv_sys.offset}; size_t offset{recv_sys.offset};
if (!is_pmem()) if (!is_mmap())
{ {
const size_t bs{log_sys.write_size}, bs_1{bs - 1}; const size_t bs{log_sys.write_size}, bs_1{bs - 1};
memmove_aligned<512>(buf, buf + (offset & ~bs_1), bs); memmove_aligned<512>(buf, buf + (offset & ~bs_1), bs);
offset&= bs_1; offset&= bs_1;
} }
#ifdef HAVE_PMEM #ifndef _WIN32
else else
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
#endif #endif

View file

@ -42,6 +42,7 @@ Created 11/26/1995 Heikki Tuuri
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
void (*mtr_t::commit_logger)(mtr_t *, std::pair<lsn_t,page_flush_ahead>); void (*mtr_t::commit_logger)(mtr_t *, std::pair<lsn_t,page_flush_ahead>);
#endif #endif
std::pair<lsn_t,mtr_t::page_flush_ahead> (*mtr_t::finisher)(mtr_t *, size_t); std::pair<lsn_t,mtr_t::page_flush_ahead> (*mtr_t::finisher)(mtr_t *, size_t);
unsigned mtr_t::spin_wait_delay; unsigned mtr_t::spin_wait_delay;
@ -49,7 +50,7 @@ void mtr_t::finisher_update()
{ {
ut_ad(log_sys.latch_have_wr()); ut_ad(log_sys.latch_have_wr());
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
if (log_sys.is_pmem()) if (log_sys.is_mmap())
{ {
commit_logger= mtr_t::commit_log<true>; commit_logger= mtr_t::commit_log<true>;
finisher= spin_wait_delay finisher= spin_wait_delay
@ -351,11 +352,11 @@ inline lsn_t log_t::get_write_target() const
return write_lsn + max_buf_free / 2; return write_lsn + max_buf_free / 2;
} }
template<bool pmem> template<bool mmap>
void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns) void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
{ {
size_t modified= 0; size_t modified= 0;
const lsn_t write_lsn= pmem ? 0 : log_sys.get_write_target(); const lsn_t write_lsn= mmap ? 0 : log_sys.get_write_target();
if (mtr->m_made_dirty) if (mtr->m_made_dirty)
{ {
@ -475,7 +476,7 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
if (!pmem && UNIV_UNLIKELY(write_lsn != 0)) if (!mmap && UNIV_UNLIKELY(write_lsn != 0))
log_write_up_to(write_lsn, false); log_write_up_to(write_lsn, false);
} }
@ -1011,7 +1012,7 @@ ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn)
else else
latch.rd_unlock(); latch.rd_unlock();
log_write_up_to(lsn, is_pmem()); log_write_up_to(lsn, is_mmap());
if (ex) if (ex)
latch.wr_lock(SRW_LOCK_CALL); latch.wr_lock(SRW_LOCK_CALL);
@ -1027,16 +1028,16 @@ ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn)
/** Reserve space in the log buffer for appending data. /** Reserve space in the log buffer for appending data.
@tparam spin whether to use the spin-only lock_lsn() @tparam spin whether to use the spin-only lock_lsn()
@tparam pmem log_sys.is_pmem() @tparam mmap log_sys.is_mmap()
@param size total length of the data to append(), in bytes @param size total length of the data to append(), in bytes
@param ex whether log_sys.latch is exclusively locked @param ex whether log_sys.latch is exclusively locked
@return the start LSN and the buffer position for append() */ @return the start LSN and the buffer position for append() */
template<bool spin,bool pmem> template<bool spin,bool mmap>
inline inline
std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
{ {
ut_ad(ex ? latch_have_wr() : latch_have_rd()); ut_ad(ex ? latch_have_wr() : latch_have_rd());
ut_ad(pmem == is_pmem()); ut_ad(mmap == is_mmap());
if (!spin) if (!spin)
lsn_lock.wr_lock(); lsn_lock.wr_lock();
size_t b{spin ? lock_lsn() : buf_free.load(std::memory_order_relaxed)}; size_t b{spin ? lock_lsn() : buf_free.load(std::memory_order_relaxed)};
@ -1044,7 +1045,7 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size}; lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size};
if (UNIV_UNLIKELY(pmem if (UNIV_UNLIKELY(mmap
? (end_lsn - ? (end_lsn -
get_flushed_lsn(std::memory_order_relaxed)) > capacity() get_flushed_lsn(std::memory_order_relaxed)) > capacity()
: b + size >= buf_size)) : b + size >= buf_size))
@ -1057,7 +1058,7 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
} }
size_t new_buf_free= b + size; size_t new_buf_free= b + size;
if (pmem && new_buf_free >= file_size) if (mmap && new_buf_free >= file_size)
new_buf_free-= size_t(capacity()); new_buf_free-= size_t(capacity());
lsn.store(end_lsn, std::memory_order_relaxed); lsn.store(end_lsn, std::memory_order_relaxed);
@ -1213,10 +1214,10 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len,
end-= len; end-= len;
size_t s; size_t s;
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
if (!resize_flush_buf) if (!resize_flush_buf)
{ {
ut_ad(is_pmem()); ut_ad(is_mmap());
lsn_lock.wr_lock(); lsn_lock.wr_lock();
const size_t resize_capacity{resize_target - START_OFFSET}; const size_t resize_capacity{resize_target - START_OFFSET};
{ {
@ -1236,7 +1237,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len,
cannot overlap, that is, our entire log must be discarded. cannot overlap, that is, our entire log must be discarded.
Besides, incomplete mini-transactions cannot be parsed anyway. */ Besides, incomplete mini-transactions cannot be parsed anyway. */
ut_ad(resizing >= lsn + len); ut_ad(resizing >= lsn + len);
goto pmem_done; goto mmap_done;
} }
s= START_OFFSET; s= START_OFFSET;
@ -1277,7 +1278,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len,
we will advance resize_lsn. */ we will advance resize_lsn. */
ut_ad(resize_buf[s] <= 1); ut_ad(resize_buf[s] <= 1);
resize_buf[s]= 1; resize_buf[s]= 1;
pmem_done: mmap_done:
lsn_lock.wr_unlock(); lsn_lock.wr_unlock();
} }
else else
@ -1300,12 +1301,12 @@ inline void log_t::append(byte *&d, const void *s, size_t size) noexcept
{ {
ut_ad(log_sys.latch_have_any()); ut_ad(log_sys.latch_have_any());
ut_ad(d + size <= log_sys.buf + ut_ad(d + size <= log_sys.buf +
(log_sys.is_pmem() ? log_sys.file_size : log_sys.buf_size)); (log_sys.is_mmap() ? log_sys.file_size : log_sys.buf_size));
memcpy(d, s, size); memcpy(d, s, size);
d+= size; d+= size;
} }
template<bool spin,bool pmem> template<bool spin,bool mmap>
std::pair<lsn_t,mtr_t::page_flush_ahead> std::pair<lsn_t,mtr_t::page_flush_ahead>
mtr_t::finish_writer(mtr_t *mtr, size_t len) mtr_t::finish_writer(mtr_t *mtr, size_t len)
{ {
@ -1316,16 +1317,14 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len)
const size_t size{mtr->m_commit_lsn ? 5U + 8U : 5U}; const size_t size{mtr->m_commit_lsn ? 5U + 8U : 5U};
std::pair<lsn_t, byte*> start= std::pair<lsn_t, byte*> start=
log_sys.append_prepare<spin,pmem>(len, mtr->m_latch_ex); log_sys.append_prepare<spin,mmap>(len, mtr->m_latch_ex);
if (!pmem) if (!mmap)
{ {
mtr->m_log.for_each_block([&start](const mtr_buf_t::block_t *b) mtr->m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
{ log_sys.append(start.second, b->begin(), b->used()); return true; }); { log_sys.append(start.second, b->begin(), b->used()); return true; });
#ifdef HAVE_PMEM
write_trailer: write_trailer:
#endif
*start.second++= log_sys.get_sequence_bit(start.first + len - size); *start.second++= log_sys.get_sequence_bit(start.first + len - size);
if (mtr->m_commit_lsn) if (mtr->m_commit_lsn)
{ {
@ -1336,7 +1335,6 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len)
mach_write_to_4(start.second, mtr->m_crc); mach_write_to_4(start.second, mtr->m_crc);
start.second+= 4; start.second+= 4;
} }
#ifdef HAVE_PMEM
else else
{ {
if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size])) if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size]))
@ -1384,9 +1382,6 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len)
((size >= size_left) ? log_sys.START_OFFSET : log_sys.file_size) + ((size >= size_left) ? log_sys.START_OFFSET : log_sys.file_size) +
(size - size_left); (size - size_left);
} }
#else
static_assert(!pmem, "");
#endif
log_sys.resize_write(start.first, start.second, len, size); log_sys.resize_write(start.first, start.second, len, size);

View file

@ -1094,7 +1094,8 @@ same_size:
log_sys.latch.wr_unlock(); log_sys.latch.wr_unlock();
log_write_up_to(flushed_lsn, false); if (latest_format)
log_write_up_to(flushed_lsn, false);
ut_ad(flushed_lsn == log_sys.get_lsn()); ut_ad(flushed_lsn == log_sys.get_lsn());
ut_ad(!os_aio_pending_reads()); ut_ad(!os_aio_pending_reads());
@ -1290,10 +1291,7 @@ dberr_t srv_start(bool create_new_db)
} }
#endif /* UNIV_DEBUG */ #endif /* UNIV_DEBUG */
if (!log_sys.create()) { log_sys.create();
return srv_init_abort(DB_ERROR);
}
recv_sys.create(); recv_sys.create();
lock_sys.create(srv_lock_table_size); lock_sys.create(srv_lock_table_size);
@ -1856,13 +1854,13 @@ skip_monitors:
if (srv_print_verbose_log) { if (srv_print_verbose_log) {
sql_print_information("InnoDB: " sql_print_information("InnoDB: "
"log sequence number " LSN_PF "log sequence number " LSN_PF
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
"%s" "%s"
#endif #endif
"; transaction id " TRX_ID_FMT, "; transaction id " TRX_ID_FMT,
recv_sys.lsn, recv_sys.lsn,
#ifdef HAVE_PMEM #ifdef HAVE_INNODB_MMAP
log_sys.is_pmem() log_sys.is_mmap()
? " (memory-mapped)" : "", ? " (memory-mapped)" : "",
#endif #endif
trx_sys.get_max_trx_id()); trx_sys.get_max_trx_id());

View file

@ -1260,20 +1260,20 @@ static void trx_flush_log_if_needed(lsn_t lsn, trx_t *trx)
const bool flush= const bool flush=
(srv_file_flush_method != SRV_NOSYNC && (srv_file_flush_method != SRV_NOSYNC &&
(srv_flush_log_at_trx_commit & 1)); (srv_flush_log_at_trx_commit & 1));
if (!log_sys.is_mmap())
{
completion_callback cb;
completion_callback cb; if ((cb.m_param= thd_increment_pending_ops(trx->mysql_thd)))
if (!log_sys.is_pmem() && {
(cb.m_param= thd_increment_pending_ops(trx->mysql_thd))) cb.m_callback= (void (*)(void *)) thd_decrement_pending_ops;
{ log_write_up_to(lsn, flush, &cb);
cb.m_callback = (void (*)(void *)) thd_decrement_pending_ops; return;
log_write_up_to(lsn, flush, &cb); }
}
else
{
trx->op_info= "flushing log";
log_write_up_to(lsn, flush);
trx->op_info= "";
} }
trx->op_info= "flushing log";
log_write_up_to(lsn, flush);
trx->op_info= "";
} }
/** Process tables that were modified by the committing transaction. */ /** Process tables that were modified by the committing transaction. */