2012-08-01 17:27:34 +03:00
|
|
|
/*****************************************************************************
|
|
|
|
|
2017-01-06 19:48:54 +05:30
|
|
|
Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
|
2022-04-19 13:49:52 +03:00
|
|
|
Copyright (c) 2017, 2022, MariaDB Corporation.
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
|
|
Foundation; version 2 of the License.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
2019-05-11 19:25:02 +03:00
|
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
/**************************************************//**
|
|
|
|
@file buf/buf0dump.cc
|
|
|
|
Implements a buffer pool dump/load.
|
|
|
|
|
|
|
|
Created April 08, 2011 Vasil Dimov
|
|
|
|
*******************************************************/
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "my_global.h"
|
2020-04-28 19:39:40 +03:00
|
|
|
#include "mysqld.h"
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "my_sys.h"
|
|
|
|
|
|
|
|
#include "mysql/psi/mysql_stage.h"
|
|
|
|
#include "mysql/psi/psi.h"
|
2013-03-26 00:03:13 +02:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "buf0buf.h"
|
2012-08-01 17:27:34 +03:00
|
|
|
#include "buf0dump.h"
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "dict0dict.h"
|
|
|
|
#include "os0file.h"
|
|
|
|
#include "srv0srv.h"
|
|
|
|
#include "srv0start.h"
|
|
|
|
#include "ut0byte.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2017-08-31 08:27:59 +03:00
|
|
|
#include "mysql/service_wsrep.h" /* wsrep_recovery */
|
2017-11-30 13:37:59 +11:00
|
|
|
#include <my_service_manager.h>
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
static void buf_do_load_dump();
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
/** Severity of a dump/load status message; selects which log stream
buf_dump_status() and buf_load_status() write the message to. */
enum status_severity {
	/** informational message */
	STATUS_INFO = 0,
	/** error message */
	STATUS_ERR = 1
};
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Evaluates to true once srv_shutdown_state has left SRV_SHUTDOWN_NONE. */
#define SHUTTING_DOWN()	(srv_shutdown_state != SRV_SHUTDOWN_NONE)

/* Flags that tell the buffer pool dump/load task which action it should
take after being woken up. */

/** set by buf_dump_start() to request a dump */
static volatile bool	buf_dump_should_start;
/** set by buf_load_start() to request a load */
static volatile bool	buf_load_should_start;

/** NOTE(review): presumably asks a running load to stop; the readers and
writers of this flag are not visible in this part of the file - confirm. */
static bool	buf_load_abort_flag;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/** Start the buffer pool dump/load task and instructs it to start a dump. */
|
|
|
|
void buf_dump_start()
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_dump_should_start= true;
|
|
|
|
buf_do_load_dump();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/** Start the buffer pool dump/load task and instructs it to start a load. */
|
|
|
|
void buf_load_start()
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_should_start= true;
|
|
|
|
buf_do_load_dump();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************//**
|
|
|
|
Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
|
|
|
|
to the specified string. The format and the following parameters are the
|
|
|
|
same as the ones used for printf(3). The value of this variable can be
|
|
|
|
retrieved by:
|
|
|
|
SELECT variable_value FROM information_schema.global_status WHERE
|
|
|
|
variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
|
|
|
|
or by:
|
|
|
|
SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
|
2016-06-21 14:21:03 +02:00
|
|
|
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
|
2012-08-01 17:27:34 +03:00
|
|
|
void
|
|
|
|
buf_dump_status(
|
|
|
|
/*============*/
|
|
|
|
enum status_severity severity,/*!< in: status severity */
|
|
|
|
const char* fmt, /*!< in: format */
|
|
|
|
...) /*!< in: extra parameters according
|
|
|
|
to fmt */
|
|
|
|
{
|
|
|
|
va_list ap;
|
|
|
|
|
|
|
|
va_start(ap, fmt);
|
|
|
|
|
2017-11-13 04:32:56 +02:00
|
|
|
vsnprintf(
|
2012-08-01 17:27:34 +03:00
|
|
|
export_vars.innodb_buffer_pool_dump_status,
|
|
|
|
sizeof(export_vars.innodb_buffer_pool_dump_status),
|
|
|
|
fmt, ap);
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
switch (severity) {
|
|
|
|
case STATUS_INFO:
|
|
|
|
ib::info() << export_vars.innodb_buffer_pool_dump_status;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case STATUS_ERR:
|
|
|
|
ib::error() << export_vars.innodb_buffer_pool_dump_status;
|
|
|
|
break;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
va_end(ap);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************//**
|
|
|
|
Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
|
|
|
|
to the specified string. The format and the following parameters are the
|
|
|
|
same as the ones used for printf(3). The value of this variable can be
|
|
|
|
retrieved by:
|
|
|
|
SELECT variable_value FROM information_schema.global_status WHERE
|
|
|
|
variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
|
|
|
|
or by:
|
|
|
|
SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
|
2016-06-21 14:21:03 +02:00
|
|
|
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
|
2012-08-01 17:27:34 +03:00
|
|
|
void
|
|
|
|
buf_load_status(
|
|
|
|
/*============*/
|
|
|
|
enum status_severity severity,/*!< in: status severity */
|
|
|
|
const char* fmt, /*!< in: format */
|
|
|
|
...) /*!< in: extra parameters according to fmt */
|
|
|
|
{
|
|
|
|
va_list ap;
|
|
|
|
|
|
|
|
va_start(ap, fmt);
|
|
|
|
|
2017-11-13 04:32:56 +02:00
|
|
|
vsnprintf(
|
2012-08-01 17:27:34 +03:00
|
|
|
export_vars.innodb_buffer_pool_load_status,
|
|
|
|
sizeof(export_vars.innodb_buffer_pool_load_status),
|
|
|
|
fmt, ap);
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
switch (severity) {
|
|
|
|
case STATUS_INFO:
|
|
|
|
ib::info() << export_vars.innodb_buffer_pool_load_status;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case STATUS_ERR:
|
|
|
|
ib::error() << export_vars.innodb_buffer_pool_load_status;
|
|
|
|
break;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
va_end(ap);
|
|
|
|
}
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** Returns the directory path where the buffer pool dump file will be created.
|
|
|
|
@return directory path */
|
|
|
|
static
|
|
|
|
const char*
|
|
|
|
get_buf_dump_dir()
|
|
|
|
{
|
|
|
|
const char* dump_dir;
|
|
|
|
|
|
|
|
/* The dump file should be created in the default data directory if
|
|
|
|
innodb_data_home_dir is set as an empty string. */
|
2020-04-28 14:51:25 +03:00
|
|
|
if (!*srv_data_home) {
|
2016-09-06 09:43:16 +03:00
|
|
|
dump_dir = fil_path_to_mysql_datadir;
|
|
|
|
} else {
|
|
|
|
dump_dir = srv_data_home;
|
|
|
|
}
|
|
|
|
|
|
|
|
return(dump_dir);
|
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Generate the path to the buffer pool dump/load file.
|
|
|
|
@param[out] path generated path
|
|
|
|
@param[in] path_size size of 'path', used as in snprintf(3). */
|
2020-04-28 19:39:40 +03:00
|
|
|
static void buf_dump_generate_path(char *path, size_t path_size)
|
2016-02-16 12:07:18 +01:00
|
|
|
{
|
2016-08-12 11:17:45 +03:00
|
|
|
char buf[FN_REFLEN];
|
|
|
|
|
2020-04-28 19:39:40 +03:00
|
|
|
mysql_mutex_lock(&LOCK_global_system_variables);
|
MDEV-25312 Replace fil_space_t::name with fil_space_t::name()
A consistency check for fil_space_t::name is causing recovery failures
in MDEV-25180 (Atomic ALTER TABLE). So, we'd better remove that field
altogether.
fil_space_t::name was more or less a copy of dict_table_t::name
(except for some special cases), and it was not being used for
anything useful.
There used to be a name_hash, but it had been removed already in
commit a75dbfd7183cc96680f3e3e684fd36500dac8158 (MDEV-12266).
We will also remove os_normalize_path(), OS_PATH_SEPARATOR,
OS_PATH_SEPATOR_ALT. On Microsoft Windows, we will treat \ and /
roughly in the same way. The intention is that for per-table
tablespaces, the filenames will always follow the pattern
prefix/databasename/tablename.ibd. (Any \ in the prefix must not
be converted.)
ut_basename_noext(): Remove (unused function).
read_link_file(): Replaces RemoteDatafile::read_link_file().
We will ensure that the last two path component separators are
forward slashes (converting up to 2 trailing backslashes on
Microsoft Windows), so that everywhere else we can
assume that data file names end in "/databasename/tablename.ibd".
Note: On Microsoft Windows, path names that start with \\?\ must
not contain / as path component separators. Previously, such paths
did work in the DATA DIRECTORY argument of InnoDB tables.
Reviewed by: Vladislav Vaintroub
2021-04-07 18:01:13 +03:00
|
|
|
snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(),
|
|
|
|
srv_buf_dump_filename);
|
2020-04-28 19:39:40 +03:00
|
|
|
mysql_mutex_unlock(&LOCK_global_system_variables);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
os_file_type_t type;
|
|
|
|
bool exists = false;
|
|
|
|
bool ret;
|
|
|
|
|
|
|
|
ret = os_file_status(buf, &exists, &type);
|
|
|
|
|
|
|
|
/* For realpath() to succeed the file must exist. */
|
|
|
|
|
|
|
|
if (ret && exists) {
|
|
|
|
/* my_realpath() assumes the destination buffer is big enough
|
|
|
|
to hold FN_REFLEN bytes. */
|
|
|
|
ut_a(path_size >= FN_REFLEN);
|
2016-02-16 12:07:18 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
my_realpath(path, buf, 0);
|
2016-02-16 12:07:18 +01:00
|
|
|
} else {
|
2016-08-12 11:17:45 +03:00
|
|
|
/* If it does not exist, then resolve only srv_data_home
|
|
|
|
and append srv_buf_dump_filename to it. */
|
|
|
|
char srv_data_home_full[FN_REFLEN];
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
my_realpath(srv_data_home_full, get_buf_dump_dir(), 0);
|
MDEV-25312 Replace fil_space_t::name with fil_space_t::name()
A consistency check for fil_space_t::name is causing recovery failures
in MDEV-25180 (Atomic ALTER TABLE). So, we'd better remove that field
altogether.
fil_space_t::name was more or less a copy of dict_table_t::name
(except for some special cases), and it was not being used for
anything useful.
There used to be a name_hash, but it had been removed already in
commit a75dbfd7183cc96680f3e3e684fd36500dac8158 (MDEV-12266).
We will also remove os_normalize_path(), OS_PATH_SEPARATOR,
OS_PATH_SEPATOR_ALT. On Microsoft Windows, we will treat \ and /
roughly in the same way. The intention is that for per-table
tablespaces, the filenames will always follow the pattern
prefix/databasename/tablename.ibd. (Any \ in the prefix must not
be converted.)
ut_basename_noext(): Remove (unused function).
read_link_file(): Replaces RemoteDatafile::read_link_file().
We will ensure that the last two path component separators are
forward slashes (converting up to 2 trailing backslashes on
Microsoft Windows), so that everywhere else we can
assume that data file names end in "/databasename/tablename.ibd".
Note: On Microsoft Windows, path names that start with \\?\ must
not contain / as path component separators. Previously, such paths
did work in the DATA DIRECTORY argument of InnoDB tables.
Reviewed by: Vladislav Vaintroub
2021-04-07 18:01:13 +03:00
|
|
|
const char *format;
|
|
|
|
|
|
|
|
switch (srv_data_home_full[strlen(srv_data_home_full) - 1]) {
|
|
|
|
#ifdef _WIN32
|
|
|
|
case '\\':
|
|
|
|
#endif
|
|
|
|
case '/':
|
|
|
|
format = "%s%s";
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
format = "%s/%s";
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
MDEV-25312 Replace fil_space_t::name with fil_space_t::name()
A consistency check for fil_space_t::name is causing recovery failures
in MDEV-25180 (Atomic ALTER TABLE). So, we'd better remove that field
altogether.
fil_space_t::name was more or less a copy of dict_table_t::name
(except for some special cases), and it was not being used for
anything useful.
There used to be a name_hash, but it had been removed already in
commit a75dbfd7183cc96680f3e3e684fd36500dac8158 (MDEV-12266).
We will also remove os_normalize_path(), OS_PATH_SEPARATOR,
OS_PATH_SEPATOR_ALT. On Microsoft Windows, we will treat \ and /
roughly in the same way. The intention is that for per-table
tablespaces, the filenames will always follow the pattern
prefix/databasename/tablename.ibd. (Any \ in the prefix must not
be converted.)
ut_basename_noext(): Remove (unused function).
read_link_file(): Replaces RemoteDatafile::read_link_file().
We will ensure that the last two path component separators are
forward slashes (converting up to 2 trailing backslashes on
Microsoft Windows), so that everywhere else we can
assume that data file names end in "/databasename/tablename.ibd".
Note: On Microsoft Windows, path names that start with \\?\ must
not contain / as path component separators. Previously, such paths
did work in the DATA DIRECTORY argument of InnoDB tables.
Reviewed by: Vladislav Vaintroub
2021-04-07 18:01:13 +03:00
|
|
|
|
|
|
|
snprintf(path, path_size, format,
|
|
|
|
srv_data_home_full, srv_buf_dump_filename);
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
2016-02-16 12:07:18 +01:00
|
|
|
}
|
|
|
|
|
2020-05-11 22:01:40 +02:00
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
/*****************************************************************//**
|
|
|
|
Perform a buffer pool dump into the file specified by
|
|
|
|
innodb_buffer_pool_filename. If any errors occur then the value of
|
|
|
|
innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
|
|
|
|
The dump filename can be specified by (relative to srv_data_home):
|
|
|
|
SET GLOBAL innodb_buffer_pool_filename='filename'; */
|
|
|
|
static
|
|
|
|
void
|
|
|
|
buf_dump(
|
|
|
|
/*=====*/
|
|
|
|
ibool obey_shutdown) /*!< in: quit if we are in a shutting down
|
|
|
|
state */
|
|
|
|
{
|
|
|
|
#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown)
|
|
|
|
|
|
|
|
char full_filename[OS_FILE_MAX_PATH];
|
2018-07-30 14:05:24 +03:00
|
|
|
char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"];
|
2012-08-01 17:27:34 +03:00
|
|
|
char now[32];
|
|
|
|
FILE* f;
|
|
|
|
int ret;
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_generate_path(full_filename, sizeof(full_filename));
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2017-11-11 23:07:24 +02:00
|
|
|
snprintf(tmp_filename, sizeof(tmp_filename),
|
|
|
|
"%s.incomplete", full_filename);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
|
2012-08-01 17:27:34 +03:00
|
|
|
full_filename);
|
|
|
|
|
2020-05-11 22:01:40 +02:00
|
|
|
#ifdef _WIN32
|
|
|
|
/* use my_fopen() for correct permissions during bootstrap*/
|
|
|
|
f = my_fopen(tmp_filename, O_RDWR|O_TRUNC|O_CREAT, 0);
|
2021-06-06 13:21:03 +02:00
|
|
|
#elif defined(__GLIBC__) || O_CLOEXEC == 0
|
2018-03-02 10:16:46 +11:00
|
|
|
f = fopen(tmp_filename, "w" STR_O_CLOEXEC);
|
2018-03-14 13:31:28 +11:00
|
|
|
#else
|
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640);
|
|
|
|
if (fd >= 0) {
|
|
|
|
f = fdopen(fd, "w");
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
f = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2012-08-01 17:27:34 +03:00
|
|
|
if (f == NULL) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot open '%s' for writing: %s",
|
|
|
|
tmp_filename, strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
2020-02-12 14:45:21 +02:00
|
|
|
const buf_page_t* bpage;
|
2020-05-27 09:00:52 +03:00
|
|
|
page_id_t* dump;
|
2020-02-12 14:45:21 +02:00
|
|
|
ulint n_pages;
|
|
|
|
ulint j;
|
|
|
|
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took" message
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_lock(&buf_pool.mutex);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-03-18 21:48:00 +02:00
|
|
|
n_pages = UT_LIST_GET_LEN(buf_pool.LRU);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
/* skip empty buffer pools */
|
|
|
|
if (n_pages == 0) {
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took"
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t with native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_unlock(&buf_pool.mutex);
|
2020-02-12 14:45:21 +02:00
|
|
|
goto done;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
if (srv_buf_pool_dump_pct != 100) {
|
|
|
|
ulint t_pages;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
/* limit the number of total pages dumped to X% of the
|
|
|
|
total number of pages */
|
2020-03-18 21:48:00 +02:00
|
|
|
t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100;
|
2020-02-12 14:45:21 +02:00
|
|
|
if (n_pages > t_pages) {
|
|
|
|
buf_dump_status(STATUS_INFO,
|
|
|
|
"Restricted to " ULINTPF
|
|
|
|
" pages due to "
|
|
|
|
"innodb_buf_pool_dump_pct=%lu",
|
|
|
|
t_pages, srv_buf_pool_dump_pct);
|
|
|
|
n_pages = t_pages;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (n_pages == 0) {
|
2020-02-12 14:45:21 +02:00
|
|
|
n_pages = 1;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
2020-02-12 14:45:21 +02:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-05-27 09:00:52 +03:00
|
|
|
dump = static_cast<page_id_t*>(ut_malloc_nokey(
|
|
|
|
n_pages * sizeof(*dump)));
|
2015-11-29 18:08:42 +11:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
if (dump == NULL) {
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took"
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t with native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_unlock(&buf_pool.mutex);
|
2020-02-12 14:45:21 +02:00
|
|
|
fclose(f);
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot allocate " ULINTPF " bytes: %s",
|
|
|
|
(ulint) (n_pages * sizeof(*dump)),
|
|
|
|
strerror(errno));
|
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-03-18 21:48:00 +02:00
|
|
|
for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
|
2020-02-12 14:45:21 +02:00
|
|
|
bpage != NULL && j < n_pages;
|
|
|
|
bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
|
MDEV-27058: Reduce the size of buf_block_t and buf_page_t
buf_page_t::frame: Moved from buf_block_t::frame.
All 'thin' buf_page_t describing compressed-only ROW_FORMAT=COMPRESSED
pages will have frame=nullptr, while all 'fat' buf_block_t
will have a non-null frame pointing to aligned innodb_page_size bytes.
This eliminates the need for separate states for
BUF_BLOCK_FILE_PAGE and BUF_BLOCK_ZIP_PAGE.
buf_page_t::lock: Moved from buf_block_t::lock. That is, all block
descriptors will have a page latch. The IO_PIN state that was used
for discarding or creating the uncompressed page frame of a
ROW_FORMAT=COMPRESSED block is replaced by a combination of read-fix
and page X-latch.
page_zip_des_t::fix: Replaces state_, buf_fix_count_, io_fix_, status
of buf_page_t with a single std::atomic<uint32_t>. All modifications
will use store(), fetch_add(), fetch_sub(). This space was previously
wasted to alignment on 64-bit systems. We will use the following encoding
that combines a state (partly read-fix or write-fix) and a buffer-fix
count:
buf_page_t::NOT_USED=0 (previously BUF_BLOCK_NOT_USED)
buf_page_t::MEMORY=1 (previously BUF_BLOCK_MEMORY)
buf_page_t::REMOVE_HASH=2 (previously BUF_BLOCK_REMOVE_HASH)
buf_page_t::FREED=3 + fix: pages marked as freed in the file
buf_page_t::UNFIXED=1U<<29 + fix: normal pages
buf_page_t::IBUF_EXIST=2U<<29 + fix: normal pages; may need ibuf merge
buf_page_t::REINIT=3U<<29 + fix: reinitialized pages (skip doublewrite)
buf_page_t::READ_FIX=4U<<29 + fix: read-fixed pages (also X-latched)
buf_page_t::WRITE_FIX=5U<<29 + fix: write-fixed pages (also U-latched)
buf_page_t::WRITE_FIX_IBUF=6U<<29 + fix: write-fixed; may have ibuf
buf_page_t::WRITE_FIX_REINIT=7U<<29 + fix: write-fixed (no doublewrite)
buf_page_t::write_complete(): Change WRITE_FIX or WRITE_FIX_REINIT to
UNFIXED, and WRITE_FIX_IBUF to IBUF_EXIST, before releasing the U-latch.
buf_page_t::read_complete(): Renamed from buf_page_read_complete().
Change READ_FIX to UNFIXED or IBUF_EXIST, before releasing the X-latch.
buf_page_t::can_relocate(): If the page latch is being held or waited for,
or the block is buffer-fixed or io-fixed, return false. (The condition
on the page latch is new.)
Outside buf_page_get_gen(), buf_page_get_low() and buf_page_free(), we
will acquire the page latch before fix(), and unfix() before unlocking.
buf_page_t::flush(): Replaces buf_flush_page(). Optimize the
handling of FREED pages.
buf_pool_t::release_freed_page(): Assume that buf_pool.mutex is held
by the caller.
buf_page_t::is_read_fixed(), buf_page_t::is_write_fixed(): New predicates.
buf_page_get_low(): Ignore guesses that are read-fixed because they
may not yet be registered in buf_pool.page_hash and buf_pool.LRU.
buf_page_optimistic_get(): Acquire latch before buffer-fixing.
buf_page_make_young(): Leave read-fixed blocks alone, because they
might not be registered in buf_pool.LRU yet.
recv_sys_t::recover_deferred(), recv_sys_t::recover_low():
Possibly fix MDEV-26326, by holding a page X-latch instead of
only buffer-fixing the page.
2021-11-16 19:55:06 +02:00
|
|
|
const auto status = bpage->state();
|
|
|
|
if (status < buf_page_t::UNFIXED) {
|
|
|
|
ut_a(status >= buf_page_t::FREED);
|
2020-02-12 14:45:21 +02:00
|
|
|
continue;
|
2015-11-29 18:08:42 +11:00
|
|
|
}
|
MDEV-27058: Reduce the size of buf_block_t and buf_page_t
buf_page_t::frame: Moved from buf_block_t::frame.
All 'thin' buf_page_t describing compressed-only ROW_FORMAT=COMPRESSED
pages will have frame=nullptr, while all 'fat' buf_block_t
will have a non-null frame pointing to aligned innodb_page_size bytes.
This eliminates the need for separate states for
BUF_BLOCK_FILE_PAGE and BUF_BLOCK_ZIP_PAGE.
buf_page_t::lock: Moved from buf_block_t::lock. That is, all block
descriptors will have a page latch. The IO_PIN state that was used
for discarding or creating the uncompressed page frame of a
ROW_FORMAT=COMPRESSED block is replaced by a combination of read-fix
and page X-latch.
page_zip_des_t::fix: Replaces state_, buf_fix_count_, io_fix_, status
of buf_page_t with a single std::atomic<uint32_t>. All modifications
will use store(), fetch_add(), fetch_sub(). This space was previously
wasted to alignment on 64-bit systems. We will use the following encoding
that combines a state (partly read-fix or write-fix) and a buffer-fix
count:
buf_page_t::NOT_USED=0 (previously BUF_BLOCK_NOT_USED)
buf_page_t::MEMORY=1 (previously BUF_BLOCK_MEMORY)
buf_page_t::REMOVE_HASH=2 (previously BUF_BLOCK_REMOVE_HASH)
buf_page_t::FREED=3 + fix: pages marked as freed in the file
buf_page_t::UNFIXED=1U<<29 + fix: normal pages
buf_page_t::IBUF_EXIST=2U<<29 + fix: normal pages; may need ibuf merge
buf_page_t::REINIT=3U<<29 + fix: reinitialized pages (skip doublewrite)
buf_page_t::READ_FIX=4U<<29 + fix: read-fixed pages (also X-latched)
buf_page_t::WRITE_FIX=5U<<29 + fix: write-fixed pages (also U-latched)
buf_page_t::WRITE_FIX_IBUF=6U<<29 + fix: write-fixed; may have ibuf
buf_page_t::WRITE_FIX_REINIT=7U<<29 + fix: write-fixed (no doublewrite)
buf_page_t::write_complete(): Change WRITE_FIX or WRITE_FIX_REINIT to
UNFIXED, and WRITE_FIX_IBUF to IBUF_EXIST, before releasing the U-latch.
buf_page_t::read_complete(): Renamed from buf_page_read_complete().
Change READ_FIX to UNFIXED or IBUF_EXIST, before releasing the X-latch.
buf_page_t::can_relocate(): If the page latch is being held or waited for,
or the block is buffer-fixed or io-fixed, return false. (The condition
on the page latch is new.)
Outside buf_page_get_gen(), buf_page_get_low() and buf_page_free(), we
will acquire the page latch before fix(), and unfix() before unlocking.
buf_page_t::flush(): Replaces buf_flush_page(). Optimize the
handling of FREED pages.
buf_pool_t::release_freed_page(): Assume that buf_pool.mutex is held
by the caller.
buf_page_t::is_read_fixed(), buf_page_t::is_write_fixed(): New predicates.
buf_page_get_low(): Ignore guesses that are read-fixed because they
may not yet be registered in buf_pool.page_hash and buf_pool.LRU.
buf_page_optimistic_get(): Acquire latch before buffer-fixing.
buf_page_make_young(): Leave read-fixed blocks alone, because they
might not be registered in buf_pool.LRU yet.
recv_sys_t::recover_deferred(), recv_sys_t::recover_low():
Possibly fix MDEV-26326, by holding a page X-latch instead of
only buffer-fixing the page.
2021-11-16 19:55:06 +02:00
|
|
|
const page_id_t id{bpage->id()};
|
2015-11-29 18:08:42 +11:00
|
|
|
|
MDEV-27058: Reduce the size of buf_block_t and buf_page_t
buf_page_t::frame: Moved from buf_block_t::frame.
All 'thin' buf_page_t describing compressed-only ROW_FORMAT=COMPRESSED
pages will have frame=nullptr, while all 'fat' buf_block_t
will have a non-null frame pointing to aligned innodb_page_size bytes.
This eliminates the need for separate states for
BUF_BLOCK_FILE_PAGE and BUF_BLOCK_ZIP_PAGE.
buf_page_t::lock: Moved from buf_block_t::lock. That is, all block
descriptors will have a page latch. The IO_PIN state that was used
for discarding or creating the uncompressed page frame of a
ROW_FORMAT=COMPRESSED block is replaced by a combination of read-fix
and page X-latch.
page_zip_des_t::fix: Replaces state_, buf_fix_count_, io_fix_, status
of buf_page_t with a single std::atomic<uint32_t>. All modifications
will use store(), fetch_add(), fetch_sub(). This space was previously
wasted to alignment on 64-bit systems. We will use the following encoding
that combines a state (partly read-fix or write-fix) and a buffer-fix
count:
buf_page_t::NOT_USED=0 (previously BUF_BLOCK_NOT_USED)
buf_page_t::MEMORY=1 (previously BUF_BLOCK_MEMORY)
buf_page_t::REMOVE_HASH=2 (previously BUF_BLOCK_REMOVE_HASH)
buf_page_t::FREED=3 + fix: pages marked as freed in the file
buf_page_t::UNFIXED=1U<<29 + fix: normal pages
buf_page_t::IBUF_EXIST=2U<<29 + fix: normal pages; may need ibuf merge
buf_page_t::REINIT=3U<<29 + fix: reinitialized pages (skip doublewrite)
buf_page_t::READ_FIX=4U<<29 + fix: read-fixed pages (also X-latched)
buf_page_t::WRITE_FIX=5U<<29 + fix: write-fixed pages (also U-latched)
buf_page_t::WRITE_FIX_IBUF=6U<<29 + fix: write-fixed; may have ibuf
buf_page_t::WRITE_FIX_REINIT=7U<<29 + fix: write-fixed (no doublewrite)
buf_page_t::write_complete(): Change WRITE_FIX or WRITE_FIX_REINIT to
UNFIXED, and WRITE_FIX_IBUF to IBUF_EXIST, before releasing the U-latch.
buf_page_t::read_complete(): Renamed from buf_page_read_complete().
Change READ_FIX to UNFIXED or IBUF_EXIST, before releasing the X-latch.
buf_page_t::can_relocate(): If the page latch is being held or waited for,
or the block is buffer-fixed or io-fixed, return false. (The condition
on the page latch is new.)
Outside buf_page_get_gen(), buf_page_get_low() and buf_page_free(), we
will acquire the page latch before fix(), and unfix() before unlocking.
buf_page_t::flush(): Replaces buf_flush_page(). Optimize the
handling of FREED pages.
buf_pool_t::release_freed_page(): Assume that buf_pool.mutex is held
by the caller.
buf_page_t::is_read_fixed(), buf_page_t::is_write_fixed(): New predicates.
buf_page_get_low(): Ignore guesses that are read-fixed because they
may not yet be registered in buf_pool.page_hash and buf_pool.LRU.
buf_page_optimistic_get(): Acquire latch before buffer-fixing.
buf_page_make_young(): Leave read-fixed blocks alone, because they
might not be registered in buf_pool.LRU yet.
recv_sys_t::recover_deferred(), recv_sys_t::recover_low():
Possibly fix MDEV-26326, by holding a page X-latch instead of
only buffer-fixing the page.
2021-11-16 19:55:06 +02:00
|
|
|
if (id.space() == SRV_TMP_SPACE_ID) {
|
|
|
|
/* Ignore the innodb_temporary tablespace. */
|
2021-01-27 16:24:37 +05:30
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-05-27 09:00:52 +03:00
|
|
|
dump[j++] = id;
|
2020-02-12 14:45:21 +02:00
|
|
|
}
|
|
|
|
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took"
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_unlock(&buf_pool.mutex);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
ut_a(j <= n_pages);
|
|
|
|
n_pages = j;
|
|
|
|
|
|
|
|
for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
|
2020-05-27 09:00:52 +03:00
|
|
|
ret = fprintf(f, "%u,%u\n",
|
|
|
|
dump[j].space(), dump[j].page_no());
|
2020-02-12 14:45:21 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
ut_free(dump);
|
2012-08-01 17:27:34 +03:00
|
|
|
fclose(f);
|
|
|
|
buf_dump_status(STATUS_ERR,
|
2020-02-12 14:45:21 +02:00
|
|
|
"Cannot write to '%s': %s",
|
|
|
|
tmp_filename, strerror(errno));
|
2012-08-01 17:27:34 +03:00
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
2020-02-12 14:45:21 +02:00
|
|
|
if (SHUTTING_DOWN() && !(j & 1023)) {
|
|
|
|
service_manager_extend_timeout(
|
|
|
|
INNODB_EXTEND_TIMEOUT_INTERVAL,
|
|
|
|
"Dumping buffer pool page "
|
|
|
|
ULINTPF "/" ULINTPF, j + 1, n_pages);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
ut_free(dump);
|
|
|
|
|
|
|
|
done:
|
2020-05-11 22:01:40 +02:00
|
|
|
ret = IF_WIN(my_fclose(f,0),fclose(f));
|
2012-08-01 17:27:34 +03:00
|
|
|
if (ret != 0) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot close '%s': %s",
|
|
|
|
tmp_filename, strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
ret = unlink(full_filename);
|
|
|
|
if (ret != 0 && errno != ENOENT) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot delete '%s': %s",
|
|
|
|
full_filename, strerror(errno));
|
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
ret = rename(tmp_filename, full_filename);
|
|
|
|
if (ret != 0) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot rename '%s' to '%s': %s",
|
|
|
|
tmp_filename, full_filename,
|
|
|
|
strerror(errno));
|
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
/* success */
|
|
|
|
|
|
|
|
ut_sprintf_timestamp(now);
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Buffer pool(s) dump completed at %s", now);
|
2016-10-06 15:16:18 +02:00
|
|
|
|
|
|
|
/* Though dumping isn't related to an incomplete load,
|
|
|
|
we reset this to 0 here to indicate that a shutdown can also perform
|
|
|
|
a dump */
|
|
|
|
export_vars.innodb_buffer_pool_load_incomplete = 0;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************//**
|
|
|
|
Perform a buffer pool load from the file specified by
|
|
|
|
innodb_buffer_pool_filename. If any errors occur then the value of
|
|
|
|
innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
|
|
|
|
The dump filename can be specified by (relative to srv_data_home):
|
|
|
|
SET GLOBAL innodb_buffer_pool_filename='filename'; */
|
|
|
|
static
|
|
|
|
void
|
|
|
|
buf_load()
|
|
|
|
/*======*/
|
|
|
|
{
|
|
|
|
char full_filename[OS_FILE_MAX_PATH];
|
|
|
|
char now[32];
|
|
|
|
FILE* f;
|
2020-05-27 09:00:52 +03:00
|
|
|
page_id_t* dump;
|
2012-08-01 17:27:34 +03:00
|
|
|
ulint dump_n;
|
|
|
|
ulint i;
|
2020-10-15 16:28:19 +03:00
|
|
|
uint32_t space_id;
|
|
|
|
uint32_t page_no;
|
2012-08-01 17:27:34 +03:00
|
|
|
int fscanf_ret;
|
|
|
|
|
|
|
|
/* Ignore any leftovers from before */
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_generate_path(full_filename, sizeof(full_filename));
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_load_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Loading buffer pool(s) from %s", full_filename);
|
|
|
|
|
2018-03-02 10:16:46 +11:00
|
|
|
f = fopen(full_filename, "r" STR_O_CLOEXEC);
|
2012-08-01 17:27:34 +03:00
|
|
|
if (f == NULL) {
|
2017-04-04 12:19:42 +03:00
|
|
|
buf_load_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Cannot open '%s' for reading: %s",
|
|
|
|
full_filename, strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
/* First scan the file to estimate how many entries are in it.
|
|
|
|
This file is tiny (approx 500KB per 1GB buffer pool), reading it
|
|
|
|
two times is fine. */
|
|
|
|
dump_n = 0;
|
2020-10-15 16:28:19 +03:00
|
|
|
while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
|
2012-08-01 17:27:34 +03:00
|
|
|
&& !SHUTTING_DOWN()) {
|
|
|
|
dump_n++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!SHUTTING_DOWN() && !feof(f)) {
|
|
|
|
/* fscanf() returned != 2 */
|
|
|
|
const char* what;
|
|
|
|
if (ferror(f)) {
|
|
|
|
what = "reading";
|
|
|
|
} else {
|
|
|
|
what = "parsing";
|
|
|
|
}
|
|
|
|
fclose(f);
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_load_status(STATUS_ERR, "Error %s '%s',"
|
|
|
|
" unable to load buffer pool (stage 1)",
|
2012-08-01 17:27:34 +03:00
|
|
|
what, full_filename);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If dump is larger than the buffer pool(s), then we ignore the
|
|
|
|
extra trailing entries. This could happen if a dump is made, then buffer
|
2016-08-12 11:17:45 +03:00
|
|
|
pool is shrunk and then load is attempted. */
|
2020-03-18 21:48:00 +02:00
|
|
|
dump_n = std::min(dump_n, buf_pool.get_n_pages());
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
if (dump_n != 0) {
|
2020-05-27 09:00:52 +03:00
|
|
|
dump = static_cast<page_id_t*>(ut_malloc_nokey(
|
2016-09-06 09:43:16 +03:00
|
|
|
dump_n * sizeof(*dump)));
|
|
|
|
} else {
|
|
|
|
fclose(f);
|
|
|
|
ut_sprintf_timestamp(now);
|
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load completed at %s"
|
|
|
|
" (%s was empty)", now, full_filename);
|
|
|
|
return;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (dump == NULL) {
|
|
|
|
fclose(f);
|
|
|
|
buf_load_status(STATUS_ERR,
|
2017-06-06 11:50:42 +03:00
|
|
|
"Cannot allocate " ULINTPF " bytes: %s",
|
|
|
|
dump_n * sizeof(*dump),
|
2012-08-01 17:27:34 +03:00
|
|
|
strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
rewind(f);
|
|
|
|
|
2016-10-06 15:16:18 +02:00
|
|
|
export_vars.innodb_buffer_pool_load_incomplete = 1;
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
|
2020-10-15 16:28:19 +03:00
|
|
|
fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (fscanf_ret != 2) {
|
|
|
|
if (feof(f)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
ut_free(dump);
|
|
|
|
fclose(f);
|
|
|
|
buf_load_status(STATUS_ERR,
|
2016-08-12 11:17:45 +03:00
|
|
|
"Error parsing '%s', unable"
|
|
|
|
" to load buffer pool (stage 2)",
|
2012-08-01 17:27:34 +03:00
|
|
|
full_filename);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
|
|
|
|
ut_free(dump);
|
|
|
|
fclose(f);
|
|
|
|
buf_load_status(STATUS_ERR,
|
2016-08-12 11:17:45 +03:00
|
|
|
"Error parsing '%s': bogus"
|
2020-10-15 16:28:19 +03:00
|
|
|
" space,page %u,%u at line " ULINTPF
|
|
|
|
", unable to load buffer pool",
|
2012-08-01 17:27:34 +03:00
|
|
|
full_filename,
|
|
|
|
space_id, page_no,
|
|
|
|
i);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-05-27 09:00:52 +03:00
|
|
|
dump[i] = page_id_t(space_id, page_no);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Set dump_n to the actual number of initialized elements,
|
|
|
|
i could be smaller than dump_n here if the file got truncated after
|
|
|
|
we read it the first time. */
|
|
|
|
dump_n = i;
|
|
|
|
|
|
|
|
fclose(f);
|
|
|
|
|
|
|
|
if (dump_n == 0) {
|
|
|
|
ut_free(dump);
|
|
|
|
ut_sprintf_timestamp(now);
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load completed at %s"
|
2016-12-06 16:39:23 +11:00
|
|
|
" (%s was empty or had errors)", now, full_filename);
|
2012-08-01 17:27:34 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!SHUTTING_DOWN()) {
|
2016-08-12 11:17:45 +03:00
|
|
|
std::sort(dump, dump + dump_n);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2020-10-26 16:04:12 +02:00
|
|
|
/* Avoid calling the expensive fil_space_t::get() for each
|
2016-08-12 11:17:45 +03:00
|
|
|
page within the same tablespace. dump[] is sorted by (space, page),
|
|
|
|
so all pages from a given tablespace are consecutive. */
|
2020-05-27 09:00:52 +03:00
|
|
|
ulint cur_space_id = dump[0].space();
|
2020-10-26 16:04:12 +02:00
|
|
|
fil_space_t* space = fil_space_t::get(cur_space_id);
|
2019-02-06 19:50:11 +02:00
|
|
|
ulint zip_size = space ? space->zip_size() : 0;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2020-02-15 18:25:57 +01:00
|
|
|
PSI_stage_progress* pfs_stage_progress __attribute__((unused))
|
2016-08-12 11:17:45 +03:00
|
|
|
= mysql_set_stage(srv_stage_buffer_pool_load.m_key);
|
|
|
|
mysql_stage_set_work_estimated(pfs_stage_progress, dump_n);
|
|
|
|
mysql_stage_set_work_completed(pfs_stage_progress, 0);
|
2015-11-29 18:08:42 +11:00
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/* space_id for this iteration of the loop */
|
2020-05-27 09:00:52 +03:00
|
|
|
const ulint this_space_id = dump[i].space();
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2019-11-25 22:32:24 +07:00
|
|
|
if (this_space_id == SRV_TMP_SPACE_ID) {
|
2018-03-29 13:22:16 +03:00
|
|
|
/* Ignore the innodb_temporary tablespace. */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
if (this_space_id != cur_space_id) {
|
2020-10-26 15:59:30 +02:00
|
|
|
if (space) {
|
2018-04-23 13:15:54 +03:00
|
|
|
space->release();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
cur_space_id = this_space_id;
|
2020-10-26 16:04:12 +02:00
|
|
|
space = fil_space_t::get(cur_space_id);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2020-10-26 15:59:30 +02:00
|
|
|
if (!space) {
|
|
|
|
continue;
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
2020-10-26 15:59:30 +02:00
|
|
|
|
|
|
|
zip_size = space->zip_size();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* JAN: TODO: As we use background page read below,
|
|
|
|
if tablespace is encrypted we cant use it. */
|
2020-10-26 15:59:30 +02:00
|
|
|
if (!space || dump[i].page_no() >= space->get_size() ||
|
|
|
|
(space->crypt_data &&
|
|
|
|
space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
|
|
|
|
space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
|
2016-08-12 11:17:45 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
MDEV-23855: Improve InnoDB log checkpoint performance
After MDEV-15053, MDEV-22871, MDEV-23399 shifted the scalability
bottleneck, log checkpoints became a new bottleneck.
If innodb_io_capacity is set low or innodb_max_dirty_pct_lwm is
set high and the workload fits in the buffer pool, the page cleaner
thread will perform very little flushing. When we reach the capacity
of the circular redo log file ib_logfile0 and must initiate a checkpoint,
some 'furious flushing' will be necessary. (If innodb_flush_sync=OFF,
then flushing would continue at the innodb_io_capacity rate, and
writers would be throttled.)
We have the best chance of advancing the checkpoint LSN immediately
after a page flush batch has been completed. Hence, it is best to
perform checkpoints after every batch in the page cleaner thread,
attempting to run once per second.
By initiating high-priority flushing in the page cleaner as early
as possible, we aim to make the throughput more stable.
The function buf_flush_wait_flushed() used to sleep for 10ms, hoping
that the page cleaner thread would do something during that time.
The observed end result was that a large number of threads that call
log_free_check() would end up sleeping while nothing useful is happening.
We will revise the design so that in the default innodb_flush_sync=ON
mode, buf_flush_wait_flushed() will wake up the page cleaner thread
to perform the necessary flushing, and it will wait for a signal from
the page cleaner thread.
If innodb_io_capacity is set to a low value (causing the page cleaner to
throttle its work), a write workload would initially perform well, until
the capacity of the circular ib_logfile0 is reached and log_free_check()
will trigger checkpoints. At that point, the extra waiting in
buf_flush_wait_flushed() will start reducing throughput.
The page cleaner thread will also initiate log checkpoints after each
buf_flush_lists() call, because that is the best point of time for
the checkpoint LSN to advance by the maximum amount.
Even in 'furious flushing' mode we invoke buf_flush_lists() with
innodb_io_capacity_max pages at a time, and at the start of each
batch (in the log_flush() callback function that runs in a separate
task) we will invoke os_aio_wait_until_no_pending_writes(). This
tweak allows the checkpoint to advance in smaller steps and
significantly reduces the maximum latency. On an Intel Optane 960
NVMe SSD on Linux, it reduced from 4.6 seconds to 74 milliseconds.
On Microsoft Windows with a slower SSD, it reduced from more than
180 seconds to 0.6 seconds.
We will make innodb_adaptive_flushing=OFF simply flush innodb_io_capacity
per second whenever the dirty proportion of buffer pool pages exceeds
innodb_max_dirty_pages_pct_lwm. For innodb_adaptive_flushing=ON we try
to make page_cleaner_flush_pages_recommendation() more consistent and
predictable: if we are below innodb_adaptive_flushing_lwm, let us flush
pages according to the return value of af_get_pct_for_dirty().
innodb_max_dirty_pages_pct_lwm: Revert the change of the default value
that was made in MDEV-23399. The value innodb_max_dirty_pages_pct_lwm=0
guarantees that a shutdown of an idle server will be fast. Users might
be surprised if normal shutdown suddenly became slower when upgrading
within a GA release series.
innodb_checkpoint_usec: Remove. The master task will no longer perform
periodic log checkpoints. It is the duty of the page cleaner thread.
log_sys.max_modified_age: Remove. The current span of the
buf_pool.flush_list expressed in LSN only matters for adaptive
flushing (outside the 'furious flushing' condition).
For the correctness of checkpoints, the only thing that matters is
the checkpoint age (log_sys.lsn - log_sys.last_checkpoint_lsn).
This run-time constant was also reported as log_max_modified_age_sync.
log_sys.max_checkpoint_age_async: Remove. This does not serve any
purpose, because the checkpoints will now be triggered by the page
cleaner thread. We will retain the log_sys.max_checkpoint_age limit
for engaging 'furious flushing'.
page_cleaner.slot: Remove. It turns out that
page_cleaner_slot.flush_list_time was duplicating
page_cleaner.slot.flush_time and page_cleaner.slot.flush_list_pass
was duplicating page_cleaner.flush_pass.
Likewise, there were some redundant monitor counters, because the
page cleaner thread no longer performs any buf_pool.LRU flushing, and
because there only is one buf_flush_page_cleaner thread.
buf_flush_sync_lsn: Protect writes by buf_pool.flush_list_mutex.
buf_pool_t::get_oldest_modification(): Add a parameter to specify the
return value when no persistent data pages are dirty. Require the
caller to hold buf_pool.flush_list_mutex.
log_buf_pool_get_oldest_modification(): Take the fall-back LSN
as a parameter. All callers will also invoke log_sys.get_lsn().
log_preflush_pool_modified_pages(): Replaced with buf_flush_wait_flushed().
buf_flush_wait_flushed(): Implement two limits. If not enough buffer pool
has been flushed, signal the page cleaner (unless innodb_flush_sync=OFF)
and wait for the page cleaner to complete. If the page cleaner
thread is not running (which can be the case during shutdown),
initiate the flush and wait for it directly.
buf_flush_ahead(): If innodb_flush_sync=ON (the default),
submit a new buf_flush_sync_lsn target for the page cleaner
but do not wait for the flushing to finish.
log_get_capacity(), log_get_max_modified_age_async(): Remove, to make
it easier to see that af_get_pct_for_lsn() is not acquiring any mutexes.
page_cleaner_flush_pages_recommendation(): Protect all access to
buf_pool.flush_list with buf_pool.flush_list_mutex. Previously there
were some race conditions in the calculation.
buf_flush_sync_for_checkpoint(): New function to process
buf_flush_sync_lsn in the page cleaner thread. At the end of
each batch, we try to wake up any blocked buf_flush_wait_flushed().
If everything up to buf_flush_sync_lsn has been flushed, we will
reset buf_flush_sync_lsn=0. The page cleaner thread will keep
'furious flushing' until the limit is reached. Any threads that
are waiting in buf_flush_wait_flushed() will be able to resume
as soon as their own limit has been satisfied.
buf_flush_page_cleaner: Prioritize buf_flush_sync_lsn and do not
sleep as long as it is set. Do not update any page_cleaner statistics
for this special mode of operation. In the normal mode
(buf_flush_sync_lsn is not set for innodb_flush_sync=ON),
try to wake up once per second. No longer check whether
srv_inc_activity_count() has been called. After each batch,
try to perform a log checkpoint, because the best chances for
the checkpoint LSN to advance by the maximum amount are upon
completing a flushing batch.
log_t: Move buf_free, max_buf_free possibly to the same cache line
with log_sys.mutex.
log_margin_checkpoint_age(): Simplify the logic, and replace
a 0.1-second sleep with a call to buf_flush_wait_flushed() to
initiate flushing. Moved to the same compilation unit
with the only caller.
log_close(): Clean up the calculations. (Should be no functional
change.) Return whether flush-ahead is needed. Moved to the same
compilation unit with the only caller.
mtr_t::finish_write(): Return whether flush-ahead is needed.
mtr_t::commit(): Invoke buf_flush_ahead() when needed. Let us avoid
external calls in mtr_t::commit() and make the logic easier to follow
by having related code in a single compilation unit. Also, we will
invoke srv_stats.log_write_requests.inc() only once per
mini-transaction commit, while not holding mutexes.
log_checkpoint_margin(): Only care about log_sys.max_checkpoint_age.
Upon reaching log_sys.max_checkpoint_age where we must wait to prevent
the log from getting corrupted, let us wait for at most 1MiB of LSN
at a time, before rechecking the condition. This should allow writers
to proceed even if the redo log capacity has been reached and
'furious flushing' is in progress. We no longer care about
log_sys.max_modified_age_sync or log_sys.max_modified_age_async.
The log_sys.max_modified_age_sync could be a relic from the time when
there was a srv_master_thread that wrote dirty pages to data files.
Also, we no longer have any log_sys.max_checkpoint_age_async limit,
because log checkpoints will now be triggered by the page cleaner
thread upon completing buf_flush_lists().
log_set_capacity(): Simplify the calculations of the limit
(no functional change).
log_checkpoint_low(): Split from log_checkpoint(). Moved to the
same compilation unit with the caller.
log_make_checkpoint(): Only wait for everything to be flushed until
the current LSN.
create_log_file(): After checkpoint, invoke log_write_up_to()
to ensure that the FILE_CHECKPOINT record has been written.
This avoids ut_ad(!srv_log_file_created) in create_log_file_rename().
srv_start(): Do not call recv_recovery_from_checkpoint_start()
if the log has just been created. Set fil_system.space_id_reuse_warned
before dict_boot() has been executed, and clear it after recovery
has finished.
dict_boot(): Initialize fil_system.max_assigned_id.
srv_check_activity(): Remove. The activity count is counting transaction
commits and therefore mostly interesting for the purge of history.
BtrBulk::insert(): Do not explicitly wake up the page cleaner,
but do invoke srv_inc_activity_count(), because that counter is
still being used in buf_load_throttle_if_needed() for some
heuristics. (It might be cleaner to execute buf_load() in the
page cleaner thread!)
Reviewed by: Vladislav Vaintroub
2020-10-26 16:35:47 +02:00
|
|
|
if (space->is_stopping()) {
|
2020-10-26 16:04:12 +02:00
|
|
|
space->release();
|
MDEV-23855: Improve InnoDB log checkpoint performance
After MDEV-15053, MDEV-22871, MDEV-23399 shifted the scalability
bottleneck, log checkpoints became a new bottleneck.
If innodb_io_capacity is set low or innodb_max_dirty_pct_lwm is
set high and the workload fits in the buffer pool, the page cleaner
thread will perform very little flushing. When we reach the capacity
of the circular redo log file ib_logfile0 and must initiate a checkpoint,
some 'furious flushing' will be necessary. (If innodb_flush_sync=OFF,
then flushing would continue at the innodb_io_capacity rate, and
writers would be throttled.)
We have the best chance of advancing the checkpoint LSN immediately
after a page flush batch has been completed. Hence, it is best to
perform checkpoints after every batch in the page cleaner thread,
attempting to run once per second.
By initiating high-priority flushing in the page cleaner as early
as possible, we aim to make the throughput more stable.
The function buf_flush_wait_flushed() used to sleep for 10ms, hoping
that the page cleaner thread would do something during that time.
The observed end result was that a large number of threads that call
log_free_check() would end up sleeping while nothing useful is happening.
We will revise the design so that in the default innodb_flush_sync=ON
mode, buf_flush_wait_flushed() will wake up the page cleaner thread
to perform the necessary flushing, and it will wait for a signal from
the page cleaner thread.
If innodb_io_capacity is set to a low value (causing the page cleaner to
throttle its work), a write workload would initially perform well, until
the capacity of the circular ib_logfile0 is reached and log_free_check()
will trigger checkpoints. At that point, the extra waiting in
buf_flush_wait_flushed() will start reducing throughput.
The page cleaner thread will also initiate log checkpoints after each
buf_flush_lists() call, because that is the best point of time for
the checkpoint LSN to advance by the maximum amount.
Even in 'furious flushing' mode we invoke buf_flush_lists() with
innodb_io_capacity_max pages at a time, and at the start of each
batch (in the log_flush() callback function that runs in a separate
task) we will invoke os_aio_wait_until_no_pending_writes(). This
tweak allows the checkpoint to advance in smaller steps and
significantly reduces the maximum latency. On an Intel Optane 960
NVMe SSD on Linux, it reduced from 4.6 seconds to 74 milliseconds.
On Microsoft Windows with a slower SSD, it reduced from more than
180 seconds to 0.6 seconds.
We will make innodb_adaptive_flushing=OFF simply flush innodb_io_capacity
per second whenever the dirty proportion of buffer pool pages exceeds
innodb_max_dirty_pages_pct_lwm. For innodb_adaptive_flushing=ON we try
to make page_cleaner_flush_pages_recommendation() more consistent and
predictable: if we are below innodb_adaptive_flushing_lwm, let us flush
pages according to the return value of af_get_pct_for_dirty().
innodb_max_dirty_pages_pct_lwm: Revert the change of the default value
that was made in MDEV-23399. The value innodb_max_dirty_pages_pct_lwm=0
guarantees that a shutdown of an idle server will be fast. Users might
be surprised if normal shutdown suddenly became slower when upgrading
within a GA release series.
innodb_checkpoint_usec: Remove. The master task will no longer perform
periodic log checkpoints. It is the duty of the page cleaner thread.
log_sys.max_modified_age: Remove. The current span of the
buf_pool.flush_list expressed in LSN only matters for adaptive
flushing (outside the 'furious flushing' condition).
For the correctness of checkpoints, the only thing that matters is
the checkpoint age (log_sys.lsn - log_sys.last_checkpoint_lsn).
This run-time constant was also reported as log_max_modified_age_sync.
log_sys.max_checkpoint_age_async: Remove. This does not serve any
purpose, because the checkpoints will now be triggered by the page
cleaner thread. We will retain the log_sys.max_checkpoint_age limit
for engaging 'furious flushing'.
page_cleaner.slot: Remove. It turns out that
page_cleaner_slot.flush_list_time was duplicating
page_cleaner.slot.flush_time and page_cleaner.slot.flush_list_pass
was duplicating page_cleaner.flush_pass.
Likewise, there were some redundant monitor counters, because the
page cleaner thread no longer performs any buf_pool.LRU flushing, and
because there only is one buf_flush_page_cleaner thread.
buf_flush_sync_lsn: Protect writes by buf_pool.flush_list_mutex.
buf_pool_t::get_oldest_modification(): Add a parameter to specify the
return value when no persistent data pages are dirty. Require the
caller to hold buf_pool.flush_list_mutex.
log_buf_pool_get_oldest_modification(): Take the fall-back LSN
as a parameter. All callers will also invoke log_sys.get_lsn().
log_preflush_pool_modified_pages(): Replaced with buf_flush_wait_flushed().
buf_flush_wait_flushed(): Implement two limits. If not enough buffer pool
has been flushed, signal the page cleaner (unless innodb_flush_sync=OFF)
and wait for the page cleaner to complete. If the page cleaner
thread is not running (which can be the case during shutdown),
initiate the flush and wait for it directly.
buf_flush_ahead(): If innodb_flush_sync=ON (the default),
submit a new buf_flush_sync_lsn target for the page cleaner
but do not wait for the flushing to finish.
log_get_capacity(), log_get_max_modified_age_async(): Remove, to make
it easier to see that af_get_pct_for_lsn() is not acquiring any mutexes.
page_cleaner_flush_pages_recommendation(): Protect all access to
buf_pool.flush_list with buf_pool.flush_list_mutex. Previously there
were some race conditions in the calculation.
buf_flush_sync_for_checkpoint(): New function to process
buf_flush_sync_lsn in the page cleaner thread. At the end of
each batch, we try to wake up any blocked buf_flush_wait_flushed().
If everything up to buf_flush_sync_lsn has been flushed, we will
reset buf_flush_sync_lsn=0. The page cleaner thread will keep
'furious flushing' until the limit is reached. Any threads that
are waiting in buf_flush_wait_flushed() will be able to resume
as soon as their own limit has been satisfied.
buf_flush_page_cleaner: Prioritize buf_flush_sync_lsn and do not
sleep as long as it is set. Do not update any page_cleaner statistics
for this special mode of operation. In the normal mode
(buf_flush_sync_lsn is not set for innodb_flush_sync=ON),
try to wake up once per second. No longer check whether
srv_inc_activity_count() has been called. After each batch,
try to perform a log checkpoint, because the best chances for
the checkpoint LSN to advance by the maximum amount are upon
completing a flushing batch.
log_t: Move buf_free, max_buf_free possibly to the same cache line
with log_sys.mutex.
log_margin_checkpoint_age(): Simplify the logic, and replace
a 0.1-second sleep with a call to buf_flush_wait_flushed() to
initiate flushing. Moved to the same compilation unit
with the only caller.
log_close(): Clean up the calculations. (Should be no functional
change.) Return whether flush-ahead is needed. Moved to the same
compilation unit with the only caller.
mtr_t::finish_write(): Return whether flush-ahead is needed.
mtr_t::commit(): Invoke buf_flush_ahead() when needed. Let us avoid
external calls in mtr_t::commit() and make the logic easier to follow
by having related code in a single compilation unit. Also, we will
invoke srv_stats.log_write_requests.inc() only once per
mini-transaction commit, while not holding mutexes.
log_checkpoint_margin(): Only care about log_sys.max_checkpoint_age.
Upon reaching log_sys.max_checkpoint_age where we must wait to prevent
the log from getting corrupted, let us wait for at most 1MiB of LSN
at a time, before rechecking the condition. This should allow writers
to proceed even if the redo log capacity has been reached and
'furious flushing' is in progress. We no longer care about
log_sys.max_modified_age_sync or log_sys.max_modified_age_async.
The log_sys.max_modified_age_sync could be a relic from the time when
there was a srv_master_thread that wrote dirty pages to data files.
Also, we no longer have any log_sys.max_checkpoint_age_async limit,
because log checkpoints will now be triggered by the page cleaner
thread upon completing buf_flush_lists().
log_set_capacity(): Simplify the calculations of the limit
(no functional change).
log_checkpoint_low(): Split from log_checkpoint(). Moved to the
same compilation unit with the caller.
log_make_checkpoint(): Only wait for everything to be flushed until
the current LSN.
create_log_file(): After checkpoint, invoke log_write_up_to()
to ensure that the FILE_CHECKPOINT record has been written.
This avoids ut_ad(!srv_log_file_created) in create_log_file_rename().
srv_start(): Do not call recv_recovery_from_checkpoint_start()
if the log has just been created. Set fil_system.space_id_reuse_warned
before dict_boot() has been executed, and clear it after recovery
has finished.
dict_boot(): Initialize fil_system.max_assigned_id.
srv_check_activity(): Remove. The activity count is counting transaction
commits and therefore mostly interesting for the purge of history.
BtrBulk::insert(): Do not explicitly wake up the page cleaner,
but do invoke srv_inc_activity_count(), because that counter is
still being used in buf_load_throttle_if_needed() for some
heuristics. (It might be cleaner to execute buf_load() in the
page cleaner thread!)
Reviewed by: Vladislav Vaintroub
2020-10-26 16:35:47 +02:00
|
|
|
space = nullptr;
|
2016-08-12 11:17:45 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-10-26 16:04:12 +02:00
|
|
|
space->reacquire();
|
2021-09-06 10:14:24 +03:00
|
|
|
buf_read_page_background(space, dump[i], zip_size);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (buf_load_abort_flag) {
|
2020-10-26 15:59:30 +02:00
|
|
|
if (space) {
|
2018-04-23 13:15:54 +03:00
|
|
|
space->release();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
ut_free(dump);
|
|
|
|
buf_load_status(
|
2016-08-12 11:17:45 +03:00
|
|
|
STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Buffer pool(s) load aborted on request");
|
2016-08-12 11:17:45 +03:00
|
|
|
/* Premature end, set estimated = completed = i and
|
|
|
|
end the current stage event. */
|
2020-02-15 18:25:57 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
mysql_stage_set_work_estimated(pfs_stage_progress, i);
|
2020-02-15 18:25:57 +01:00
|
|
|
mysql_stage_set_work_completed(pfs_stage_progress, i);
|
|
|
|
|
|
|
|
mysql_end_stage();
|
2012-08-01 17:27:34 +03:00
|
|
|
return;
|
|
|
|
}
|
2015-11-29 18:08:42 +11:00
|
|
|
|
2016-12-06 16:39:23 +11:00
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
if ((i+1) >= srv_buf_pool_load_pages_abort) {
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag = true;
|
2016-12-06 16:39:23 +11:00
|
|
|
}
|
|
|
|
#endif
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2020-10-26 15:59:30 +02:00
|
|
|
if (space) {
|
2018-04-23 13:15:54 +03:00
|
|
|
space->release();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
ut_free(dump);
|
|
|
|
|
2021-09-07 08:55:08 +03:00
|
|
|
if (i == dump_n) {
|
2023-04-24 09:57:58 +03:00
|
|
|
os_aio_wait_until_no_pending_reads(true);
|
2021-09-07 08:55:08 +03:00
|
|
|
}
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
ut_sprintf_timestamp(now);
|
|
|
|
|
2016-10-06 15:16:18 +02:00
|
|
|
if (i == dump_n) {
|
|
|
|
buf_load_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Buffer pool(s) load completed at %s", now);
|
2016-10-06 15:16:18 +02:00
|
|
|
export_vars.innodb_buffer_pool_load_incomplete = 0;
|
|
|
|
} else if (!buf_load_abort_flag) {
|
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load aborted due to user instigated abort at %s",
|
|
|
|
now);
|
|
|
|
/* intentionally don't reset innodb_buffer_pool_load_incomplete
|
|
|
|
as we don't want a shutdown to save the buffer pool */
|
|
|
|
} else {
|
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load aborted due to shutdown at %s",
|
|
|
|
now);
|
|
|
|
/* intentionally don't reset innodb_buffer_pool_load_incomplete
|
|
|
|
as we want to abort without saving the buffer pool */
|
|
|
|
}
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/* Make sure that estimated = completed when we end. */
|
2020-02-15 18:25:57 +01:00
|
|
|
mysql_stage_set_work_completed(pfs_stage_progress, dump_n);
|
2016-08-12 11:17:45 +03:00
|
|
|
/* End the stage progress event. */
|
2020-02-15 18:25:57 +01:00
|
|
|
mysql_end_stage();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/** Abort a currently running buffer pool load. */
|
|
|
|
void buf_load_abort()
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag= true;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************//**
|
2019-10-29 22:37:12 +01:00
|
|
|
This is the main task for buffer pool dump/load. when scheduled
|
|
|
|
either performs a dump or load, depending on server state, state of the variables etc- */
|
|
|
|
static void buf_dump_load_func(void *)
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2013-03-26 00:03:13 +02:00
|
|
|
ut_ad(!srv_read_only_mode);
|
2019-10-29 22:37:12 +01:00
|
|
|
static bool first_time = true;
|
|
|
|
if (first_time && srv_buffer_pool_load_at_startup) {
|
2017-08-31 08:27:59 +03:00
|
|
|
|
2017-03-02 17:53:16 +01:00
|
|
|
#ifdef WITH_WSREP
|
2019-01-23 15:30:00 +04:00
|
|
|
if (!get_wsrep_recovery()) {
|
2017-03-02 17:53:16 +01:00
|
|
|
#endif /* WITH_WSREP */
|
2017-08-31 08:27:59 +03:00
|
|
|
buf_load();
|
2017-03-02 17:53:16 +01:00
|
|
|
#ifdef WITH_WSREP
|
2017-08-31 08:27:59 +03:00
|
|
|
}
|
2017-03-02 17:53:16 +01:00
|
|
|
#endif /* WITH_WSREP */
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
2019-10-29 22:37:12 +01:00
|
|
|
first_time = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
while (!SHUTTING_DOWN()) {
|
|
|
|
if (buf_dump_should_start) {
|
2017-02-17 10:32:21 +02:00
|
|
|
buf_dump_should_start = false;
|
2020-02-12 14:45:21 +02:00
|
|
|
buf_dump(true);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
if (buf_load_should_start) {
|
2017-02-17 10:32:21 +02:00
|
|
|
buf_load_should_start = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
buf_load();
|
|
|
|
}
|
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
if (!buf_dump_should_start && !buf_load_should_start) {
|
|
|
|
return;
|
2017-02-17 10:32:21 +02:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
/* In shutdown */
|
2012-08-01 17:27:34 +03:00
|
|
|
if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
|
2016-10-06 15:16:18 +02:00
|
|
|
if (export_vars.innodb_buffer_pool_load_incomplete) {
|
|
|
|
buf_dump_status(STATUS_INFO,
|
|
|
|
"Dumping of buffer pool not started"
|
|
|
|
" as load was incomplete");
|
2017-03-02 17:53:16 +01:00
|
|
|
#ifdef WITH_WSREP
|
2019-01-23 15:30:00 +04:00
|
|
|
} else if (get_wsrep_recovery()) {
|
2017-03-02 17:53:16 +01:00
|
|
|
#endif /* WITH_WSREP */
|
2016-10-06 15:16:18 +02:00
|
|
|
} else {
|
2020-02-12 14:45:21 +02:00
|
|
|
buf_dump(false/* do complete dump at shutdown */);
|
2017-03-02 17:53:16 +01:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
2019-10-29 22:37:12 +01:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/* Execute task with max.concurrency */
|
|
|
|
static tpool::task_group tpool_group(1);
|
2019-10-29 22:37:12 +01:00
|
|
|
static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
|
|
|
|
static bool load_dump_enabled;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/
|
|
|
|
void buf_load_at_startup()
|
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
load_dump_enabled= true;
|
|
|
|
if (srv_buffer_pool_load_at_startup)
|
|
|
|
buf_do_load_dump();
|
2019-10-29 22:37:12 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void buf_do_load_dump()
|
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
if (load_dump_enabled && !buf_dump_load_task.is_running())
|
|
|
|
srv_thread_pool->submit_task(&buf_dump_load_task);
|
2019-10-29 22:37:12 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/** Wait for currently running load/dumps to finish*/
|
|
|
|
void buf_load_dump_end()
|
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
ut_ad(SHUTTING_DOWN());
|
|
|
|
buf_dump_load_task.wait();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|