2012-08-01 17:27:34 +03:00
|
|
|
/*****************************************************************************
|
|
|
|
|
2017-01-06 19:48:54 +05:30
|
|
|
Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
|
2021-09-06 10:14:24 +03:00
|
|
|
Copyright (c) 2017, 2021, MariaDB Corporation.
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
|
|
Foundation; version 2 of the License.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
2019-05-11 19:25:02 +03:00
|
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
/**************************************************//**
|
|
|
|
@file buf/buf0dump.cc
|
|
|
|
Implements a buffer pool dump/load.
|
|
|
|
|
|
|
|
Created April 08, 2011 Vasil Dimov
|
|
|
|
*******************************************************/
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "my_global.h"
|
2020-04-28 19:39:40 +03:00
|
|
|
#include "mysqld.h"
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "my_sys.h"
|
|
|
|
|
|
|
|
#include "mysql/psi/mysql_stage.h"
|
|
|
|
#include "mysql/psi/psi.h"
|
2013-03-26 00:03:13 +02:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "buf0buf.h"
|
2012-08-01 17:27:34 +03:00
|
|
|
#include "buf0dump.h"
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "dict0dict.h"
|
|
|
|
#include "os0file.h"
|
|
|
|
#include "os0thread.h"
|
|
|
|
#include "srv0srv.h"
|
|
|
|
#include "srv0start.h"
|
|
|
|
#include "sync0rw.h"
|
|
|
|
#include "ut0byte.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2017-08-31 08:27:59 +03:00
|
|
|
#include "mysql/service_wsrep.h" /* wsrep_recovery */
|
2017-11-30 13:37:59 +11:00
|
|
|
#include <my_service_manager.h>
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
/* Forward declaration: buf_dump_start() and buf_load_start() below call this
right after raising their respective request flag. */
static void buf_do_load_dump();
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
/** Severity of a dump/load status message. buf_dump_status() and
buf_load_status() use it to pick the logger the message is sent to. */
enum status_severity {
	STATUS_INFO,	/*!< message is emitted through ib::info() */
	STATUS_ERR	/*!< message is emitted through ib::error() */
};
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** @return whether srv_shutdown_state is anything other than
SRV_SHUTDOWN_NONE */
#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE)
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
/* Flags that tell the buffer pool dump/load thread which action it should
take after being woken up. */
|
2017-02-17 10:32:21 +02:00
|
|
|
/* Set to true by buf_dump_start() just before it calls buf_do_load_dump(). */
static volatile bool buf_dump_should_start;
|
|
|
|
/* Set to true by buf_load_start() just before it calls buf_do_load_dump(). */
static volatile bool buf_load_should_start;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/* NOTE(review): judging by the name, this presumably asks a running buffer
pool load to stop; confirm against the code that reads and writes it. */
static bool buf_load_abort_flag;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/** Start the buffer pool dump/load task and instructs it to start a dump. */
|
|
|
|
void buf_dump_start()
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_dump_should_start= true;
|
|
|
|
buf_do_load_dump();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/** Start the buffer pool dump/load task and instructs it to start a load. */
|
|
|
|
void buf_load_start()
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_should_start= true;
|
|
|
|
buf_do_load_dump();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************//**
|
|
|
|
Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
|
|
|
|
to the specified string. The format and the following parameters are the
|
|
|
|
same as the ones used for printf(3). The value of this variable can be
|
|
|
|
retrieved by:
|
|
|
|
SELECT variable_value FROM information_schema.global_status WHERE
|
|
|
|
variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
|
|
|
|
or by:
|
|
|
|
SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
|
2016-06-21 14:21:03 +02:00
|
|
|
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
|
2012-08-01 17:27:34 +03:00
|
|
|
void
|
|
|
|
buf_dump_status(
|
|
|
|
/*============*/
|
|
|
|
enum status_severity severity,/*!< in: status severity */
|
|
|
|
const char* fmt, /*!< in: format */
|
|
|
|
...) /*!< in: extra parameters according
|
|
|
|
to fmt */
|
|
|
|
{
|
|
|
|
va_list ap;
|
|
|
|
|
|
|
|
va_start(ap, fmt);
|
|
|
|
|
2017-11-13 04:32:56 +02:00
|
|
|
vsnprintf(
|
2012-08-01 17:27:34 +03:00
|
|
|
export_vars.innodb_buffer_pool_dump_status,
|
|
|
|
sizeof(export_vars.innodb_buffer_pool_dump_status),
|
|
|
|
fmt, ap);
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
switch (severity) {
|
|
|
|
case STATUS_INFO:
|
|
|
|
ib::info() << export_vars.innodb_buffer_pool_dump_status;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case STATUS_ERR:
|
|
|
|
ib::error() << export_vars.innodb_buffer_pool_dump_status;
|
|
|
|
break;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
va_end(ap);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************//**
|
|
|
|
Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
|
|
|
|
to the specified string. The format and the following parameters are the
|
|
|
|
same as the ones used for printf(3). The value of this variable can be
|
|
|
|
retrieved by:
|
|
|
|
SELECT variable_value FROM information_schema.global_status WHERE
|
|
|
|
variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
|
|
|
|
or by:
|
|
|
|
SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
|
2016-06-21 14:21:03 +02:00
|
|
|
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
|
2012-08-01 17:27:34 +03:00
|
|
|
void
|
|
|
|
buf_load_status(
|
|
|
|
/*============*/
|
|
|
|
enum status_severity severity,/*!< in: status severity */
|
|
|
|
const char* fmt, /*!< in: format */
|
|
|
|
...) /*!< in: extra parameters according to fmt */
|
|
|
|
{
|
|
|
|
va_list ap;
|
|
|
|
|
|
|
|
va_start(ap, fmt);
|
|
|
|
|
2017-11-13 04:32:56 +02:00
|
|
|
vsnprintf(
|
2012-08-01 17:27:34 +03:00
|
|
|
export_vars.innodb_buffer_pool_load_status,
|
|
|
|
sizeof(export_vars.innodb_buffer_pool_load_status),
|
|
|
|
fmt, ap);
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
switch (severity) {
|
|
|
|
case STATUS_INFO:
|
|
|
|
ib::info() << export_vars.innodb_buffer_pool_load_status;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case STATUS_ERR:
|
|
|
|
ib::error() << export_vars.innodb_buffer_pool_load_status;
|
|
|
|
break;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
va_end(ap);
|
|
|
|
}
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** Returns the directory path where the buffer pool dump file will be created.
|
|
|
|
@return directory path */
|
|
|
|
static
|
|
|
|
const char*
|
|
|
|
get_buf_dump_dir()
|
|
|
|
{
|
|
|
|
const char* dump_dir;
|
|
|
|
|
|
|
|
/* The dump file should be created in the default data directory if
|
|
|
|
innodb_data_home_dir is set as an empty string. */
|
2020-04-28 14:51:25 +03:00
|
|
|
if (!*srv_data_home) {
|
2016-09-06 09:43:16 +03:00
|
|
|
dump_dir = fil_path_to_mysql_datadir;
|
|
|
|
} else {
|
|
|
|
dump_dir = srv_data_home;
|
|
|
|
}
|
|
|
|
|
|
|
|
return(dump_dir);
|
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Generate the path to the buffer pool dump/load file.
|
|
|
|
@param[out] path generated path
|
|
|
|
@param[in] path_size size of 'path', used as in snprintf(3). */
|
2020-04-28 19:39:40 +03:00
|
|
|
static void buf_dump_generate_path(char *path, size_t path_size)
|
2016-02-16 12:07:18 +01:00
|
|
|
{
|
2016-08-12 11:17:45 +03:00
|
|
|
char buf[FN_REFLEN];
|
|
|
|
|
2020-04-28 19:39:40 +03:00
|
|
|
mysql_mutex_lock(&LOCK_global_system_variables);
|
2017-11-11 23:07:24 +02:00
|
|
|
snprintf(buf, sizeof(buf), "%s%c%s", get_buf_dump_dir(),
|
|
|
|
OS_PATH_SEPARATOR, srv_buf_dump_filename);
|
2020-04-28 19:39:40 +03:00
|
|
|
mysql_mutex_unlock(&LOCK_global_system_variables);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
os_file_type_t type;
|
|
|
|
bool exists = false;
|
|
|
|
bool ret;
|
|
|
|
|
|
|
|
ret = os_file_status(buf, &exists, &type);
|
|
|
|
|
|
|
|
/* For realpath() to succeed the file must exist. */
|
|
|
|
|
|
|
|
if (ret && exists) {
|
|
|
|
/* my_realpath() assumes the destination buffer is big enough
|
|
|
|
to hold FN_REFLEN bytes. */
|
|
|
|
ut_a(path_size >= FN_REFLEN);
|
2016-02-16 12:07:18 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
my_realpath(path, buf, 0);
|
2016-02-16 12:07:18 +01:00
|
|
|
} else {
|
2016-08-12 11:17:45 +03:00
|
|
|
/* If it does not exist, then resolve only srv_data_home
|
|
|
|
and append srv_buf_dump_filename to it. */
|
|
|
|
char srv_data_home_full[FN_REFLEN];
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
my_realpath(srv_data_home_full, get_buf_dump_dir(), 0);
|
2016-02-16 12:07:18 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
if (srv_data_home_full[strlen(srv_data_home_full) - 1]
|
|
|
|
== OS_PATH_SEPARATOR) {
|
|
|
|
|
2017-11-11 23:07:24 +02:00
|
|
|
snprintf(path, path_size, "%s%s",
|
|
|
|
srv_data_home_full,
|
|
|
|
srv_buf_dump_filename);
|
2016-08-12 11:17:45 +03:00
|
|
|
} else {
|
2017-11-11 23:07:24 +02:00
|
|
|
snprintf(path, path_size, "%s%c%s",
|
|
|
|
srv_data_home_full,
|
|
|
|
OS_PATH_SEPARATOR,
|
|
|
|
srv_buf_dump_filename);
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
}
|
2016-02-16 12:07:18 +01:00
|
|
|
}
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
/*****************************************************************//**
|
|
|
|
Perform a buffer pool dump into the file specified by
|
|
|
|
innodb_buffer_pool_filename. If any errors occur then the value of
|
|
|
|
innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
|
|
|
|
The dump filename can be specified by (relative to srv_data_home):
|
|
|
|
SET GLOBAL innodb_buffer_pool_filename='filename'; */
|
|
|
|
static
|
|
|
|
void
|
|
|
|
buf_dump(
|
|
|
|
/*=====*/
|
|
|
|
ibool obey_shutdown) /*!< in: quit if we are in a shutting down
|
|
|
|
state */
|
|
|
|
{
|
|
|
|
#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown)
|
|
|
|
|
|
|
|
char full_filename[OS_FILE_MAX_PATH];
|
2018-07-30 14:05:24 +03:00
|
|
|
char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"];
|
2012-08-01 17:27:34 +03:00
|
|
|
char now[32];
|
|
|
|
FILE* f;
|
|
|
|
int ret;
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_generate_path(full_filename, sizeof(full_filename));
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2017-11-11 23:07:24 +02:00
|
|
|
snprintf(tmp_filename, sizeof(tmp_filename),
|
|
|
|
"%s.incomplete", full_filename);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
|
2012-08-01 17:27:34 +03:00
|
|
|
full_filename);
|
|
|
|
|
2018-03-14 13:31:28 +11:00
|
|
|
#if defined(__GLIBC__) || defined(__WIN__) || O_CLOEXEC == 0
|
2018-03-02 10:16:46 +11:00
|
|
|
f = fopen(tmp_filename, "w" STR_O_CLOEXEC);
|
2018-03-14 13:31:28 +11:00
|
|
|
#else
|
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640);
|
|
|
|
if (fd >= 0) {
|
|
|
|
f = fdopen(fd, "w");
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
f = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2012-08-01 17:27:34 +03:00
|
|
|
if (f == NULL) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot open '%s' for writing: %s",
|
|
|
|
tmp_filename, strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
2020-02-12 14:45:21 +02:00
|
|
|
const buf_page_t* bpage;
|
2020-05-27 09:00:52 +03:00
|
|
|
page_id_t* dump;
|
2020-02-12 14:45:21 +02:00
|
|
|
ulint n_pages;
|
|
|
|
ulint j;
|
|
|
|
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took" message
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t with native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_lock(&buf_pool.mutex);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-03-18 21:48:00 +02:00
|
|
|
n_pages = UT_LIST_GET_LEN(buf_pool.LRU);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
/* skip empty buffer pools */
|
|
|
|
if (n_pages == 0) {
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took" message
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t with native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_unlock(&buf_pool.mutex);
|
2020-02-12 14:45:21 +02:00
|
|
|
goto done;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
if (srv_buf_pool_dump_pct != 100) {
|
|
|
|
ulint t_pages;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
/* limit the number of total pages dumped to X% of the
|
|
|
|
total number of pages */
|
2020-03-18 21:48:00 +02:00
|
|
|
t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100;
|
2020-02-12 14:45:21 +02:00
|
|
|
if (n_pages > t_pages) {
|
|
|
|
buf_dump_status(STATUS_INFO,
|
|
|
|
"Restricted to " ULINTPF
|
|
|
|
" pages due to "
|
|
|
|
"innodb_buf_pool_dump_pct=%lu",
|
|
|
|
t_pages, srv_buf_pool_dump_pct);
|
|
|
|
n_pages = t_pages;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (n_pages == 0) {
|
2020-02-12 14:45:21 +02:00
|
|
|
n_pages = 1;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
2020-02-12 14:45:21 +02:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-05-27 09:00:52 +03:00
|
|
|
dump = static_cast<page_id_t*>(ut_malloc_nokey(
|
|
|
|
n_pages * sizeof(*dump)));
|
2015-11-29 18:08:42 +11:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
if (dump == NULL) {
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took"
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_unlock(&buf_pool.mutex);
|
2020-02-12 14:45:21 +02:00
|
|
|
fclose(f);
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot allocate " ULINTPF " bytes: %s",
|
|
|
|
(ulint) (n_pages * sizeof(*dump)),
|
|
|
|
strerror(errno));
|
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-03-18 21:48:00 +02:00
|
|
|
for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
|
2020-02-12 14:45:21 +02:00
|
|
|
bpage != NULL && j < n_pages;
|
|
|
|
bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
|
|
|
|
|
MDEV-15053 Reduce buf_pool_t::mutex contention
User-visible changes: The INFORMATION_SCHEMA views INNODB_BUFFER_PAGE
and INNODB_BUFFER_PAGE_LRU will report a dummy value FLUSH_TYPE=0
and will no longer report the PAGE_STATE value READY_FOR_USE.
We will remove some fields from buf_page_t and move much code to
member functions of buf_pool_t and buf_page_t, so that the access
rules of data members can be enforced consistently.
Evicting or adding pages in buf_pool.LRU will remain covered by
buf_pool.mutex.
Evicting or adding pages in buf_pool.page_hash will remain
covered by both buf_pool.mutex and the buf_pool.page_hash X-latch.
After this fix, buf_pool.page_hash lookups can entirely
avoid acquiring buf_pool.mutex, only relying on
buf_pool.hash_lock_get() S-latch.
Similarly, buf_flush_check_neighbors() will rely solely on
buf_pool.mutex, no buf_pool.page_hash latch at all.
The buf_pool.mutex is rather contended in I/O heavy benchmarks,
especially when the workload does not fit in the buffer pool.
The first attempt to alleviate the contention was the
buf_pool_t::mutex split in
commit 4ed7082eefe56b3e97e0edefb3df76dd7ef5e858
which introduced buf_block_t::mutex, which we are now removing.
Later, multiple instances of buf_pool_t were introduced
in commit c18084f71b02ea707c6461353e6cfc15d7553bc6
and recently removed by us in
commit 1a6f708ec594ac0ae2dd30db926ab07b100fa24b (MDEV-15058).
UNIV_BUF_DEBUG: Remove. This option to enable some buffer pool
related debugging in otherwise non-debug builds has not been used
for years. Instead, we have been using UNIV_DEBUG, which is enabled
in CMAKE_BUILD_TYPE=Debug.
buf_block_t::mutex, buf_pool_t::zip_mutex: Remove. We can mainly rely on
std::atomic and the buf_pool.page_hash latches, and in some cases
depend on buf_pool.mutex or buf_pool.flush_list_mutex just like before.
We must always release buf_block_t::lock before invoking
unfix() or io_unfix(), to prevent a glitch where a block that was
added to the buf_pool.free list would appear X-latched. See
commit c5883debd6ef440a037011c11873b396923e93c5 how this glitch
was finally caught in a debug environment.
We move some buf_pool_t::page_hash specific code from the
ha and hash modules to buf_pool, for improved readability.
buf_pool_t::close(): Assert that all blocks are clean, except
on aborted startup or crash-like shutdown.
buf_pool_t::validate(): No longer attempt to validate
n_flush[] against the number of BUF_IO_WRITE fixed blocks,
because buf_page_t::flush_type no longer exists.
buf_pool_t::watch_set(): Replaces buf_pool_watch_set().
Reduce mutex contention by separating the buf_pool.watch[]
allocation and the insert into buf_pool.page_hash.
buf_pool_t::page_hash_lock<bool exclusive>(): Acquire a
buf_pool.page_hash latch.
Replaces and extends buf_page_hash_lock_s_confirm()
and buf_page_hash_lock_x_confirm().
buf_pool_t::READ_AHEAD_PAGES: Renamed from BUF_READ_AHEAD_PAGES.
buf_pool_t::curr_size, old_size, read_ahead_area, n_pend_reads:
Use Atomic_counter.
buf_pool_t::running_out(): Replaces buf_LRU_buf_pool_running_out().
buf_pool_t::LRU_remove(): Remove a block from the LRU list
and return its predecessor. Incorporates buf_LRU_adjust_hp(),
which was removed.
buf_page_get_gen(): Remove a redundant call of fsp_is_system_temporary(),
for mode == BUF_GET_IF_IN_POOL_OR_WATCH, which is only used by
BTR_DELETE_OP (purge), which is never invoked on temporary tables.
buf_free_from_unzip_LRU_list_batch(): Avoid redundant assignments.
buf_LRU_free_from_unzip_LRU_list(): Simplify the loop condition.
buf_LRU_free_page(): Clarify the function comment.
buf_flush_check_neighbor(), buf_flush_check_neighbors():
Rewrite the construction of the page hash range. We will hold
the buf_pool.mutex for up to buf_pool.read_ahead_area (at most 64)
consecutive lookups of buf_pool.page_hash.
buf_flush_page_and_try_neighbors(): Remove.
Merge to its only callers, and remove redundant operations in
buf_flush_LRU_list_batch().
buf_read_ahead_random(), buf_read_ahead_linear(): Rewrite.
Do not acquire buf_pool.mutex, and iterate directly with page_id_t.
ut_2_power_up(): Remove. my_round_up_to_next_power() is inlined
and avoids any loops.
fil_page_get_prev(), fil_page_get_next(), fil_addr_is_null(): Remove.
buf_flush_page(): Add a fil_space_t* parameter. Minimize the
buf_pool.mutex hold time. buf_pool.n_flush[] is no longer updated
atomically with the io_fix, and we will protect most buf_block_t
fields with buf_block_t::lock. The function
buf_flush_write_block_low() is removed and merged here.
buf_page_init_for_read(): Use static linkage. Initialize the newly
allocated block and acquire the exclusive buf_block_t::lock while not
holding any mutex.
IORequest::IORequest(): Remove the body. We only need to invoke
set_punch_hole() in buf_flush_page() and nowhere else.
buf_page_t::flush_type: Remove. Replaced by IORequest::flush_type.
This field is only used during a fil_io() call.
That function already takes IORequest as a parameter, so we had
better introduce for the rarely changing field.
buf_block_t::init(): Replaces buf_page_init().
buf_page_t::init(): Replaces buf_page_init_low().
buf_block_t::initialise(): Initialise many fields, but
keep the buf_page_t::state(). Both buf_pool_t::validate() and
buf_page_optimistic_get() require that buf_page_t::in_file()
be protected atomically with buf_page_t::in_page_hash
and buf_page_t::in_LRU_list.
buf_page_optimistic_get(): Now that buf_block_t::mutex
no longer exists, we must check buf_page_t::io_fix()
after acquiring the buf_pool.page_hash lock, to detect
whether buf_page_init_for_read() has been initiated.
We will also check the io_fix() before acquiring hash_lock
in order to avoid unnecessary computation.
The field buf_block_t::modify_clock (protected by buf_block_t::lock)
allows buf_page_optimistic_get() to validate the block.
buf_page_t::real_size: Remove. It was only used while flushing
pages of page_compressed tables.
buf_page_encrypt(): Add an output parameter that allows us to eliminate
buf_page_t::real_size. Replace a condition with debug assertion.
buf_page_should_punch_hole(): Remove.
buf_dblwr_t::add_to_batch(): Replaces buf_dblwr_add_to_batch().
Add the parameter size (to replace buf_page_t::real_size).
buf_dblwr_t::write_single_page(): Replaces buf_dblwr_write_single_page().
Add the parameter size (to replace buf_page_t::real_size).
fil_system_t::detach(): Replaces fil_space_detach().
Ensure that fil_validate() will not be violated even if
fil_system.mutex is released and reacquired.
fil_node_t::complete_io(): Renamed from fil_node_complete_io().
fil_node_t::close_to_free(): Replaces fil_node_close_to_free().
Avoid invoking fil_node_t::close() because fil_system.n_open
has already been decremented in fil_space_t::detach().
BUF_BLOCK_READY_FOR_USE: Remove. Directly use BUF_BLOCK_MEMORY.
BUF_BLOCK_ZIP_DIRTY: Remove. Directly use BUF_BLOCK_ZIP_PAGE,
and distinguish dirty pages by buf_page_t::oldest_modification().
BUF_BLOCK_POOL_WATCH: Remove. Use BUF_BLOCK_NOT_USED instead.
This state was only being used for buf_page_t that are in
buf_pool.watch.
buf_pool_t::watch[]: Remove pointer indirection.
buf_page_t::in_flush_list: Remove. It was set if and only if
buf_page_t::oldest_modification() is nonzero.
buf_page_decrypt_after_read(), buf_corrupt_page_release(),
buf_page_check_corrupt(): Change the const fil_space_t* parameter
to const fil_node_t& so that we can report the correct file name.
buf_page_monitor(): Declare as an ATTRIBUTE_COLD global function.
buf_page_io_complete(): Split to buf_page_read_complete() and
buf_page_write_complete().
buf_dblwr_t::in_use: Remove.
buf_dblwr_t::buf_block_array: Add IORequest::flush_t.
buf_dblwr_sync_datafiles(): Remove. It was a useless wrapper of
os_aio_wait_until_no_pending_writes().
buf_flush_write_complete(): Declare static, not global.
Add the parameter IORequest::flush_t.
buf_flush_freed_page(): Simplify the code.
recv_sys_t::flush_lru: Renamed from flush_type and changed to bool.
fil_read(), fil_write(): Replaced with direct use of fil_io().
fil_buffering_disabled(): Remove. Check srv_file_flush_method directly.
fil_mutex_enter_and_prepare_for_io(): Return the resolved
fil_space_t* to avoid a duplicated lookup in the caller.
fil_report_invalid_page_access(): Clean up the parameters.
fil_io(): Return fil_io_t, which comprises fil_node_t and error code.
Always invoke fil_space_t::acquire_for_io() and let either the
sync=true caller or fil_aio_callback() invoke
fil_space_t::release_for_io().
fil_aio_callback(): Rewrite to replace buf_page_io_complete().
fil_check_pending_operations(): Remove a parameter, and remove some
redundant lookups.
fil_node_close_to_free(): Wait for n_pending==0. Because we no longer
do an extra lookup of the tablespace between fil_io() and the
completion of the operation, we must give fil_node_t::complete_io() a
chance to decrement the counter.
fil_close_tablespace(): Remove unused parameter trx, and document
that this is only invoked during the error handling of IMPORT TABLESPACE.
row_import_discard_changes(): Merged with the only caller,
row_import_cleanup(). Do not lock up the data dictionary while
invoking fil_close_tablespace().
logs_empty_and_mark_files_at_shutdown(): Do not invoke
fil_close_all_files(), to avoid a !needs_flush assertion failure
on fil_node_t::close().
innodb_shutdown(): Invoke os_aio_free() before fil_close_all_files().
fil_close_all_files(): Invoke fil_flush_file_spaces()
to ensure proper durability.
thread_pool::unbind(): Fix a crash that would occur on Windows
after srv_thread_pool->disable_aio() and os_file_close().
This fix was submitted by Vladislav Vaintroub.
Thanks to Matthias Leich and Axel Schwenke for extensive testing,
Vladislav Vaintroub for helpful comments, and Eugene Kosov for a review.
2020-06-05 12:35:46 +03:00
|
|
|
ut_a(bpage->in_file());
|
|
|
|
const page_id_t id(bpage->id());
|
2020-02-12 14:45:21 +02:00
|
|
|
|
2020-05-27 09:00:52 +03:00
|
|
|
if (id.space() == SRV_TMP_SPACE_ID) {
|
2020-02-12 14:45:21 +02:00
|
|
|
/* Ignore the innodb_temporary tablespace. */
|
|
|
|
continue;
|
2015-11-29 18:08:42 +11:00
|
|
|
}
|
|
|
|
|
2021-01-27 16:24:37 +05:30
|
|
|
if (bpage->status == buf_page_t::FREED) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-05-27 09:00:52 +03:00
|
|
|
dump[j++] = id;
|
2020-02-12 14:45:21 +02:00
|
|
|
}
|
|
|
|
|
MDEV-23399: Performance regression with write workloads
The buffer pool refactoring in MDEV-15053 and MDEV-22871 shifted
the performance bottleneck to the page flushing.
The configuration parameters will be changed as follows:
innodb_lru_flush_size=32 (new: how many pages to flush on LRU eviction)
innodb_lru_scan_depth=1536 (old: 1024)
innodb_max_dirty_pages_pct=90 (old: 75)
innodb_max_dirty_pages_pct_lwm=75 (old: 0)
Note: The parameter innodb_lru_scan_depth will only affect LRU
eviction of buffer pool pages when a new page is being allocated. The
page cleaner thread will no longer evict any pages. It used to
guarantee that some pages will remain free in the buffer pool. Now, we
perform that eviction 'on demand' in buf_LRU_get_free_block().
The parameter innodb_lru_scan_depth(srv_LRU_scan_depth) is used as follows:
* When the buffer pool is being shrunk in buf_pool_t::withdraw_blocks()
* As a buf_pool.free limit in buf_LRU_list_batch() for terminating
the flushing that is initiated e.g., by buf_LRU_get_free_block()
The parameter also used to serve as an initial limit for unzip_LRU
eviction (evicting uncompressed page frames while retaining
ROW_FORMAT=COMPRESSED pages), but now we will use a hard-coded limit
of 100 or unlimited for invoking buf_LRU_scan_and_free_block().
The status variables will be changed as follows:
innodb_buffer_pool_pages_flushed: This includes also the count of
innodb_buffer_pool_pages_LRU_flushed and should work reliably,
updated one by one in buf_flush_page() to give more real-time
statistics. The function buf_flush_stats(), which we are removing,
was not called in every code path. For both counters, we will use
regular variables that are incremented in a critical section of
buf_pool.mutex. Note that show_innodb_vars() directly links to the
variables, and reads of the counters will *not* be protected by
buf_pool.mutex, so you cannot get a consistent snapshot of both variables.
The following INFORMATION_SCHEMA.INNODB_METRICS counters will be
removed, because the page cleaner no longer deals with writing or
evicting least recently used pages, and because the single-page writes
have been removed:
* buffer_LRU_batch_flush_avg_time_slot
* buffer_LRU_batch_flush_avg_time_thread
* buffer_LRU_batch_flush_avg_time_est
* buffer_LRU_batch_flush_avg_pass
* buffer_LRU_single_flush_scanned
* buffer_LRU_single_flush_num_scan
* buffer_LRU_single_flush_scanned_per_call
When moving to a single buffer pool instance in MDEV-15058, we missed
some opportunity to simplify the buf_flush_page_cleaner thread. It was
unnecessarily using a mutex and some complex data structures, even
though we always have a single page cleaner thread.
Furthermore, the buf_flush_page_cleaner thread had separate 'recovery'
and 'shutdown' modes where it was waiting to be triggered by some
other thread, adding unnecessary latency and potential for hangs in
relatively rarely executed startup or shutdown code.
The page cleaner was also running two kinds of batches in an
interleaved fashion: "LRU flush" (writing out some least recently used
pages and evicting them on write completion) and the normal batches
that aim to increase the MIN(oldest_modification) in the buffer pool,
to help the log checkpoint advance.
The buf_pool.flush_list flushing was being blocked by
buf_block_t::lock for no good reason. Furthermore, if the FIL_PAGE_LSN
of a page is ahead of log_sys.get_flushed_lsn(), that is, what has
been persistently written to the redo log, we would trigger a log
flush and then resume the page flushing. This would unnecessarily
limit the performance of the page cleaner thread and trigger the
infamous messages "InnoDB: page_cleaner: 1000ms intended loop took 4450ms.
The settings might not be optimal" that were suppressed in
commit d1ab89037a518fcffbc50c24e4bd94e4ec33aed0 unless log_warnings>2.
Our revised algorithm will make log_sys.get_flushed_lsn() advance at
the start of buf_flush_lists(), and then execute a 'best effort' to
write out all pages. The flush batches will skip pages that were modified
since the log was written, or are currently exclusively locked.
The MDEV-13670 message "page_cleaner: 1000ms intended loop took"
will be removed, because by design, the buf_flush_page_cleaner() should
not be blocked during a batch for extended periods of time.
We will remove the single-page flushing altogether. Related to this,
the debug parameter innodb_doublewrite_batch_size will be removed,
because all of the doublewrite buffer will be used for flushing
batches. If a page needs to be evicted from the buffer pool and all
100 least recently used pages in the buffer pool have unflushed
changes, buf_LRU_get_free_block() will execute buf_flush_lists() to
write out and evict innodb_lru_flush_size pages. At most one thread
will execute buf_flush_lists() in buf_LRU_get_free_block(); other
threads will wait for that LRU flushing batch to finish.
To improve concurrency, we will replace the InnoDB ib_mutex_t and
os_event_t native mutexes and condition variables in this area of code.
Most notably, this means that the buffer pool mutex (buf_pool.mutex)
is no longer instrumented via any InnoDB interfaces. It will continue
to be instrumented via PERFORMANCE_SCHEMA.
For now, both buf_pool.flush_list_mutex and buf_pool.mutex will be
declared with MY_MUTEX_INIT_FAST (PTHREAD_MUTEX_ADAPTIVE_NP). The critical
sections of buf_pool.flush_list_mutex should be shorter than those for
buf_pool.mutex, because in the worst case, they cover a linear scan of
buf_pool.flush_list, while the worst case of a critical section of
buf_pool.mutex covers a linear scan of the potentially much longer
buf_pool.LRU list.
mysql_mutex_is_owner(), safe_mutex_is_owner(): New predicate, usable
with SAFE_MUTEX. Some InnoDB debug assertions need this predicate
instead of mysql_mutex_assert_owner() or mysql_mutex_assert_not_owner().
buf_pool_t::n_flush_LRU, buf_pool_t::n_flush_list:
Replaces buf_pool_t::init_flush[] and buf_pool_t::n_flush[].
The number of active flush operations.
buf_pool_t::mutex, buf_pool_t::flush_list_mutex: Use mysql_mutex_t
instead of ib_mutex_t, to have native mutexes with PERFORMANCE_SCHEMA
and SAFE_MUTEX instrumentation.
buf_pool_t::done_flush_LRU: Condition variable for !n_flush_LRU.
buf_pool_t::done_flush_list: Condition variable for !n_flush_list.
buf_pool_t::do_flush_list: Condition variable to wake up the
buf_flush_page_cleaner when a log checkpoint needs to be written
or the server is being shut down. Replaces buf_flush_event.
We will keep using timed waits (the page cleaner thread will wake
_at least_ once per second), because the calculations for
innodb_adaptive_flushing depend on fixed time intervals.
buf_dblwr: Allocate statically, and move all code to member functions.
Use a native mutex and condition variable. Remove code to deal with
single-page flushing.
buf_dblwr_check_block(): Make the check debug-only. We were spending
a significant amount of execution time in page_simple_validate_new().
flush_counters_t::unzip_LRU_evicted: Remove.
IORequest: Make more members const. FIXME: m_fil_node should be removed.
buf_flush_sync_lsn: Protect by std::atomic, not page_cleaner.mutex
(which we are removing).
page_cleaner_slot_t, page_cleaner_t: Remove many redundant members.
pc_request_flush_slot(): Replaces pc_request() and pc_flush_slot().
recv_writer_thread: Remove. Recovery works just fine without it, if we
simply invoke buf_flush_sync() at the end of each batch in
recv_sys_t::apply().
recv_recovery_from_checkpoint_finish(): Remove. We can simply call
recv_sys.debug_free() directly.
srv_started_redo: Replaces srv_start_state.
SRV_SHUTDOWN_FLUSH_PHASE: Remove. logs_empty_and_mark_files_at_shutdown()
can communicate with the normal page cleaner loop via the new function
flush_buffer_pool().
buf_flush_remove(): Assert that the calling thread is holding
buf_pool.flush_list_mutex. This removes unnecessary mutex operations
from buf_flush_remove_pages() and buf_flush_dirty_pages(),
which replace buf_LRU_flush_or_remove_pages().
buf_flush_lists(): Renamed from buf_flush_batch(), with simplified
interface. Return the number of flushed pages. Clarified comments and
renamed min_n to max_n. Identify LRU batch by lsn=0. Merge all the functions
buf_flush_start(), buf_flush_batch(), buf_flush_end() directly to this
function, which was their only caller, and remove 2 unnecessary
buf_pool.mutex release/re-acquisition that we used to perform around
the buf_flush_batch() call. At the start, if not all log has been
durably written, wait for a background task to do it, or start a new
task to do it. This allows the log write to run concurrently with our
page flushing batch. Any pages that were skipped due to too recent
FIL_PAGE_LSN or due to them being latched by a writer should be flushed
during the next batch, unless there are further modifications to those
pages. It is possible that a page that we must flush due to small
oldest_modification also carries a recent FIL_PAGE_LSN or is being
constantly modified. In the worst case, all writers would then end up
waiting in log_free_check() to allow the flushing and the checkpoint
to complete.
buf_do_flush_list_batch(): Clarify comments, and rename min_n to max_n.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_flush_space(): Auxiliary function to look up a tablespace for
page flushing.
buf_flush_page(): Defer the computation of space->full_crc32(). Never
call log_write_up_to(), but instead skip persistent pages whose latest
modification (FIL_PAGE_LSN) is newer than the redo log. Also skip
pages on which we cannot acquire a shared latch without waiting.
buf_flush_try_neighbors(): Do not bother checking buf_fix_count
because buf_flush_page() will no longer wait for the page latch.
Take the tablespace as a parameter, and only execute this function
when innodb_flush_neighbors>0. Avoid repeated calls of page_id_t::fold().
buf_flush_relocate_on_flush_list(): Declare as cold, and push down
a condition from the callers.
buf_flush_check_neighbor(): Take id.fold() as a parameter.
buf_flush_sync(): Ensure that the buf_pool.flush_list is empty,
because the flushing batch will skip pages whose modifications have
not yet been written to the log or were latched for modification.
buf_free_from_unzip_LRU_list_batch(): Remove redundant local variables.
buf_flush_LRU_list_batch(): Let the caller buf_do_LRU_batch() initialize
the counters, and report n->evicted.
Cache the last looked up tablespace. If neighbor flushing is not applicable,
invoke buf_flush_page() directly, avoiding a page lookup in between.
buf_do_LRU_batch(): Return the number of pages flushed.
buf_LRU_free_page(): Only release and re-acquire buf_pool.mutex if
adaptive hash index entries are pointing to the block.
buf_LRU_get_free_block(): Do not wake up the page cleaner, because it
will no longer perform any useful work for us, and we do not want it
to compete for I/O while buf_flush_lists(innodb_lru_flush_size, 0)
writes out and evicts at most innodb_lru_flush_size pages. (The
function buf_do_LRU_batch() may complete after writing fewer pages if
more than innodb_lru_scan_depth pages end up in buf_pool.free list.)
Eliminate some mutex release-acquire cycles, and wait for the LRU
flush batch to complete before rescanning.
buf_LRU_check_size_of_non_data_objects(): Simplify the code.
buf_page_write_complete(): Remove the parameter evict, and always
evict pages that were part of an LRU flush.
buf_page_create(): Take a pre-allocated page as a parameter.
buf_pool_t::free_block(): Free a pre-allocated block.
recv_sys_t::recover_low(), recv_sys_t::apply(): Preallocate the block
while not holding recv_sys.mutex. During page allocation, we may
initiate a page flush, which in turn may initiate a log flush, which
would require acquiring log_sys.mutex, which should always be acquired
before recv_sys.mutex in order to avoid deadlocks. Therefore, we must
not be holding recv_sys.mutex while allocating a buffer pool block.
BtrBulk::logFreeCheck(): Skip a redundant condition.
row_undo_step(): Do not invoke srv_inc_activity_count() for every row
that is being rolled back. It should suffice to invoke the function in
trx_flush_log_if_needed() during trx_t::commit_in_memory() when the
rollback completes.
sync_check_enable(): Remove. We will enable innodb_sync_debug from the
very beginning.
Reviewed by: Vladislav Vaintroub
2020-10-15 12:10:42 +03:00
|
|
|
mysql_mutex_unlock(&buf_pool.mutex);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
ut_a(j <= n_pages);
|
|
|
|
n_pages = j;
|
|
|
|
|
|
|
|
for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
|
2020-05-27 09:00:52 +03:00
|
|
|
ret = fprintf(f, "%u,%u\n",
|
|
|
|
dump[j].space(), dump[j].page_no());
|
2020-02-12 14:45:21 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
ut_free(dump);
|
2012-08-01 17:27:34 +03:00
|
|
|
fclose(f);
|
|
|
|
buf_dump_status(STATUS_ERR,
|
2020-02-12 14:45:21 +02:00
|
|
|
"Cannot write to '%s': %s",
|
|
|
|
tmp_filename, strerror(errno));
|
2012-08-01 17:27:34 +03:00
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
2020-02-12 14:45:21 +02:00
|
|
|
if (SHUTTING_DOWN() && !(j & 1023)) {
|
|
|
|
service_manager_extend_timeout(
|
|
|
|
INNODB_EXTEND_TIMEOUT_INTERVAL,
|
|
|
|
"Dumping buffer pool page "
|
|
|
|
ULINTPF "/" ULINTPF, j + 1, n_pages);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
ut_free(dump);
|
|
|
|
|
|
|
|
done:
|
2012-08-01 17:27:34 +03:00
|
|
|
ret = fclose(f);
|
|
|
|
if (ret != 0) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot close '%s': %s",
|
|
|
|
tmp_filename, strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
ret = unlink(full_filename);
|
|
|
|
if (ret != 0 && errno != ENOENT) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot delete '%s': %s",
|
|
|
|
full_filename, strerror(errno));
|
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
ret = rename(tmp_filename, full_filename);
|
|
|
|
if (ret != 0) {
|
|
|
|
buf_dump_status(STATUS_ERR,
|
|
|
|
"Cannot rename '%s' to '%s': %s",
|
|
|
|
tmp_filename, full_filename,
|
|
|
|
strerror(errno));
|
|
|
|
/* leave tmp_filename to exist */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
/* success */
|
|
|
|
|
|
|
|
ut_sprintf_timestamp(now);
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Buffer pool(s) dump completed at %s", now);
|
2016-10-06 15:16:18 +02:00
|
|
|
|
|
|
|
/* Though dumping isn't related to an incomplete load,
|
|
|
|
we reset this to 0 here to indicate that a shutdown can also perform
|
|
|
|
a dump */
|
|
|
|
export_vars.innodb_buffer_pool_load_incomplete = 0;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2015-11-29 18:08:42 +11:00
|
|
|
/*****************************************************************//**
|
|
|
|
Artificially delay the buffer pool loading if necessary. The idea of
|
|
|
|
this function is to prevent hogging the server with IO and slowing down
|
|
|
|
too much normal client queries. */
|
|
|
|
UNIV_INLINE
|
|
|
|
void
|
|
|
|
buf_load_throttle_if_needed(
|
|
|
|
/*========================*/
|
2016-08-12 11:17:45 +03:00
|
|
|
ulint* last_check_time, /*!< in/out: milliseconds since epoch
|
2015-11-29 18:08:42 +11:00
|
|
|
of the last time we did check if
|
|
|
|
throttling is needed, we do the check
|
|
|
|
every srv_io_capacity IO ops. */
|
|
|
|
ulint* last_activity_count,
|
|
|
|
ulint n_io) /*!< in: number of IO ops done since
|
|
|
|
buffer pool load has started */
|
|
|
|
{
|
|
|
|
if (n_io % srv_io_capacity < srv_io_capacity - 1) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*last_check_time == 0 || *last_activity_count == 0) {
|
|
|
|
*last_check_time = ut_time_ms();
|
|
|
|
*last_activity_count = srv_get_activity_count();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* srv_io_capacity IO operations have been performed by buffer pool
|
|
|
|
load since the last time we were here. */
|
|
|
|
|
|
|
|
/* If no other activity, then keep going without any delay. */
|
|
|
|
if (srv_get_activity_count() == *last_activity_count) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* There has been other activity, throttle. */
|
|
|
|
|
|
|
|
ulint now = ut_time_ms();
|
|
|
|
ulint elapsed_time = now - *last_check_time;
|
|
|
|
|
|
|
|
/* Notice that elapsed_time is not the time for the last
|
|
|
|
srv_io_capacity IO operations performed by BP load. It is the
|
|
|
|
time elapsed since the last time we detected that there has been
|
|
|
|
other activity. This has a small and acceptable deficiency, e.g.:
|
|
|
|
1. BP load runs and there is no other activity.
|
|
|
|
2. Other activity occurs, we run N IO operations after that and
|
|
|
|
enter here (where 0 <= N < srv_io_capacity).
|
|
|
|
3. last_check_time is very old and we do not sleep at this time, but
|
|
|
|
only update last_check_time and last_activity_count.
|
|
|
|
4. We run srv_io_capacity more IO operations and call this function
|
|
|
|
again.
|
|
|
|
5. There has been more other activity and thus we enter here.
|
|
|
|
6. Now last_check_time is recent and we sleep if necessary to prevent
|
|
|
|
more than srv_io_capacity IO operations per second.
|
|
|
|
The deficiency is that we could have slept at 3., but for this we
|
|
|
|
would have to update last_check_time before the
|
|
|
|
"cur_activity_count == *last_activity_count" check and calling
|
|
|
|
ut_time_ms() that often may turn out to be too expensive. */
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
if (elapsed_time < 1000 /* 1 sec (1000 milli secs) */) {
|
2015-11-29 18:08:42 +11:00
|
|
|
os_thread_sleep((1000 - elapsed_time) * 1000 /* micro secs */);
|
|
|
|
}
|
|
|
|
|
|
|
|
*last_check_time = ut_time_ms();
|
|
|
|
*last_activity_count = srv_get_activity_count();
|
|
|
|
}
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
/*****************************************************************//**
|
|
|
|
Perform a buffer pool load from the file specified by
|
|
|
|
innodb_buffer_pool_filename. If any errors occur then the value of
|
|
|
|
innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
|
|
|
|
The dump filename can be specified by (relative to srv_data_home):
|
|
|
|
SET GLOBAL innodb_buffer_pool_filename='filename'; */
|
|
|
|
static
|
|
|
|
void
|
|
|
|
buf_load()
|
|
|
|
/*======*/
|
|
|
|
{
|
|
|
|
char full_filename[OS_FILE_MAX_PATH];
|
|
|
|
char now[32];
|
|
|
|
FILE* f;
|
2020-05-27 09:00:52 +03:00
|
|
|
page_id_t* dump;
|
2012-08-01 17:27:34 +03:00
|
|
|
ulint dump_n;
|
|
|
|
ulint i;
|
2020-10-15 16:28:19 +03:00
|
|
|
uint32_t space_id;
|
|
|
|
uint32_t page_no;
|
2012-08-01 17:27:34 +03:00
|
|
|
int fscanf_ret;
|
|
|
|
|
|
|
|
/* Ignore any leftovers from before */
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_dump_generate_path(full_filename, sizeof(full_filename));
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_load_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Loading buffer pool(s) from %s", full_filename);
|
|
|
|
|
2018-03-02 10:16:46 +11:00
|
|
|
f = fopen(full_filename, "r" STR_O_CLOEXEC);
|
2012-08-01 17:27:34 +03:00
|
|
|
if (f == NULL) {
|
2017-04-04 12:19:42 +03:00
|
|
|
buf_load_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Cannot open '%s' for reading: %s",
|
|
|
|
full_filename, strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
/* First scan the file to estimate how many entries are in it.
|
|
|
|
This file is tiny (approx 500KB per 1GB buffer pool), reading it
|
|
|
|
two times is fine. */
|
|
|
|
dump_n = 0;
|
2020-10-15 16:28:19 +03:00
|
|
|
while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
|
2012-08-01 17:27:34 +03:00
|
|
|
&& !SHUTTING_DOWN()) {
|
|
|
|
dump_n++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!SHUTTING_DOWN() && !feof(f)) {
|
|
|
|
/* fscanf() returned != 2 */
|
|
|
|
const char* what;
|
|
|
|
if (ferror(f)) {
|
|
|
|
what = "reading";
|
|
|
|
} else {
|
|
|
|
what = "parsing";
|
|
|
|
}
|
|
|
|
fclose(f);
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_load_status(STATUS_ERR, "Error %s '%s',"
|
|
|
|
" unable to load buffer pool (stage 1)",
|
2012-08-01 17:27:34 +03:00
|
|
|
what, full_filename);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If dump is larger than the buffer pool(s), then we ignore the
|
|
|
|
extra trailing entries. This could happen if a dump is made, then buffer
|
2016-08-12 11:17:45 +03:00
|
|
|
pool is shrunk and then load is attempted. */
|
2020-03-18 21:48:00 +02:00
|
|
|
dump_n = std::min(dump_n, buf_pool.get_n_pages());
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2020-02-12 14:45:21 +02:00
|
|
|
if (dump_n != 0) {
|
2020-05-27 09:00:52 +03:00
|
|
|
dump = static_cast<page_id_t*>(ut_malloc_nokey(
|
2016-09-06 09:43:16 +03:00
|
|
|
dump_n * sizeof(*dump)));
|
|
|
|
} else {
|
|
|
|
fclose(f);
|
|
|
|
ut_sprintf_timestamp(now);
|
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load completed at %s"
|
|
|
|
" (%s was empty)", now, full_filename);
|
|
|
|
return;
|
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (dump == NULL) {
|
|
|
|
fclose(f);
|
|
|
|
buf_load_status(STATUS_ERR,
|
2017-06-06 11:50:42 +03:00
|
|
|
"Cannot allocate " ULINTPF " bytes: %s",
|
|
|
|
dump_n * sizeof(*dump),
|
2012-08-01 17:27:34 +03:00
|
|
|
strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
rewind(f);
|
|
|
|
|
2016-10-06 15:16:18 +02:00
|
|
|
export_vars.innodb_buffer_pool_load_incomplete = 1;
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
|
2020-10-15 16:28:19 +03:00
|
|
|
fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (fscanf_ret != 2) {
|
|
|
|
if (feof(f)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* else */
|
|
|
|
|
|
|
|
ut_free(dump);
|
|
|
|
fclose(f);
|
|
|
|
buf_load_status(STATUS_ERR,
|
2016-08-12 11:17:45 +03:00
|
|
|
"Error parsing '%s', unable"
|
|
|
|
" to load buffer pool (stage 2)",
|
2012-08-01 17:27:34 +03:00
|
|
|
full_filename);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
|
|
|
|
ut_free(dump);
|
|
|
|
fclose(f);
|
|
|
|
buf_load_status(STATUS_ERR,
|
2016-08-12 11:17:45 +03:00
|
|
|
"Error parsing '%s': bogus"
|
2020-10-15 16:28:19 +03:00
|
|
|
" space,page %u,%u at line " ULINTPF
|
|
|
|
", unable to load buffer pool",
|
2012-08-01 17:27:34 +03:00
|
|
|
full_filename,
|
|
|
|
space_id, page_no,
|
|
|
|
i);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-05-27 09:00:52 +03:00
|
|
|
dump[i] = page_id_t(space_id, page_no);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Set dump_n to the actual number of initialized elements,
|
|
|
|
i could be smaller than dump_n here if the file got truncated after
|
|
|
|
we read it the first time. */
|
|
|
|
dump_n = i;
|
|
|
|
|
|
|
|
fclose(f);
|
|
|
|
|
|
|
|
if (dump_n == 0) {
|
|
|
|
ut_free(dump);
|
|
|
|
ut_sprintf_timestamp(now);
|
2016-08-12 11:17:45 +03:00
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load completed at %s"
|
2016-12-06 16:39:23 +11:00
|
|
|
" (%s was empty or had errors)", now, full_filename);
|
2012-08-01 17:27:34 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!SHUTTING_DOWN()) {
|
2016-08-12 11:17:45 +03:00
|
|
|
std::sort(dump, dump + dump_n);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
ulint last_check_time = 0;
|
|
|
|
ulint last_activity_cnt = 0;
|
|
|
|
|
2020-10-26 16:04:12 +02:00
|
|
|
/* Avoid calling the expensive fil_space_t::get() for each
|
2016-08-12 11:17:45 +03:00
|
|
|
page within the same tablespace. dump[] is sorted by (space, page),
|
|
|
|
so all pages from a given tablespace are consecutive. */
|
2020-05-27 09:00:52 +03:00
|
|
|
ulint cur_space_id = dump[0].space();
|
2020-10-26 16:04:12 +02:00
|
|
|
fil_space_t* space = fil_space_t::get(cur_space_id);
|
2019-02-06 19:50:11 +02:00
|
|
|
ulint zip_size = space ? space->zip_size() : 0;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2020-02-15 18:25:57 +01:00
|
|
|
PSI_stage_progress* pfs_stage_progress __attribute__((unused))
|
2016-08-12 11:17:45 +03:00
|
|
|
= mysql_set_stage(srv_stage_buffer_pool_load.m_key);
|
|
|
|
mysql_stage_set_work_estimated(pfs_stage_progress, dump_n);
|
|
|
|
mysql_stage_set_work_completed(pfs_stage_progress, 0);
|
2015-11-29 18:08:42 +11:00
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/* space_id for this iteration of the loop */
|
2020-05-27 09:00:52 +03:00
|
|
|
const ulint this_space_id = dump[i].space();
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2019-11-25 22:32:24 +07:00
|
|
|
if (this_space_id == SRV_TMP_SPACE_ID) {
|
2018-03-29 13:22:16 +03:00
|
|
|
/* Ignore the innodb_temporary tablespace. */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
if (this_space_id != cur_space_id) {
|
2020-10-26 15:59:30 +02:00
|
|
|
if (space) {
|
2020-10-26 16:04:12 +02:00
|
|
|
space->release();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
cur_space_id = this_space_id;
|
2020-10-26 16:04:12 +02:00
|
|
|
space = fil_space_t::get(cur_space_id);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2020-10-26 15:59:30 +02:00
|
|
|
if (!space) {
|
|
|
|
continue;
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
2020-10-26 15:59:30 +02:00
|
|
|
|
|
|
|
zip_size = space->zip_size();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* JAN: TODO: As we use background page read below,
|
|
|
|
if tablespace is encrypted we cant use it. */
|
2020-10-26 15:59:30 +02:00
|
|
|
if (!space || dump[i].page_no() >= space->get_size() ||
|
|
|
|
(space->crypt_data &&
|
|
|
|
space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
|
|
|
|
space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
|
2016-08-12 11:17:45 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
MDEV-23855: Improve InnoDB log checkpoint performance
After MDEV-15053, MDEV-22871, MDEV-23399 shifted the scalability
bottleneck, log checkpoints became a new bottleneck.
If innodb_io_capacity is set low or innodb_max_dirty_pct_lwm is
set high and the workload fits in the buffer pool, the page cleaner
thread will perform very little flushing. When we reach the capacity
of the circular redo log file ib_logfile0 and must initiate a checkpoint,
some 'furious flushing' will be necessary. (If innodb_flush_sync=OFF,
then flushing would continue at the innodb_io_capacity rate, and
writers would be throttled.)
We have the best chance of advancing the checkpoint LSN immediately
after a page flush batch has been completed. Hence, it is best to
perform checkpoints after every batch in the page cleaner thread,
attempting to run once per second.
By initiating high-priority flushing in the page cleaner as early
as possible, we aim to make the throughput more stable.
The function buf_flush_wait_flushed() used to sleep for 10ms, hoping
that the page cleaner thread would do something during that time.
The observed end result was that a large number of threads that call
log_free_check() would end up sleeping while nothing useful is happening.
We will revise the design so that in the default innodb_flush_sync=ON
mode, buf_flush_wait_flushed() will wake up the page cleaner thread
to perform the necessary flushing, and it will wait for a signal from
the page cleaner thread.
If innodb_io_capacity is set to a low value (causing the page cleaner to
throttle its work), a write workload would initially perform well, until
the capacity of the circular ib_logfile0 is reached and log_free_check()
will trigger checkpoints. At that point, the extra waiting in
buf_flush_wait_flushed() will start reducing throughput.
The page cleaner thread will also initiate log checkpoints after each
buf_flush_lists() call, because that is the best point of time for
the checkpoint LSN to advance by the maximum amount.
Even in 'furious flushing' mode we invoke buf_flush_lists() with
innodb_io_capacity_max pages at a time, and at the start of each
batch (in the log_flush() callback function that runs in a separate
task) we will invoke os_aio_wait_until_no_pending_writes(). This
tweak allows the checkpoint to advance in smaller steps and
significantly reduces the maximum latency. On an Intel Optane 960
NVMe SSD on Linux, it reduced from 4.6 seconds to 74 milliseconds.
On Microsoft Windows with a slower SSD, it reduced from more than
180 seconds to 0.6 seconds.
We will make innodb_adaptive_flushing=OFF simply flush innodb_io_capacity
per second whenever the dirty proportion of buffer pool pages exceeds
innodb_max_dirty_pages_pct_lwm. For innodb_adaptive_flushing=ON we try
to make page_cleaner_flush_pages_recommendation() more consistent and
predictable: if we are below innodb_adaptive_flushing_lwm, let us flush
pages according to the return value of af_get_pct_for_dirty().
innodb_max_dirty_pages_pct_lwm: Revert the change of the default value
that was made in MDEV-23399. The value innodb_max_dirty_pages_pct_lwm=0
guarantees that a shutdown of an idle server will be fast. Users might
be surprised if normal shutdown suddenly became slower when upgrading
within a GA release series.
innodb_checkpoint_usec: Remove. The master task will no longer perform
periodic log checkpoints. It is the duty of the page cleaner thread.
log_sys.max_modified_age: Remove. The current span of the
buf_pool.flush_list expressed in LSN only matters for adaptive
flushing (outside the 'furious flushing' condition).
For the correctness of checkpoints, the only thing that matters is
the checkpoint age (log_sys.lsn - log_sys.last_checkpoint_lsn).
This run-time constant was also reported as log_max_modified_age_sync.
log_sys.max_checkpoint_age_async: Remove. This does not serve any
purpose, because the checkpoints will now be triggered by the page
cleaner thread. We will retain the log_sys.max_checkpoint_age limit
for engaging 'furious flushing'.
page_cleaner.slot: Remove. It turns out that
page_cleaner_slot.flush_list_time was duplicating
page_cleaner.slot.flush_time and page_cleaner.slot.flush_list_pass
was duplicating page_cleaner.flush_pass.
Likewise, there were some redundant monitor counters, because the
page cleaner thread no longer performs any buf_pool.LRU flushing, and
because there only is one buf_flush_page_cleaner thread.
buf_flush_sync_lsn: Protect writes by buf_pool.flush_list_mutex.
buf_pool_t::get_oldest_modification(): Add a parameter to specify the
return value when no persistent data pages are dirty. Require the
caller to hold buf_pool.flush_list_mutex.
log_buf_pool_get_oldest_modification(): Take the fall-back LSN
as a parameter. All callers will also invoke log_sys.get_lsn().
log_preflush_pool_modified_pages(): Replaced with buf_flush_wait_flushed().
buf_flush_wait_flushed(): Implement two limits. If not enough buffer pool
has been flushed, signal the page cleaner (unless innodb_flush_sync=OFF)
and wait for the page cleaner to complete. If the page cleaner
thread is not running (which can be the case during shutdown),
initiate the flush and wait for it directly.
buf_flush_ahead(): If innodb_flush_sync=ON (the default),
submit a new buf_flush_sync_lsn target for the page cleaner
but do not wait for the flushing to finish.
log_get_capacity(), log_get_max_modified_age_async(): Remove, to make
it easier to see that af_get_pct_for_lsn() is not acquiring any mutexes.
page_cleaner_flush_pages_recommendation(): Protect all access to
buf_pool.flush_list with buf_pool.flush_list_mutex. Previously there
were some race conditions in the calculation.
buf_flush_sync_for_checkpoint(): New function to process
buf_flush_sync_lsn in the page cleaner thread. At the end of
each batch, we try to wake up any blocked buf_flush_wait_flushed().
If everything up to buf_flush_sync_lsn has been flushed, we will
reset buf_flush_sync_lsn=0. The page cleaner thread will keep
'furious flushing' until the limit is reached. Any threads that
are waiting in buf_flush_wait_flushed() will be able to resume
as soon as their own limit has been satisfied.
buf_flush_page_cleaner: Prioritize buf_flush_sync_lsn and do not
sleep as long as it is set. Do not update any page_cleaner statistics
for this special mode of operation. In the normal mode
(buf_flush_sync_lsn is not set for innodb_flush_sync=ON),
try to wake up once per second. No longer check whether
srv_inc_activity_count() has been called. After each batch,
try to perform a log checkpoint, because the best chances for
the checkpoint LSN to advance by the maximum amount are upon
completing a flushing batch.
log_t: Move buf_free, max_buf_free possibly to the same cache line
with log_sys.mutex.
log_margin_checkpoint_age(): Simplify the logic, and replace
a 0.1-second sleep with a call to buf_flush_wait_flushed() to
initiate flushing. Moved to the same compilation unit
with the only caller.
log_close(): Clean up the calculations. (Should be no functional
change.) Return whether flush-ahead is needed. Moved to the same
compilation unit with the only caller.
mtr_t::finish_write(): Return whether flush-ahead is needed.
mtr_t::commit(): Invoke buf_flush_ahead() when needed. Let us avoid
external calls in mtr_t::commit() and make the logic easier to follow
by having related code in a single compilation unit. Also, we will
invoke srv_stats.log_write_requests.inc() only once per
mini-transaction commit, while not holding mutexes.
log_checkpoint_margin(): Only care about log_sys.max_checkpoint_age.
Upon reaching log_sys.max_checkpoint_age where we must wait to prevent
the log from getting corrupted, let us wait for at most 1MiB of LSN
at a time, before rechecking the condition. This should allow writers
to proceed even if the redo log capacity has been reached and
'furious flushing' is in progress. We no longer care about
log_sys.max_modified_age_sync or log_sys.max_modified_age_async.
The log_sys.max_modified_age_sync could be a relic from the time when
there was a srv_master_thread that wrote dirty pages to data files.
Also, we no longer have any log_sys.max_checkpoint_age_async limit,
because log checkpoints will now be triggered by the page cleaner
thread upon completing buf_flush_lists().
log_set_capacity(): Simplify the calculations of the limit
(no functional change).
log_checkpoint_low(): Split from log_checkpoint(). Moved to the
same compilation unit with the caller.
log_make_checkpoint(): Only wait for everything to be flushed until
the current LSN.
create_log_file(): After checkpoint, invoke log_write_up_to()
to ensure that the FILE_CHECKPOINT record has been written.
This avoids ut_ad(!srv_log_file_created) in create_log_file_rename().
srv_start(): Do not call recv_recovery_from_checkpoint_start()
if the log has just been created. Set fil_system.space_id_reuse_warned
before dict_boot() has been executed, and clear it after recovery
has finished.
dict_boot(): Initialize fil_system.max_assigned_id.
srv_check_activity(): Remove. The activity count is counting transaction
commits and therefore mostly interesting for the purge of history.
BtrBulk::insert(): Do not explicitly wake up the page cleaner,
but do invoke srv_inc_activity_count(), because that counter is
still being used in buf_load_throttle_if_needed() for some
heuristics. (It might be cleaner to execute buf_load() in the
page cleaner thread!)
Reviewed by: Vladislav Vaintroub
2020-10-26 16:35:47 +02:00
|
|
|
if (space->is_stopping()) {
|
2020-10-26 16:04:12 +02:00
|
|
|
space->release();
|
MDEV-23855: Improve InnoDB log checkpoint performance
After MDEV-15053, MDEV-22871, MDEV-23399 shifted the scalability
bottleneck, log checkpoints became a new bottleneck.
If innodb_io_capacity is set low or innodb_max_dirty_pct_lwm is
set high and the workload fits in the buffer pool, the page cleaner
thread will perform very little flushing. When we reach the capacity
of the circular redo log file ib_logfile0 and must initiate a checkpoint,
some 'furious flushing' will be necessary. (If innodb_flush_sync=OFF,
then flushing would continue at the innodb_io_capacity rate, and
writers would be throttled.)
We have the best chance of advancing the checkpoint LSN immediately
after a page flush batch has been completed. Hence, it is best to
perform checkpoints after every batch in the page cleaner thread,
attempting to run once per second.
By initiating high-priority flushing in the page cleaner as early
as possible, we aim to make the throughput more stable.
The function buf_flush_wait_flushed() used to sleep for 10ms, hoping
that the page cleaner thread would do something during that time.
The observed end result was that a large number of threads that call
log_free_check() would end up sleeping while nothing useful is happening.
We will revise the design so that in the default innodb_flush_sync=ON
mode, buf_flush_wait_flushed() will wake up the page cleaner thread
to perform the necessary flushing, and it will wait for a signal from
the page cleaner thread.
If innodb_io_capacity is set to a low value (causing the page cleaner to
throttle its work), a write workload would initially perform well, until
the capacity of the circular ib_logfile0 is reached and log_free_check()
will trigger checkpoints. At that point, the extra waiting in
buf_flush_wait_flushed() will start reducing throughput.
The page cleaner thread will also initiate log checkpoints after each
buf_flush_lists() call, because that is the best point of time for
the checkpoint LSN to advance by the maximum amount.
Even in 'furious flushing' mode we invoke buf_flush_lists() with
innodb_io_capacity_max pages at a time, and at the start of each
batch (in the log_flush() callback function that runs in a separate
task) we will invoke os_aio_wait_until_no_pending_writes(). This
tweak allows the checkpoint to advance in smaller steps and
significantly reduces the maximum latency. On an Intel Optane 960
NVMe SSD on Linux, it reduced from 4.6 seconds to 74 milliseconds.
On Microsoft Windows with a slower SSD, it reduced from more than
180 seconds to 0.6 seconds.
We will make innodb_adaptive_flushing=OFF simply flush innodb_io_capacity
per second whenever the dirty proportion of buffer pool pages exceeds
innodb_max_dirty_pages_pct_lwm. For innodb_adaptive_flushing=ON we try
to make page_cleaner_flush_pages_recommendation() more consistent and
predictable: if we are below innodb_adaptive_flushing_lwm, let us flush
pages according to the return value of af_get_pct_for_dirty().
innodb_max_dirty_pages_pct_lwm: Revert the change of the default value
that was made in MDEV-23399. The value innodb_max_dirty_pages_pct_lwm=0
guarantees that a shutdown of an idle server will be fast. Users might
be surprised if normal shutdown suddenly became slower when upgrading
within a GA release series.
innodb_checkpoint_usec: Remove. The master task will no longer perform
periodic log checkpoints. It is the duty of the page cleaner thread.
log_sys.max_modified_age: Remove. The current span of the
buf_pool.flush_list expressed in LSN only matters for adaptive
flushing (outside the 'furious flushing' condition).
For the correctness of checkpoints, the only thing that matters is
the checkpoint age (log_sys.lsn - log_sys.last_checkpoint_lsn).
This run-time constant was also reported as log_max_modified_age_sync.
log_sys.max_checkpoint_age_async: Remove. This does not serve any
purpose, because the checkpoints will now be triggered by the page
cleaner thread. We will retain the log_sys.max_checkpoint_age limit
for engaging 'furious flushing'.
page_cleaner.slot: Remove. It turns out that
page_cleaner_slot.flush_list_time was duplicating
page_cleaner.slot.flush_time and page_cleaner.slot.flush_list_pass
was duplicating page_cleaner.flush_pass.
Likewise, there were some redundant monitor counters, because the
page cleaner thread no longer performs any buf_pool.LRU flushing, and
because there only is one buf_flush_page_cleaner thread.
buf_flush_sync_lsn: Protect writes by buf_pool.flush_list_mutex.
buf_pool_t::get_oldest_modification(): Add a parameter to specify the
return value when no persistent data pages are dirty. Require the
caller to hold buf_pool.flush_list_mutex.
log_buf_pool_get_oldest_modification(): Take the fall-back LSN
as a parameter. All callers will also invoke log_sys.get_lsn().
log_preflush_pool_modified_pages(): Replaced with buf_flush_wait_flushed().
buf_flush_wait_flushed(): Implement two limits. If not enough buffer pool
has been flushed, signal the page cleaner (unless innodb_flush_sync=OFF)
and wait for the page cleaner to complete. If the page cleaner
thread is not running (which can be the case during shutdown),
initiate the flush and wait for it directly.
buf_flush_ahead(): If innodb_flush_sync=ON (the default),
submit a new buf_flush_sync_lsn target for the page cleaner
but do not wait for the flushing to finish.
log_get_capacity(), log_get_max_modified_age_async(): Remove, to make
it easier to see that af_get_pct_for_lsn() is not acquiring any mutexes.
page_cleaner_flush_pages_recommendation(): Protect all access to
buf_pool.flush_list with buf_pool.flush_list_mutex. Previously there
were some race conditions in the calculation.
buf_flush_sync_for_checkpoint(): New function to process
buf_flush_sync_lsn in the page cleaner thread. At the end of
each batch, we try to wake up any blocked buf_flush_wait_flushed().
If everything up to buf_flush_sync_lsn has been flushed, we will
reset buf_flush_sync_lsn=0. The page cleaner thread will keep
'furious flushing' until the limit is reached. Any threads that
are waiting in buf_flush_wait_flushed() will be able to resume
as soon as their own limit has been satisfied.
buf_flush_page_cleaner: Prioritize buf_flush_sync_lsn and do not
sleep as long as it is set. Do not update any page_cleaner statistics
for this special mode of operation. In the normal mode
(buf_flush_sync_lsn is not set for innodb_flush_sync=ON),
try to wake up once per second. No longer check whether
srv_inc_activity_count() has been called. After each batch,
try to perform a log checkpoint, because the best chances for
the checkpoint LSN to advance by the maximum amount are upon
completing a flushing batch.
log_t: Move buf_free, max_buf_free possibly to the same cache line
with log_sys.mutex.
log_margin_checkpoint_age(): Simplify the logic, and replace
a 0.1-second sleep with a call to buf_flush_wait_flushed() to
initiate flushing. Moved to the same compilation unit
with the only caller.
log_close(): Clean up the calculations. (Should be no functional
change.) Return whether flush-ahead is needed. Moved to the same
compilation unit with the only caller.
mtr_t::finish_write(): Return whether flush-ahead is needed.
mtr_t::commit(): Invoke buf_flush_ahead() when needed. Let us avoid
external calls in mtr_t::commit() and make the logic easier to follow
by having related code in a single compilation unit. Also, we will
invoke srv_stats.log_write_requests.inc() only once per
mini-transaction commit, while not holding mutexes.
log_checkpoint_margin(): Only care about log_sys.max_checkpoint_age.
Upon reaching log_sys.max_checkpoint_age where we must wait to prevent
the log from getting corrupted, let us wait for at most 1MiB of LSN
at a time, before rechecking the condition. This should allow writers
to proceed even if the redo log capacity has been reached and
'furious flushing' is in progress. We no longer care about
log_sys.max_modified_age_sync or log_sys.max_modified_age_async.
The log_sys.max_modified_age_sync could be a relic from the time when
there was a srv_master_thread that wrote dirty pages to data files.
Also, we no longer have any log_sys.max_checkpoint_age_async limit,
because log checkpoints will now be triggered by the page cleaner
thread upon completing buf_flush_lists().
log_set_capacity(): Simplify the calculations of the limit
(no functional change).
log_checkpoint_low(): Split from log_checkpoint(). Moved to the
same compilation unit with the caller.
log_make_checkpoint(): Only wait for everything to be flushed until
the current LSN.
create_log_file(): After checkpoint, invoke log_write_up_to()
to ensure that the FILE_CHECKPOINT record has been written.
This avoids ut_ad(!srv_log_file_created) in create_log_file_rename().
srv_start(): Do not call recv_recovery_from_checkpoint_start()
if the log has just been created. Set fil_system.space_id_reuse_warned
before dict_boot() has been executed, and clear it after recovery
has finished.
dict_boot(): Initialize fil_system.max_assigned_id.
srv_check_activity(): Remove. The activity count is counting transaction
commits and therefore mostly interesting for the purge of history.
BtrBulk::insert(): Do not explicitly wake up the page cleaner,
but do invoke srv_inc_activity_count(), because that counter is
still being used in buf_load_throttle_if_needed() for some
heuristics. (It might be cleaner to execute buf_load() in the
page cleaner thread!)
Reviewed by: Vladislav Vaintroub
2020-10-26 16:35:47 +02:00
|
|
|
space = nullptr;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-10-26 16:04:12 +02:00
|
|
|
space->reacquire();
|
2021-09-06 10:14:24 +03:00
|
|
|
buf_read_page_background(space, dump[i], zip_size);
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
if (buf_load_abort_flag) {
|
2020-10-26 15:59:30 +02:00
|
|
|
if (space) {
|
2020-10-26 16:04:12 +02:00
|
|
|
space->release();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
ut_free(dump);
|
|
|
|
buf_load_status(
|
2016-08-12 11:17:45 +03:00
|
|
|
STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Buffer pool(s) load aborted on request");
|
2016-08-12 11:17:45 +03:00
|
|
|
/* Premature end, set estimated = completed = i and
|
|
|
|
end the current stage event. */
|
2020-02-15 18:25:57 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
mysql_stage_set_work_estimated(pfs_stage_progress, i);
|
2020-02-15 18:25:57 +01:00
|
|
|
mysql_stage_set_work_completed(pfs_stage_progress, i);
|
|
|
|
|
|
|
|
mysql_end_stage();
|
2012-08-01 17:27:34 +03:00
|
|
|
return;
|
|
|
|
}
|
2015-11-29 18:08:42 +11:00
|
|
|
|
|
|
|
buf_load_throttle_if_needed(
|
|
|
|
&last_check_time, &last_activity_cnt, i);
|
2016-12-06 16:39:23 +11:00
|
|
|
|
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
if ((i+1) >= srv_buf_pool_load_pages_abort) {
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag = true;
|
2016-12-06 16:39:23 +11:00
|
|
|
}
|
|
|
|
#endif
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2020-10-26 15:59:30 +02:00
|
|
|
if (space) {
|
2020-10-26 16:04:12 +02:00
|
|
|
space->release();
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
2012-08-01 17:27:34 +03:00
|
|
|
ut_free(dump);
|
|
|
|
|
|
|
|
ut_sprintf_timestamp(now);
|
|
|
|
|
2016-10-06 15:16:18 +02:00
|
|
|
if (i == dump_n) {
|
|
|
|
buf_load_status(STATUS_INFO,
|
2012-08-01 17:27:34 +03:00
|
|
|
"Buffer pool(s) load completed at %s", now);
|
2016-10-06 15:16:18 +02:00
|
|
|
export_vars.innodb_buffer_pool_load_incomplete = 0;
|
|
|
|
} else if (!buf_load_abort_flag) {
|
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load aborted due to user instigated abort at %s",
|
|
|
|
now);
|
|
|
|
/* intentionally don't reset innodb_buffer_pool_load_incomplete
|
|
|
|
as we don't want a shutdown to save the buffer pool */
|
|
|
|
} else {
|
|
|
|
buf_load_status(STATUS_INFO,
|
|
|
|
"Buffer pool(s) load aborted due to shutdown at %s",
|
|
|
|
now);
|
|
|
|
/* intentionally don't reset innodb_buffer_pool_load_incomplete
|
|
|
|
as we want to abort without saving the buffer pool */
|
|
|
|
}
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/* Make sure that estimated = completed when we end. */
|
2020-02-15 18:25:57 +01:00
|
|
|
mysql_stage_set_work_completed(pfs_stage_progress, dump_n);
|
2016-08-12 11:17:45 +03:00
|
|
|
/* End the stage progress event. */
|
2020-02-15 18:25:57 +01:00
|
|
|
mysql_end_stage();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/** Abort a currently running buffer pool load. */
|
|
|
|
void buf_load_abort()
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
buf_load_abort_flag= true;
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************//**
|
2019-10-29 22:37:12 +01:00
|
|
|
This is the main task for buffer pool dump/load. when scheduled
|
|
|
|
either performs a dump or load, depending on server state, state of the variables etc- */
|
|
|
|
static void buf_dump_load_func(void *)
|
2012-08-01 17:27:34 +03:00
|
|
|
{
|
2013-03-26 00:03:13 +02:00
|
|
|
ut_ad(!srv_read_only_mode);
|
2019-10-29 22:37:12 +01:00
|
|
|
static bool first_time = true;
|
|
|
|
if (first_time && srv_buffer_pool_load_at_startup) {
|
2017-08-31 08:27:59 +03:00
|
|
|
|
2017-03-02 17:53:16 +01:00
|
|
|
#ifdef WITH_WSREP
|
2019-01-23 15:30:00 +04:00
|
|
|
if (!get_wsrep_recovery()) {
|
2017-03-02 17:53:16 +01:00
|
|
|
#endif /* WITH_WSREP */
|
2017-08-31 08:27:59 +03:00
|
|
|
buf_load();
|
2017-03-02 17:53:16 +01:00
|
|
|
#ifdef WITH_WSREP
|
2017-08-31 08:27:59 +03:00
|
|
|
}
|
2017-03-02 17:53:16 +01:00
|
|
|
#endif /* WITH_WSREP */
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
2019-10-29 22:37:12 +01:00
|
|
|
first_time = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
while (!SHUTTING_DOWN()) {
|
|
|
|
if (buf_dump_should_start) {
|
2017-02-17 10:32:21 +02:00
|
|
|
buf_dump_should_start = false;
|
2020-02-12 14:45:21 +02:00
|
|
|
buf_dump(true);
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
if (buf_load_should_start) {
|
2017-02-17 10:32:21 +02:00
|
|
|
buf_load_should_start = false;
|
2012-08-01 17:27:34 +03:00
|
|
|
buf_load();
|
|
|
|
}
|
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
if (!buf_dump_should_start && !buf_load_should_start) {
|
|
|
|
return;
|
2017-02-17 10:32:21 +02:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
/* In shutdown */
|
2012-08-01 17:27:34 +03:00
|
|
|
if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
|
2016-10-06 15:16:18 +02:00
|
|
|
if (export_vars.innodb_buffer_pool_load_incomplete) {
|
|
|
|
buf_dump_status(STATUS_INFO,
|
|
|
|
"Dumping of buffer pool not started"
|
|
|
|
" as load was incomplete");
|
2017-03-02 17:53:16 +01:00
|
|
|
#ifdef WITH_WSREP
|
2019-01-23 15:30:00 +04:00
|
|
|
} else if (get_wsrep_recovery()) {
|
2017-03-02 17:53:16 +01:00
|
|
|
#endif /* WITH_WSREP */
|
2016-10-06 15:16:18 +02:00
|
|
|
} else {
|
2020-02-12 14:45:21 +02:00
|
|
|
buf_dump(false/* do complete dump at shutdown */);
|
2017-03-02 17:53:16 +01:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|
2019-10-29 22:37:12 +01:00
|
|
|
}
|
2012-08-01 17:27:34 +03:00
|
|
|
|
|
|
|
|
2019-11-13 18:14:44 +01:00
|
|
|
/* Execute task with max.concurrency */
|
|
|
|
static tpool::task_group tpool_group(1);
|
2019-10-29 22:37:12 +01:00
|
|
|
static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
|
|
|
|
static bool load_dump_enabled;
|
2012-08-01 17:27:34 +03:00
|
|
|
|
2019-10-29 22:37:12 +01:00
|
|
|
/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/
|
|
|
|
void buf_load_at_startup()
|
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
load_dump_enabled= true;
|
|
|
|
if (srv_buffer_pool_load_at_startup)
|
|
|
|
buf_do_load_dump();
|
2019-10-29 22:37:12 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void buf_do_load_dump()
|
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
if (load_dump_enabled && !buf_dump_load_task.is_running())
|
|
|
|
srv_thread_pool->submit_task(&buf_dump_load_task);
|
2019-10-29 22:37:12 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/** Wait for currently running load/dumps to finish*/
|
|
|
|
void buf_load_dump_end()
|
|
|
|
{
|
2019-11-13 18:14:44 +01:00
|
|
|
ut_ad(SHUTTING_DOWN());
|
|
|
|
buf_dump_load_task.wait();
|
2012-08-01 17:27:34 +03:00
|
|
|
}
|