mirror of
https://github.com/MariaDB/server.git
synced 2025-02-01 19:41:47 +01:00
branches/zip rb://133
This patch introduces heuristics-based flushing of dirty pages to avoid IO bursts at checkpoint. 1) log_capacity / log_generated per second gives us the number of seconds in which ALL dirty pages need to be flushed. Based on this rough assumption we can say that n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate. 2) We use weighted averages (hard-coded to 20 seconds) of log_generation_rate to avoid resonance. 3) From the desired_flush_rate we subtract the number of pages that have been flushed due to LRU flushing. That gives us the pages that we should flush as part of flush_list cleanup. And that is the number (capped by maximum io_capacity) that we try to flush from the master thread. Knobs: ====== innodb_adaptive_flushing: boolean, global, dynamic, default TRUE. Since this heuristic is very experimental and has the potential to dramatically change the IO pattern, I think it is a good idea to leave a knob to turn it off. Approved by: Heikki
This commit is contained in:
parent
449e6af3c6
commit
43fceb74f2
7 changed files with 241 additions and 2 deletions
150
buf/buf0flu.c
150
buf/buf0flu.c
|
@ -44,6 +44,39 @@ Created 11/11/1995 Heikki Tuuri
|
|||
#include "os0file.h"
|
||||
#include "trx0sys.h"
|
||||
|
||||
/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */
/* @{ */

/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values buf_flush_stat_cur.
Not protected by any mutex. Updated by buf_flush_stat_update().
Acts as a ring buffer of the last BUF_FLUSH_STAT_N_INTERVAL
one-second samples, indexed by buf_flush_stat_arr_ind. */
static buf_flush_stat_t	buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint		buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t	buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t	buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes.
Incremented in buf_flush_batch() for BUF_FLUSH_LRU flushes and
factored out of the desired flush_list flush rate. */
static ulint buf_lru_flush_page_count = 0;

/* @} */
|
||||
|
||||
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
|
||||
/******************************************************************//**
|
||||
Validates the flush list.
|
||||
|
@ -1132,6 +1165,13 @@ flush_next:
|
|||
|
||||
srv_buf_pool_flushed += page_count;
|
||||
|
||||
/* We keep track of all flushes happening as part of LRU
|
||||
flush. When estimating the desired rate at which flush_list
|
||||
should be flushed we factor in this value. */
|
||||
if (flush_type == BUF_FLUSH_LRU) {
|
||||
buf_lru_flush_page_count += page_count;
|
||||
}
|
||||
|
||||
return(page_count);
|
||||
}
|
||||
|
||||
|
@ -1227,6 +1267,116 @@ buf_flush_free_margin(void)
|
|||
}
|
||||
}
|
||||
|
||||
/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. Called once per second
by srv_error_monitor_thread(). Maintains a ring buffer of the last
BUF_FLUSH_STAT_N_INTERVAL samples along with their running sum.
No mutex protection: the values feed heuristics only. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
	buf_flush_stat_t*	item;
	ib_uint64_t		lsn_diff;
	ib_uint64_t		lsn;
	ulint			n_flushed;

	lsn = log_get_lsn();
	if (buf_flush_stat_cur.redo == 0) {
		/* First time around. Just update the current LSN
		and return. */
		buf_flush_stat_cur.redo = lsn;
		return;
	}

	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	/* values for this interval: redo generated and pages flushed
	by LRU flushing since the start of the interval */
	lsn_diff = lsn - buf_flush_stat_cur.redo;
	n_flushed = buf_lru_flush_page_count
		    - buf_flush_stat_cur.n_flushed;

	/* add the current value and subtract the obsolete entry,
	keeping buf_flush_stat_sum equal to the sum of the most
	recent BUF_FLUSH_STAT_N_INTERVAL samples. */
	buf_flush_stat_sum.redo += lsn_diff - item->redo;
	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

	/* put current entry in the array. */
	item->redo = lsn_diff;
	item->n_flushed = n_flushed;

	/* update the index (round-robin over the ring buffer) */
	buf_flush_stat_arr_ind++;
	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

	/* reset the current entry. */
	buf_flush_stat_cur.redo = lsn;
	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}
|
||||
|
||||
/*********************************************************************
|
||||
Determines the fraction of dirty pages that need to be flushed based
|
||||
on the speed at which we generate redo log. Note that if redo log
|
||||
is generated at a significant rate without corresponding increase
|
||||
in the number of dirty pages (for example, an in-memory workload)
|
||||
it can cause IO bursts of flushing. This function implements heuristics
|
||||
to avoid this burstiness.
|
||||
@return number of dirty pages to be flushed / second */
|
||||
UNIV_INTERN
|
||||
ulint
|
||||
buf_flush_get_desired_flush_rate(void)
|
||||
/*==================================*/
|
||||
{
|
||||
ulint redo_avg;
|
||||
ulint lru_flush_avg;
|
||||
ulint n_dirty;
|
||||
ulint n_flush_req;
|
||||
lint rate;
|
||||
ib_uint64_t lsn = log_get_lsn();
|
||||
ib_uint64_t log_capacity = log_get_capacity();
|
||||
|
||||
/* log_capacity should never be zero after the initialization
|
||||
of log subsystem. */
|
||||
ut_ad(log_capacity != 0);
|
||||
|
||||
/* Get total number of dirty pages. It is OK to access
|
||||
flush_list without holding any mtex as we are using this
|
||||
only for heuristics. */
|
||||
n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);
|
||||
|
||||
/* An overflow can happen if we generate more than 2^32 bytes
|
||||
of redo in this interval i.e.: 4G of redo in 1 second. We can
|
||||
safely consider this as infinity because if we ever come close
|
||||
to 4G we'll start a synchronous flush of dirty pages. */
|
||||
/* redo_avg below is average at which redo is generated in
|
||||
past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
|
||||
interval. */
|
||||
redo_avg = buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL
|
||||
+ (lsn - buf_flush_stat_cur.redo);
|
||||
|
||||
/* An overflow can happen possibly if we flush more than 2^32
|
||||
pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
|
||||
unlikely scenario. Even when this happens it means that our
|
||||
flush rate will be off the mark. It won't affect correctness
|
||||
of any subsystem. */
|
||||
/* lru_flush_avg below is rate at which pages are flushed as
|
||||
part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
|
||||
number of pages flushed in the current interval. */
|
||||
lru_flush_avg = buf_flush_stat_sum.n_flushed
|
||||
/ BUF_FLUSH_STAT_N_INTERVAL
|
||||
+ (buf_lru_flush_page_count
|
||||
- buf_flush_stat_cur.n_flushed);
|
||||
|
||||
n_flush_req = (n_dirty * redo_avg) / log_capacity;
|
||||
|
||||
/* The number of pages that we want to flush from the flush
|
||||
list is the difference between the required rate and the
|
||||
number of pages that we are historically flushing from the
|
||||
LRU list */
|
||||
rate = n_flush_req - lru_flush_avg;
|
||||
return(rate > 0 ? (ulint) rate : 0);
|
||||
}
|
||||
|
||||
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
|
||||
/******************************************************************//**
|
||||
Validates the flush list.
|
||||
|
|
|
@ -9687,6 +9687,11 @@ static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
|
|||
"Percentage of dirty pages allowed in bufferpool.",
|
||||
NULL, NULL, 75, 0, 99, 0);
|
||||
|
||||
/* innodb_adaptive_flushing: boolean, global, dynamic, default TRUE.
When enabled, the master thread uses buf_flush_get_desired_flush_rate()
heuristics to spread out flushing of dirty pages and avoid IO bursts
at checkpoints. */
static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
  PLUGIN_VAR_NOCMDARG,
  "Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
  NULL, NULL, TRUE);
|
||||
|
||||
static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
|
||||
PLUGIN_VAR_RQCMDARG,
|
||||
"Desired maximum length of the purge queue (0 = no limit)",
|
||||
|
@ -9886,6 +9891,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
|
|||
MYSQL_SYSVAR(log_files_in_group),
|
||||
MYSQL_SYSVAR(log_group_home_dir),
|
||||
MYSQL_SYSVAR(max_dirty_pages_pct),
|
||||
MYSQL_SYSVAR(adaptive_flushing),
|
||||
MYSQL_SYSVAR(max_purge_lag),
|
||||
MYSQL_SYSVAR(mirrored_log_groups),
|
||||
MYSQL_SYSVAR(open_files),
|
||||
|
|
|
@ -127,6 +127,44 @@ buf_flush_ready_for_replace(
|
|||
/*========================*/
|
||||
buf_page_t* bpage); /*!< in: buffer control block, must be
|
||||
buf_page_in_file(bpage) and in the LRU list */
|
||||
|
||||
/** @brief Statistics for selecting flush rate based on redo log
generation speed.

These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */

struct buf_flush_stat_struct
{
	ib_uint64_t	redo;		/**< amount of redo generated. */
	ulint		n_flushed;	/**< number of pages flushed. */
};

/** Statistics for selecting flush rate of dirty pages. */
typedef struct buf_flush_stat_struct buf_flush_stat_t;
|
||||
/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Called once per second; see BUF_FLUSH_STAT_N_INTERVAL. */
UNIV_INTERN
void
buf_flush_stat_update(void);
/*=======================*/
/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void);
/*==================================*/
|
||||
|
||||
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
|
||||
/******************************************************************//**
|
||||
Validates the flush list.
|
||||
|
|
|
@ -169,6 +169,14 @@ UNIV_INLINE
|
|||
ib_uint64_t
|
||||
log_get_lsn(void);
|
||||
/*=============*/
|
||||
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
NOTE(review): presumably fixed once the log subsystem is
initialized -- confirm against log_group_capacity's writers.
@return log group capacity */
UNIV_INLINE
ib_uint64_t
log_get_capacity(void);
/*==================*/
|
||||
/******************************************************//**
|
||||
Initializes the log. */
|
||||
UNIV_INTERN
|
||||
|
|
|
@ -385,6 +385,18 @@ log_get_lsn(void)
|
|||
return(lsn);
|
||||
}
|
||||
|
||||
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
@return log group capacity */
UNIV_INLINE
ib_uint64_t
log_get_capacity(void)
/*==================*/
{
	/* Plain read of log_sys->log_group_capacity; no locking,
	per the constancy note above. */
	return(log_sys->log_group_capacity);
}
|
||||
|
||||
/***********************************************************************//**
|
||||
Checks if there is need for a log buffer flush or a new checkpoint, and does
|
||||
this if yes. Any database operation should call this when it has modified
|
||||
|
|
|
@ -139,6 +139,8 @@ extern ulint srv_n_log_files;
|
|||
extern ulint srv_log_file_size;
|
||||
extern ulint srv_log_buffer_size;
|
||||
extern ulong srv_flush_log_at_trx_commit;
|
||||
extern char srv_adaptive_flushing;
|
||||
|
||||
|
||||
/* The sort order table of the MySQL latin1_swedish_ci character set
|
||||
collation */
|
||||
|
|
|
@ -183,6 +183,10 @@ UNIV_INTERN ulint srv_log_file_size = ULINT_MAX;
|
|||
UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
|
||||
UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
|
||||
|
||||
/* Try to flush dirty pages so as to avoid IO bursts at
the checkpoints. Exposed as the dynamic innodb_adaptive_flushing
system variable; default TRUE. */
UNIV_INTERN char	srv_adaptive_flushing	= TRUE;
|
||||
|
||||
/* The sort order table of the MySQL latin1_swedish_ci character set
|
||||
collation */
|
||||
UNIV_INTERN const byte* srv_latin1_ordering;
|
||||
|
@ -2175,13 +2179,16 @@ loop:
|
|||
}
|
||||
|
||||
/* Update the statistics collected for deciding LRU
|
||||
eviction policy. */
|
||||
eviction policy. */
|
||||
buf_LRU_stat_update();
|
||||
|
||||
/* Update the statistics collected for flush rate policy. */
|
||||
buf_flush_stat_update();
|
||||
|
||||
/* In case mutex_exit is not a memory barrier, it is
|
||||
theoretically possible some threads are left waiting though
|
||||
the semaphore is already released. Wake up those threads: */
|
||||
|
||||
|
||||
sync_arr_wake_threads_if_sema_free();
|
||||
|
||||
if (sync_array_print_long_waits()) {
|
||||
|
@ -2423,6 +2430,22 @@ loop:
|
|||
iteration of this loop. */
|
||||
|
||||
skip_sleep = TRUE;
|
||||
} else if (srv_adaptive_flushing) {
|
||||
|
||||
/* Try to keep the rate of flushing of dirty
|
||||
pages such that redo log generation does not
|
||||
produce bursts of IO at checkpoint time. */
|
||||
ulint n_flush = buf_flush_get_desired_flush_rate();
|
||||
|
||||
if (n_flush) {
|
||||
n_flush = ut_min(PCT_IO(100), n_flush);
|
||||
n_pages_flushed =
|
||||
buf_flush_batch(
|
||||
BUF_FLUSH_LIST,
|
||||
n_flush,
|
||||
IB_ULONGLONG_MAX);
|
||||
skip_sleep = TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
if (srv_activity_count == old_activity_count) {
|
||||
|
|
Loading…
Add table
Reference in a new issue