mirror of
https://github.com/MariaDB/server.git
synced 2025-01-19 21:42:35 +01:00
branches/innodb+ rb://48
This patch improves recovery performance in InnoDB+. It introduces a red-black tree for sorted insertion into the flush_list, along with a couple of other tweaks. More can be found at: https://svn.innodb.com/innobase/Recovery_Performance_Improvements Reviewed by: Marko
This commit is contained in:
parent
c1d4665514
commit
90c00c9e52
5 changed files with 228 additions and 9 deletions
182
buf/buf0flu.c
182
buf/buf0flu.c
|
@ -37,6 +37,142 @@ buf_flush_validate_low(void);
|
|||
/* out: TRUE if ok */
|
||||
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
|
||||
|
||||
/**********************************************************************
|
||||
Insert a block in the flush_rbt and returns a pointer to its
|
||||
predecessor or NULL if no predecessor. The ordering is maintained
|
||||
on the basis of the <oldest_modification, space, offset> key. */
|
||||
static
|
||||
buf_page_t*
|
||||
buf_flush_insert_in_flush_rbt(
|
||||
/*==========================*/
|
||||
/* out: pointer to the predecessor or
|
||||
NULL if no predecessor. */
|
||||
buf_page_t* bpage) /* in: bpage to be inserted. */
|
||||
{
|
||||
buf_page_t* prev = NULL;
|
||||
const ib_rbt_node_t* c_node;
|
||||
const ib_rbt_node_t* p_node;
|
||||
|
||||
ut_ad(buf_pool_mutex_own());
|
||||
|
||||
/* Insert this buffer into the rbt. */
|
||||
c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
|
||||
ut_a(c_node != NULL);
|
||||
|
||||
/* Get the predecessor. */
|
||||
p_node = rbt_prev(buf_pool->flush_rbt, c_node);
|
||||
|
||||
if (p_node != NULL) {
|
||||
prev = *rbt_value(buf_page_t*, p_node);
|
||||
ut_a(prev != NULL);
|
||||
}
|
||||
|
||||
return(prev);
|
||||
}
|
||||
|
||||
/*************************************************************
|
||||
Delete a bpage from the flush_rbt. */
|
||||
static
|
||||
void
|
||||
buf_flush_delete_from_flush_rbt(
|
||||
/*============================*/
|
||||
buf_page_t* bpage) /* in: bpage to be removed. */
|
||||
{
|
||||
|
||||
ibool ret = FALSE;
|
||||
|
||||
ut_ad(buf_pool_mutex_own());
|
||||
ret = rbt_delete(buf_pool->flush_rbt, &bpage);
|
||||
ut_ad(ret);
|
||||
}
|
||||
|
||||
/*********************************************************************
|
||||
Compare two modified blocks in the buffer pool. The key for comparison
|
||||
is:
|
||||
key = <oldest_modification, space, offset>
|
||||
This comparison is used to maintian ordering of blocks in the
|
||||
buf_pool->flush_rbt.
|
||||
Note that for the purpose of flush_rbt, we only need to order blocks
|
||||
on the oldest_modification. The other two fields are used to uniquely
|
||||
identify the blocks. */
|
||||
static
|
||||
int
|
||||
buf_flush_block_cmp(
|
||||
/*================*/
|
||||
/* out:
|
||||
< 0 if b2 < b1,
|
||||
0 if b2 == b1,
|
||||
> 0 if b2 > b1 */
|
||||
const void* p1, /* in: block1 */
|
||||
const void* p2) /* in: block2 */
|
||||
{
|
||||
int ret;
|
||||
|
||||
ut_ad(p1 != NULL);
|
||||
ut_ad(p2 != NULL);
|
||||
|
||||
const buf_page_t* b1 = *(const buf_page_t**) p1;
|
||||
const buf_page_t* b2 = *(const buf_page_t**) p2;
|
||||
|
||||
ut_ad(b1 != NULL);
|
||||
ut_ad(b2 != NULL);
|
||||
|
||||
ut_ad(b1->in_flush_list);
|
||||
ut_ad(b2->in_flush_list);
|
||||
|
||||
if (b2->oldest_modification
|
||||
> b1->oldest_modification) {
|
||||
return(1);
|
||||
}
|
||||
|
||||
if (b2->oldest_modification
|
||||
< b1->oldest_modification) {
|
||||
return(-1);
|
||||
}
|
||||
|
||||
/* If oldest_modification is same then decide on the space. */
|
||||
ret = (int)(b2->space - b1->space);
|
||||
|
||||
/* Or else decide ordering on the offset field. */
|
||||
return(ret ? ret : (int)(b2->offset - b1->offset));
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
Initialize the red-black tree to speed up insertions into the flush_list
|
||||
during recovery process. Should be called at the start of recovery
|
||||
process before any page has been read/written. */
|
||||
UNIV_INTERN
|
||||
void
|
||||
buf_flush_init_flush_rbt(void)
|
||||
/*==========================*/
|
||||
{
|
||||
buf_pool_mutex_enter();
|
||||
|
||||
/* Create red black tree for speedy insertions in flush list. */
|
||||
buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
|
||||
buf_flush_block_cmp);
|
||||
buf_pool_mutex_exit();
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
Frees up the red-black tree. */
|
||||
UNIV_INTERN
|
||||
void
|
||||
buf_flush_free_flush_rbt(void)
|
||||
/*==========================*/
|
||||
{
|
||||
buf_pool_mutex_enter();
|
||||
|
||||
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
|
||||
ut_a(buf_flush_validate_low());
|
||||
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
|
||||
|
||||
rbt_free(buf_pool->flush_rbt);
|
||||
buf_pool->flush_rbt = NULL;
|
||||
|
||||
buf_pool_mutex_exit();
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
Inserts a modified block into the flush list. */
|
||||
UNIV_INTERN
|
||||
|
@ -50,6 +186,13 @@ buf_flush_insert_into_flush_list(
|
|||
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
|
||||
<= bpage->oldest_modification));
|
||||
|
||||
/* If we are in recovery then we need to update the flush
|
||||
red-black tree as well. */
|
||||
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
|
||||
buf_flush_insert_sorted_into_flush_list(bpage);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (buf_page_get_state(bpage)) {
|
||||
case BUF_BLOCK_ZIP_PAGE:
|
||||
mutex_enter(&buf_pool_zip_mutex);
|
||||
|
@ -120,12 +263,27 @@ buf_flush_insert_sorted_into_flush_list(
|
|||
}
|
||||
|
||||
prev_b = NULL;
|
||||
b = UT_LIST_GET_FIRST(buf_pool->flush_list);
|
||||
|
||||
while (b && b->oldest_modification > bpage->oldest_modification) {
|
||||
ut_ad(b->in_flush_list);
|
||||
prev_b = b;
|
||||
b = UT_LIST_GET_NEXT(list, b);
|
||||
/* For the most part when this function is called the flush_rbt
|
||||
should not be NULL. In a very rare boundary case it is possible
|
||||
that the flush_rbt has already been freed by the recovery thread
|
||||
before the last page was hooked up in the flush_list by the
|
||||
io-handler thread. In that case we'll just do a simple
|
||||
linear search in the else block. */
|
||||
if (buf_pool->flush_rbt) {
|
||||
|
||||
prev_b = buf_flush_insert_in_flush_rbt(bpage);
|
||||
|
||||
} else {
|
||||
|
||||
b = UT_LIST_GET_FIRST(buf_pool->flush_list);
|
||||
|
||||
while (b && b->oldest_modification
|
||||
> bpage->oldest_modification) {
|
||||
ut_ad(b->in_flush_list);
|
||||
prev_b = b;
|
||||
b = UT_LIST_GET_NEXT(list, b);
|
||||
}
|
||||
}
|
||||
|
||||
if (prev_b == NULL) {
|
||||
|
@ -242,6 +400,11 @@ buf_flush_remove(
|
|||
break;
|
||||
}
|
||||
|
||||
/* If the flush_rbt is active then delete from it as well. */
|
||||
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
|
||||
buf_flush_delete_from_flush_rbt(bpage);
|
||||
}
|
||||
|
||||
bpage->oldest_modification = 0;
|
||||
|
||||
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
|
||||
|
@ -1275,6 +1438,15 @@ buf_flush_validate_low(void)
|
|||
ut_a(buf_page_in_file(bpage));
|
||||
ut_a(om > 0);
|
||||
|
||||
/* If we are in recovery mode, i.e. flush_rbt != NULL,
|
||||
then each block in the flush_list must also be present
|
||||
in the flush_rbt. */
|
||||
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
|
||||
ut_a(*rbt_value(buf_page_t*,
|
||||
rbt_lookup(buf_pool->flush_rbt, &bpage))
|
||||
== bpage);
|
||||
}
|
||||
|
||||
bpage = UT_LIST_GET_NEXT(list, bpage);
|
||||
|
||||
ut_a(!bpage || om >= bpage->oldest_modification);
|
||||
|
|
|
@ -745,14 +745,14 @@ buf_read_recv_pages(
|
|||
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
|
||||
|
||||
os_aio_simulated_wake_handler_threads();
|
||||
os_thread_sleep(500000);
|
||||
os_thread_sleep(10000);
|
||||
|
||||
count++;
|
||||
|
||||
if (count > 100) {
|
||||
if (count > 1000) {
|
||||
fprintf(stderr,
|
||||
"InnoDB: Error: InnoDB has waited for"
|
||||
" 50 seconds for pending\n"
|
||||
" 10 seconds for pending\n"
|
||||
"InnoDB: reads to the buffer pool to"
|
||||
" be finished.\n"
|
||||
"InnoDB: Number of pending reads %lu,"
|
||||
|
|
|
@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri
|
|||
#include "sync0rw.h"
|
||||
#include "hash0hash.h"
|
||||
#include "ut0byte.h"
|
||||
#include "ut0rbt.h"
|
||||
#include "os0proc.h"
|
||||
#include "page0types.h"
|
||||
|
||||
|
@ -1285,6 +1286,19 @@ struct buf_pool_struct{
|
|||
/* this is in the set state when there
|
||||
is no flush batch of the given type
|
||||
running */
|
||||
ib_rbt_t* flush_rbt; /* a red-black tree is used
|
||||
exclusively during recovery to
|
||||
speed up insertions in the
|
||||
flush_list. This tree contains
|
||||
blocks in order of
|
||||
oldest_modification LSN and is
|
||||
kept in sync with the
|
||||
flush_list.
|
||||
Each member of the tree MUST
|
||||
also be on the flush_list.
|
||||
This tree is relevant only in
|
||||
recovery and is set to NULL
|
||||
once the recovery is over. */
|
||||
ulint ulint_clock; /* a sequence number used to count
|
||||
time. NOTE! This counter wraps
|
||||
around at 4 billion (if ulint ==
|
||||
|
|
|
@ -126,6 +126,22 @@ buf_flush_validate(void);
|
|||
/* out: TRUE if ok */
|
||||
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
|
||||
|
||||
/************************************************************************
|
||||
Initialize the red-black tree to speed up insertions into the flush_list
|
||||
during recovery process. Should be called at the start of recovery
|
||||
process before any page has been read/written. */
|
||||
UNIV_INTERN
|
||||
void
|
||||
buf_flush_init_flush_rbt(void);
|
||||
/*==========================*/
|
||||
|
||||
/************************************************************************
|
||||
Frees up the red-black tree. */
|
||||
UNIV_INTERN
|
||||
void
|
||||
buf_flush_free_flush_rbt(void);
|
||||
/*==========================*/
|
||||
|
||||
/* When buf_flush_free_margin is called, it tries to make this many blocks
|
||||
available to replacement in the free list and at the end of the LRU list (to
|
||||
make sure that a read-ahead batch can be read efficiently in a single
|
||||
|
|
|
@ -101,7 +101,9 @@ UNIV_INTERN ulint recv_max_parsed_page_no = 0;
|
|||
/* This many frames must be left free in the buffer pool when we scan
|
||||
the log and store the scanned log records in the buffer pool: we will
|
||||
use these free frames to read in pages when we start applying the
|
||||
log records to the database. */
|
||||
log records to the database.
|
||||
This is the default value. If the actual size of the buffer pool is
|
||||
at least 10 MB we'll set this value to 512. */
|
||||
|
||||
UNIV_INTERN ulint recv_n_pool_free_frames = 256;
|
||||
|
||||
|
@ -156,6 +158,12 @@ recv_sys_init(
|
|||
return;
|
||||
}
|
||||
|
||||
/* Initialize red-black tree for fast insertions into the
|
||||
flush_list during recovery process.
|
||||
As this initialization is done while holding the buffer pool
|
||||
mutex we perform it before acquiring recv_sys->mutex. */
|
||||
buf_flush_init_flush_rbt();
|
||||
|
||||
mutex_enter(&(recv_sys->mutex));
|
||||
|
||||
if (!recover_from_backup) {
|
||||
|
@ -165,6 +173,12 @@ recv_sys_init(
|
|||
recv_is_from_backup = TRUE;
|
||||
}
|
||||
|
||||
/* Set appropriate value of recv_n_pool_free_frames. */
|
||||
if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
|
||||
/* Buffer pool of size 10 MB or more. */
|
||||
recv_n_pool_free_frames = 512;
|
||||
}
|
||||
|
||||
recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
|
||||
recv_sys->len = 0;
|
||||
recv_sys->recovered_offset = 0;
|
||||
|
@ -231,6 +245,9 @@ recv_sys_free(void)
|
|||
recv_sys->heap = NULL;
|
||||
|
||||
mutex_exit(&(recv_sys->mutex));
|
||||
|
||||
/* Free up the flush_rbt. */
|
||||
buf_flush_free_flush_rbt();
|
||||
}
|
||||
#endif /* UNIV_LOG_DEBUG */
|
||||
|
||||
|
|
Loading…
Reference in a new issue