branches/innodb+ rb://48

This patch improves recovery performance in InnoDB+.
It introduces a red-black tree for sorted insertion into the
flush_list, along with a couple of other tweaks. More details can be
found at: https://svn.innodb.com/innobase/Recovery_Performance_Improvements

Reviewed by: Marko
This commit is contained in:
inaam 2008-11-11 10:31:51 +00:00
parent c1d4665514
commit 90c00c9e52
5 changed files with 228 additions and 9 deletions

View file

@ -37,6 +37,142 @@ buf_flush_validate_low(void);
/* out: TRUE if ok */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/**********************************************************************
Adds a block to buf_pool->flush_rbt and reports which block, if any,
immediately precedes it in the tree. Tree order is defined by the
<oldest_modification, space, offset> key. The buffer pool mutex must
be held by the caller. */
static
buf_page_t*
buf_flush_insert_in_flush_rbt(
/*==========================*/
					/* out: the block that precedes
					bpage in the tree, or NULL if
					there is no predecessor. */
	buf_page_t*	bpage)		/* in: block to add to the tree. */
{
	const ib_rbt_node_t*	new_node;
	const ib_rbt_node_t*	pred_node;
	buf_page_t*		pred;

	ut_ad(buf_pool_mutex_own());

	/* The address of the block pointer is handed over as both the
	key and the value argument of the insert. */
	new_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
	ut_a(new_node != NULL);

	/* Look up the node that sorts just before the new one. */
	pred_node = rbt_prev(buf_pool->flush_rbt, new_node);

	if (pred_node == NULL) {
		/* Nothing sorts before bpage. */
		return(NULL);
	}

	pred = *rbt_value(buf_page_t*, pred_node);
	ut_a(pred != NULL);

	return(pred);
}
/*************************************************************
Removes a block from buf_pool->flush_rbt. The buffer pool mutex must
be held by the caller. In debug builds it is asserted that the block
was actually found and removed. */
static
void
buf_flush_delete_from_flush_rbt(
/*============================*/
	buf_page_t*	bpage)	/* in: block to take out of the tree. */
{
	ibool	removed;

	ut_ad(buf_pool_mutex_own());

	/* The delete must stay outside the debug assertion so that it is
	also executed when assertions are compiled out. */
	removed = rbt_delete(buf_pool->flush_rbt, &bpage);
	ut_ad(removed);
}
/*********************************************************************
Compare two modified blocks in the buffer pool. The key for comparison
is:
key = <oldest_modification, space, offset>
This comparison is used to maintain ordering of blocks in the
buf_pool->flush_rbt.
Note that for the purpose of flush_rbt, we only need to order blocks
on the oldest_modification. The other two fields are used to uniquely
identify the blocks.
Each argument points to a buf_page_t pointer (the element type stored
in the tree), not to a buf_page_t itself. */
static
int
buf_flush_block_cmp(
/*================*/
				/* out:
				 < 0 if b2 < b1,
				 0 if b2 == b1,
				 > 0 if b2 > b1 */
	const void*	p1,	/* in: pointer to the block1 pointer */
	const void*	p2)	/* in: pointer to the block2 pointer */
{
	const buf_page_t*	b1;
	const buf_page_t*	b2;

	ut_ad(p1 != NULL);
	ut_ad(p2 != NULL);

	/* Keep the const qualification of the incoming pointers. */
	b1 = *(const buf_page_t* const*) p1;
	b2 = *(const buf_page_t* const*) p2;

	ut_ad(b1 != NULL);
	ut_ad(b2 != NULL);

	ut_ad(b1->in_flush_list);
	ut_ad(b2->in_flush_list);

	if (b2->oldest_modification > b1->oldest_modification) {
		return(1);
	}

	if (b2->oldest_modification < b1->oldest_modification) {
		return(-1);
	}

	/* If oldest_modification is same then decide on the space.
	Compare explicitly instead of subtracting: the difference of two
	unsigned values cast to int yields the wrong sign once the values
	are 2^31 or more apart, which would make the ordering
	inconsistent. */
	if (b2->space != b1->space) {
		return(b2->space > b1->space ? 1 : -1);
	}

	/* Or else decide ordering on the offset field. */
	if (b2->offset != b1->offset) {
		return(b2->offset > b1->offset ? 1 : -1);
	}

	return(0);
}
/************************************************************************
Creates buf_pool->flush_rbt, the red-black tree that speeds up sorted
insertions into the flush_list while recovery is running. Meant to be
called at the start of recovery, before any page has been read or
written. The buffer pool mutex is acquired and released internally. */
UNIV_INTERN
void
buf_flush_init_flush_rbt(void)
/*==========================*/
{
	buf_pool_mutex_enter();

	/* Each tree element is a pointer to a buf_page_t; elements are
	ordered by buf_flush_block_cmp(). */
	buf_pool->flush_rbt = rbt_create(
		sizeof(buf_page_t*), buf_flush_block_cmp);

	buf_pool_mutex_exit();
}
/************************************************************************
Destroys buf_pool->flush_rbt and resets the pointer to NULL. The buffer
pool mutex is acquired and released internally. In debug builds the
flush list is validated before the tree is released. */
UNIV_INTERN
void
buf_flush_free_flush_rbt(void)
/*==========================*/
{
	buf_pool_mutex_enter();

#if defined(UNIV_DEBUG) || defined(UNIV_BUF_DEBUG)
	/* Check the flush list while the tree is still available. */
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	rbt_free(buf_pool->flush_rbt);

	/* A NULL flush_rbt is what other code tests to see that the tree
	is not in use. */
	buf_pool->flush_rbt = NULL;

	buf_pool_mutex_exit();
}
/************************************************************************
Inserts a modified block into the flush list. */
UNIV_INTERN
@ -50,6 +186,13 @@ buf_flush_insert_into_flush_list(
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
<= bpage->oldest_modification));
/* If we are in the recovery then we need to update the flush
red-black tree as well. */
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
buf_flush_insert_sorted_into_flush_list(bpage);
return;
}
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE:
mutex_enter(&buf_pool_zip_mutex);
@ -120,12 +263,27 @@ buf_flush_insert_sorted_into_flush_list(
}
prev_b = NULL;
b = UT_LIST_GET_FIRST(buf_pool->flush_list);
while (b && b->oldest_modification > bpage->oldest_modification) {
ut_ad(b->in_flush_list);
prev_b = b;
b = UT_LIST_GET_NEXT(list, b);
/* For the most part when this function is called the flush_rbt
should not be NULL. In a very rare boundary case it is possible
that the flush_rbt has already been freed by the recovery thread
before the last page was hooked up in the flush_list by the
io-handler thread. In that case we'll just do a simple
linear search in the else block. */
if (buf_pool->flush_rbt) {
prev_b = buf_flush_insert_in_flush_rbt(bpage);
} else {
b = UT_LIST_GET_FIRST(buf_pool->flush_list);
while (b && b->oldest_modification
> bpage->oldest_modification) {
ut_ad(b->in_flush_list);
prev_b = b;
b = UT_LIST_GET_NEXT(list, b);
}
}
if (prev_b == NULL) {
@ -242,6 +400,11 @@ buf_flush_remove(
break;
}
/* If the flush_rbt is active then delete from it as well. */
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
buf_flush_delete_from_flush_rbt(bpage);
}
bpage->oldest_modification = 0;
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
@ -1275,6 +1438,15 @@ buf_flush_validate_low(void)
ut_a(buf_page_in_file(bpage));
ut_a(om > 0);
/* If we are in recovery mode i.e.: flush_rbt != NULL
then each block in the flush_list must also be present
in the flush_rbt. */
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
ut_a(*rbt_value(buf_page_t*,
rbt_lookup(buf_pool->flush_rbt, &bpage))
== bpage);
}
bpage = UT_LIST_GET_NEXT(list, bpage);
ut_a(!bpage || om >= bpage->oldest_modification);

View file

@ -745,14 +745,14 @@ buf_read_recv_pages(
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
os_aio_simulated_wake_handler_threads();
os_thread_sleep(500000);
os_thread_sleep(10000);
count++;
if (count > 100) {
if (count > 1000) {
fprintf(stderr,
"InnoDB: Error: InnoDB has waited for"
" 50 seconds for pending\n"
" 10 seconds for pending\n"
"InnoDB: reads to the buffer pool to"
" be finished.\n"
"InnoDB: Number of pending reads %lu,"

View file

@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri
#include "sync0rw.h"
#include "hash0hash.h"
#include "ut0byte.h"
#include "ut0rbt.h"
#include "os0proc.h"
#include "page0types.h"
@ -1285,6 +1286,19 @@ struct buf_pool_struct{
/* this is in the set state when there
is no flush batch of the given type
running */
ib_rbt_t* flush_rbt; /* a red-black tree is used
exclusively during recovery to
speed up insertions in the
flush_list. This tree contains
blocks in order of
oldest_modification LSN and is
kept in sync with the
flush_list.
Each member of the tree MUST
also be on the flush_list.
This tree is relevant only in
recovery and is set to NULL
once the recovery is over. */
ulint ulint_clock; /* a sequence number used to count
time. NOTE! This counter wraps
around at 4 billion (if ulint ==

View file

@ -126,6 +126,22 @@ buf_flush_validate(void);
/* out: TRUE if ok */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/************************************************************************
Initialize the red-black tree (buf_pool->flush_rbt) that is used to speed
up insertions into the flush_list during the recovery process. Should be
called at the start of the recovery process, before any page has been
read/written. The function acquires the buffer pool mutex itself. */
UNIV_INTERN
void
buf_flush_init_flush_rbt(void);
/*==========================*/
/************************************************************************
Frees up the red-black tree (buf_pool->flush_rbt) and sets the pointer
to NULL. The function acquires the buffer pool mutex itself. */
UNIV_INTERN
void
buf_flush_free_flush_rbt(void);
/*==========================*/
/* When buf_flush_free_margin is called, it tries to make this many blocks
available to replacement in the free list and at the end of the LRU list (to
make sure that a read-ahead batch can be read efficiently in a single

View file

@ -101,7 +101,9 @@ UNIV_INTERN ulint recv_max_parsed_page_no = 0;
/* This many frames must be left free in the buffer pool when we scan
the log and store the scanned log records in the buffer pool: we will
use these free frames to read in pages when we start applying the
log records to the database. */
log records to the database.
This is the default value. If the actual size of the buffer pool is
10 MB or larger we'll set this value to 512. */
UNIV_INTERN ulint recv_n_pool_free_frames = 256;
@ -156,6 +158,12 @@ recv_sys_init(
return;
}
/* Initialize red-black tree for fast insertions into the
flush_list during recovery process.
As this initialization is done while holding the buffer pool
mutex we perform it before acquiring recv_sys->mutex. */
buf_flush_init_flush_rbt();
mutex_enter(&(recv_sys->mutex));
if (!recover_from_backup) {
@ -165,6 +173,12 @@ recv_sys_init(
recv_is_from_backup = TRUE;
}
/* Set appropriate value of recv_n_pool_free_frames. */
if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
/* Buffer pool of size 10 MB or larger. */
recv_n_pool_free_frames = 512;
}
recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
recv_sys->len = 0;
recv_sys->recovered_offset = 0;
@ -231,6 +245,9 @@ recv_sys_free(void)
recv_sys->heap = NULL;
mutex_exit(&(recv_sys->mutex));
/* Free up the flush_rbt. */
buf_flush_free_flush_rbt();
}
#endif /* UNIV_LOG_DEBUG */