Add buf_pool->flush_rbt, a red-black tree that mirrors the flush_list
during recovery so that sorted insertions do not need a linear scan.

The tree is keyed on <oldest_modification, space, offset>. It is created
in recv_sys_init() and released in recv_sys_free(); while it is non-NULL,
flush_list insertions and removals also update the tree.

NOTE(review): this patch was reflowed from a whitespace-collapsed copy.
Tabs and line breaks of unchanged context lines are reconstructed and
should be confirmed against the target tree (or the patch applied while
ignoring whitespace). The index hashes are those of the original patch
and do not reflect the corrected post-image.

diff --git a/buf/buf0flu.c b/buf/buf0flu.c
index bd511869aaa..5def9bb7ce7 100644
--- a/buf/buf0flu.c
+++ b/buf/buf0flu.c
@@ -37,6 +37,152 @@ buf_flush_validate_low(void);
 		/* out: TRUE if ok */
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
+/**********************************************************************
+Inserts a block into the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+					/* out: pointer to the predecessor or
+					NULL if no predecessor. */
+	buf_page_t*	bpage)		/* in: bpage to be inserted. */
+{
+	buf_page_t*		prev = NULL;
+	const ib_rbt_node_t*	c_node;
+	const ib_rbt_node_t*	p_node;
+
+	ut_ad(buf_pool_mutex_own());
+
+	/* Insert this buffer into the rbt. */
+	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+	ut_a(c_node != NULL);
+
+	/* Get the predecessor. */
+	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+	if (p_node != NULL) {
+		prev = *rbt_value(buf_page_t*, p_node);
+		ut_a(prev != NULL);
+	}
+
+	return(prev);
+}
+
+/*************************************************************
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+	buf_page_t*	bpage)	/* in: bpage to be removed. */
+{
+	ibool	ret = FALSE;
+
+	ut_ad(buf_pool_mutex_own());
+	ret = rbt_delete(buf_pool->flush_rbt, &bpage);
+	ut_ad(ret);
+}
+
+/*********************************************************************
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks. */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+				/* out:
+				 < 0 if b2 < b1,
+				   0 if b2 == b1,
+				 > 0 if b2 > b1 */
+	const void*	p1,	/* in: block1 */
+	const void*	p2)	/* in: block2 */
+{
+	const buf_page_t*	b1;
+	const buf_page_t*	b2;
+
+	ut_ad(p1 != NULL);
+	ut_ad(p2 != NULL);
+
+	/* Assign separately so that no declaration follows a statement
+	(C89) in builds where ut_ad() expands to code. */
+	b1 = *(const buf_page_t**) p1;
+	b2 = *(const buf_page_t**) p2;
+
+	ut_ad(b1 != NULL);
+	ut_ad(b2 != NULL);
+
+	ut_ad(b1->in_flush_list);
+	ut_ad(b2->in_flush_list);
+
+	if (b2->oldest_modification
+	    > b1->oldest_modification) {
+		return(1);
+	}
+
+	if (b2->oldest_modification
+	    < b1->oldest_modification) {
+		return(-1);
+	}
+
+	/* If oldest_modification is same then decide on the space.
+	Compare explicitly: casting a difference to int reports the
+	wrong order whenever that difference does not fit in an int. */
+	if (b2->space != b1->space) {
+		return(b2->space > b1->space ? 1 : -1);
+	}
+
+	/* Or else decide ordering on the offset field. */
+	if (b2->offset != b1->offset) {
+		return(b2->offset > b1->offset ? 1 : -1);
+	}
+
+	return(0);
+}
+
+/************************************************************************
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+	buf_pool_mutex_enter();
+
+	/* Create red black tree for speedy insertions in flush list. */
+	buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
+					 buf_flush_block_cmp);
+	buf_pool_mutex_exit();
+}
+
+/************************************************************************
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+	buf_pool_mutex_enter();
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	rbt_free(buf_pool->flush_rbt);
+	buf_pool->flush_rbt = NULL;
+
+	buf_pool_mutex_exit();
+}
+
 /************************************************************************
 Inserts a modified block into the flush list. */
 UNIV_INTERN
@@ -50,6 +196,13 @@ buf_flush_insert_into_flush_list(
 	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
 		  <= bpage->oldest_modification));
 
+	/* If we are in recovery then we need to update the flush
+	red-black tree as well. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_insert_sorted_into_flush_list(bpage);
+		return;
+	}
+
 	switch (buf_page_get_state(bpage)) {
 	case BUF_BLOCK_ZIP_PAGE:
 		mutex_enter(&buf_pool_zip_mutex);
@@ -120,12 +273,27 @@ buf_flush_insert_sorted_into_flush_list(
 	}
 
 	prev_b = NULL;
-	b = UT_LIST_GET_FIRST(buf_pool->flush_list);
 
-	while (b && b->oldest_modification > bpage->oldest_modification) {
-		ut_ad(b->in_flush_list);
-		prev_b = b;
-		b = UT_LIST_GET_NEXT(list, b);
+	/* For the most part when this function is called the flush_rbt
+	should not be NULL. In a very rare boundary case it is possible
+	that the flush_rbt has already been freed by the recovery thread
+	before the last page was hooked up in the flush_list by the
+	io-handler thread. In that case we'll just do a simple
+	linear search in the else block. */
+	if (buf_pool->flush_rbt) {
+
+		prev_b = buf_flush_insert_in_flush_rbt(bpage);
+
+	} else {
+
+		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+		while (b && b->oldest_modification
+		       > bpage->oldest_modification) {
+			ut_ad(b->in_flush_list);
+			prev_b = b;
+			b = UT_LIST_GET_NEXT(list, b);
+		}
 	}
 
 	if (prev_b == NULL) {
@@ -242,6 +410,11 @@ buf_flush_remove(
 		break;
 	}
 
+	/* If the flush_rbt is active then delete from it as well. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_delete_from_flush_rbt(bpage);
+	}
+
 	bpage->oldest_modification = 0;
 
 	ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
@@ -1275,6 +1448,20 @@ buf_flush_validate_low(void)
 		ut_a(buf_page_in_file(bpage));
 		ut_a(om > 0);
 
+		/* If we are in recovery mode i.e.: flush_rbt != NULL
+		then each block in the flush_list must also be present
+		in the flush_rbt. */
+		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+			const ib_rbt_node_t*	rnode;
+
+			rnode = rbt_lookup(buf_pool->flush_rbt, &bpage);
+
+			/* Assert on the node itself first, so that a failed
+			lookup is not dereferenced by rbt_value(). */
+			ut_a(rnode != NULL);
+			ut_a(*rbt_value(buf_page_t*, rnode) == bpage);
+		}
+
 		bpage = UT_LIST_GET_NEXT(list, bpage);
 
 		ut_a(!bpage || om >= bpage->oldest_modification);
diff --git a/buf/buf0rea.c b/buf/buf0rea.c
index e2491570fb4..83e75ff593e 100644
--- a/buf/buf0rea.c
+++ b/buf/buf0rea.c
@@ -745,14 +745,14 @@ buf_read_recv_pages(
 		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
 
 			os_aio_simulated_wake_handler_threads();
-			os_thread_sleep(500000);
+			os_thread_sleep(10000);
 
 			count++;
 
-			if (count > 100) {
+			if (count > 1000) {
 				fprintf(stderr,
 					"InnoDB: Error: InnoDB has waited for"
-					" 50 seconds for pending\n"
+					" 10 seconds for pending\n"
 					"InnoDB: reads to the buffer pool to"
 					" be finished.\n"
 					"InnoDB: Number of pending reads %lu,"
diff --git a/include/buf0buf.h b/include/buf0buf.h
index efd6bd92091..c5701586619 100644
--- a/include/buf0buf.h
+++ b/include/buf0buf.h
@@ -30,6 +30,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "sync0rw.h"
 #include "hash0hash.h"
 #include "ut0byte.h"
+#include "ut0rbt.h"
 #include "os0proc.h"
 #include "page0types.h"
 
@@ -1285,6 +1286,19 @@ struct buf_pool_struct{
 					/* this is in the set state
 					when there is no flush batch
 					of the given type running */
+	ib_rbt_t*	flush_rbt;	/* a red-black tree is used
+					exclusively during recovery to
+					speed up insertions in the
+					flush_list. This tree contains
+					blocks in order of
+					oldest_modification LSN and is
+					kept in sync with the
+					flush_list.
+					Each member of the tree MUST
+					also be on the flush_list.
+					This tree is relevant only in
+					recovery and is set to NULL
+					once the recovery is over. */
 	ulint		ulint_clock;	/* a sequence number used to count
 					time. NOTE! This counter wraps
 					around at 4 billion (if ulint ==
diff --git a/include/buf0flu.h b/include/buf0flu.h
index b11801e9fe8..da497899a2c 100644
--- a/include/buf0flu.h
+++ b/include/buf0flu.h
@@ -126,6 +126,22 @@ buf_flush_validate(void);
 		/* out: TRUE if ok */
 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 
+/************************************************************************
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void);
+/*==========================*/
+
+/************************************************************************
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void);
+/*==========================*/
+
 /* When buf_flush_free_margin is called, it tries to make this many blocks
 available to replacement in the free list and at the end of the LRU list (to
 make sure that a read-ahead batch can be read efficiently in a single
diff --git a/log/log0recv.c b/log/log0recv.c
index a36eabce9a4..7f3df4eaefc 100644
--- a/log/log0recv.c
+++ b/log/log0recv.c
@@ -101,7 +101,9 @@ UNIV_INTERN ulint recv_max_parsed_page_no = 0;
 /* This many frames must be left free in the buffer pool when we scan
 the log and store the scanned log records in the buffer pool: we will
 use these free frames to read in pages when we start applying the
-log records to the database. */
+log records to the database.
+This is the default value. If the actual size of the buffer pool is
+at least 10 MB we'll set this value to 512. */
 UNIV_INTERN ulint	recv_n_pool_free_frames = 256;
 
 
@@ -156,6 +158,12 @@ recv_sys_init(
 		return;
 	}
 
+	/* Initialize red-black tree for fast insertions into the
+	flush_list during recovery process.
+	As this initialization is done while holding the buffer pool
+	mutex we perform it before acquiring recv_sys->mutex. */
+	buf_flush_init_flush_rbt();
+
 	mutex_enter(&(recv_sys->mutex));
 
 	if (!recover_from_backup) {
@@ -165,6 +173,12 @@ recv_sys_init(
 		recv_is_from_backup = TRUE;
 	}
 
+	/* Set appropriate value of recv_n_pool_free_frames. */
+	if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
+		/* Buffer pool of size at least 10 MB. */
+		recv_n_pool_free_frames = 512;
+	}
+
 	recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
 	recv_sys->len = 0;
 	recv_sys->recovered_offset = 0;
@@ -231,6 +245,9 @@ recv_sys_free(void)
 	recv_sys->heap = NULL;
 
 	mutex_exit(&(recv_sys->mutex));
+
+	/* Free up the flush_rbt. */
+	buf_flush_free_flush_rbt();
 }
 #endif /* UNIV_LOG_DEBUG */
 