mariadb/buf/buf0flu.c
marko 86b8525254 branches/innodb+: Merge revisions 3602:3931 from branches/zip:
------------------------------------------------------------------------
  r3607 | marko | 2008-12-30 22:33:31 +0200 (Tue, 30 Dec 2008) | 20 lines

  branches/zip: Remove the dependency on the MySQL HASH table implementation.
  Use the InnoDB hash table for keeping track of INNOBASE_SHARE objects.

  struct st_innobase_share: Make table_name const uchar*.  Add the member
  table_name_hash.

  innobase_open_tables: Change the type from HASH to hash_table_t*.

  innobase_get_key(): Remove.

  innobase_fold_name(): New function, for computing the fold value for the
  InnoDB hash table.

  get_share(), free_share(): Use the InnoDB hash functions.

  innobase_end(): Free innobase_open_tables before shutting down InnoDB.
  Shutting down InnoDB will invalidate all memory allocated via InnoDB.

  rb://65 approved by Heikki Tuuri.  This addresses Issue #104.
  ------------------------------------------------------------------------
  r3608 | marko | 2008-12-30 22:45:04 +0200 (Tue, 30 Dec 2008) | 22 lines

  branches/zip: When setting the PAGE_LEVEL of a compressed B-tree page
  from or to 0, compress the page at the same time.  This is necessary,
  because the column information stored on the compressed page will
  differ between leaf and non-leaf pages.  Leaf pages are identified by
  PAGE_LEVEL=0.  This bug was reported as Issue #150.

  Document the similarity between btr_page_create() and
  btr_page_empty().  Make the function signature of btr_page_empty()
  identical with btr_page_create().  (This will add the parameter "level".)

  btr_root_raise_and_insert(): Replace some code with a call to
  btr_page_empty().

  btr_attach_half_pages(): Assert that the page level has already been
  set on both block and new_block.  Do not set it again.

  btr_discard_only_page_on_level(): Document that this function is
  probably never called.  Make it work on any height tree.  (Tested on
  2-high tree by disabling btr_lift_page_up().)

  rb://68
  ------------------------------------------------------------------------
  r3612 | marko | 2009-01-02 11:02:44 +0200 (Fri, 02 Jan 2009) | 14 lines

  branches/zip: Merge c2998 from branches/6.0, so that the same InnoDB Plugin
  source tree will work both under 5.1 and 6.0.  Do not add the test case
  innodb_ctype_ldml.test, because it would not work under MySQL 5.1.

  Refuse to create tables whose columns contain collation IDs above 255.
  This removes an assertion failure that was introduced in WL#4164
  (Two-byte collation IDs).

  create_table_def(): Do not fail an assertion if a column contains a
  charset-collation ID greater than 256. Instead, issue an error and
  refuse to create the table.

  The original change (branches/6.0 r2998) was rb://51 approved by Calvin Sun.
  ------------------------------------------------------------------------
  r3613 | inaam | 2009-01-02 15:10:50 +0200 (Fri, 02 Jan 2009) | 6 lines

  branches/zip: Implement the parameter innodb_use_sys_malloc
  (false by default), for disabling InnoDB's internal memory allocator
  and using system malloc/free instead.

  rb://62 approved by Marko
  ------------------------------------------------------------------------
  r3614 | marko | 2009-01-02 15:55:12 +0200 (Fri, 02 Jan 2009) | 1 line

  branches/zip: ChangeLog: Document r3608 and r3613.
  ------------------------------------------------------------------------
  r3615 | marko | 2009-01-02 15:57:51 +0200 (Fri, 02 Jan 2009) | 1 line

  branches/zip: ChangeLog: Clarify the impact of r3608.
  ------------------------------------------------------------------------
  r3616 | marko | 2009-01-03 00:23:30 +0200 (Sat, 03 Jan 2009) | 1 line

  branches/zip: srv_suspend_mysql_thread(): Add some clarifying comments.
  ------------------------------------------------------------------------
  r3618 | marko | 2009-01-05 12:54:53 +0200 (Mon, 05 Jan 2009) | 15 lines

  branches/zip: Merge revisions 3598:3601 from branches/5.1:

    ------------------------------------------------------------------------
    r3601 | marko | 2008-12-22 16:05:19 +0200 (Mon, 22 Dec 2008) | 9 lines

    branches/5.1: Make
    SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED
    a true replacement of SET GLOBAL INNODB_LOCKS_UNSAFE_FOR_BINLOG=1.
    This fixes an error that was introduced in r370, causing
    semi-consistent read not to unlock rows in READ COMMITTED mode.
    (Bug #41671, Issue #146)

    rb://67 approved by Heikki Tuuri
    ------------------------------------------------------------------------
  ------------------------------------------------------------------------
  r3623 | vasil | 2009-01-06 09:56:32 +0200 (Tue, 06 Jan 2009) | 7 lines

  branches/zip:

  Add patch to fix the failing main.variables mysql-test. It started failing
  after the variable innodb_use_sys_malloc was added because it matches
  '%alloc%' and the test is badly written and expects that no new variables
  like that will ever be added.
  ------------------------------------------------------------------------
  r3795 | marko | 2009-01-07 16:17:47 +0200 (Wed, 07 Jan 2009) | 7 lines

  branches/zip: row_merge_tuple_cmp(): Do not report a duplicate key value
  if any of the fields are NULL.  While the tuples are equal in the
  sorting order, SQL NULL is defined to be logically unequal to
  anything else. (Bug #41904)

  rb://70 approved by Heikki Tuuri
  ------------------------------------------------------------------------
  r3796 | marko | 2009-01-07 16:19:32 +0200 (Wed, 07 Jan 2009) | 1 line

  branches/zip: Add the tests that were forgotten from r3795.
  ------------------------------------------------------------------------
  r3797 | marko | 2009-01-07 16:22:18 +0200 (Wed, 07 Jan 2009) | 22 lines

  branches/zip: Do not call trx_allocate_for_mysql() directly, but use
  helper functions that initialize some members of the transaction struct.
  (Bug #41680)

  innobase_trx_init(): New function: initialize some fields of a
  transaction struct from a MySQL THD object.

  innobase_trx_allocate(): New function: allocate and initialize a
  transaction struct.

  check_trx_exists(): Use the above two functions.

  ha_innobase::delete_table(), ha_innobase::rename_table(),
  ha_innobase::add_index(), ha_innobase::final_drop_index():
  Use innobase_trx_allocate().

  innobase_drop_database(): In the Windows plugin, initialize the trx_t
  specially, because the THD is not available.  Otherwise, use
  innobase_trx_allocate().

  rb://69 accepted by Heikki Tuuri
  ------------------------------------------------------------------------
  r3798 | marko | 2009-01-07 16:42:42 +0200 (Wed, 07 Jan 2009) | 8 lines

  branches/zip: row_merge_drop_temp_indexes(): Do not lock the rows of
  SYS_INDEXES when looking for partially created indexes.  Use the
  transaction isolation level READ UNCOMMITTED to avoid interfering with
  locks held by incomplete transactions that will be rolled back in a
  subsequent step in the recovery.  (Issue #152)

  Approved by Heikki Tuuri
  ------------------------------------------------------------------------
  r3852 | vasil | 2009-01-08 22:10:10 +0200 (Thu, 08 Jan 2009) | 4 lines

  branches/zip:

  Add ChangeLog entries for r3795 r3796 r3797 r3798.

  ------------------------------------------------------------------------
  r3866 | marko | 2009-01-09 15:09:51 +0200 (Fri, 09 Jan 2009) | 2 lines

  branches/zip: buf_flush_try_page(): Move some common code from each
  switch case before the switch block.
  ------------------------------------------------------------------------
  r3867 | marko | 2009-01-09 15:13:14 +0200 (Fri, 09 Jan 2009) | 2 lines

  branches/zip: buf_flush_try_page(): Introduce the variable is_compressed
  for caching the result of buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE.
  ------------------------------------------------------------------------
  r3868 | marko | 2009-01-09 15:40:11 +0200 (Fri, 09 Jan 2009) | 4 lines

  branches/zip: buf_flush_insert_into_flush_list(),
  buf_flush_insert_sorted_into_flush_list(): Remove unused code.
  Change the parameter to buf_block_t* block and assert that
  block->state == BUF_BLOCK_FILE_PAGE.  This is part of Issue #155.
  ------------------------------------------------------------------------
  r3873 | marko | 2009-01-09 22:27:40 +0200 (Fri, 09 Jan 2009) | 17 lines

  branches/zip: Some non-functional changes related to Issue #155.

  buf_page_struct: Note that space and offset are also protected by
  buf_pool_mutex.  They are only assigned to by
  buf_block_set_file_page().  Thus, it suffices for buf_flush_batch() to
  hold just buf_pool_mutex when checking these fields.

  buf_flush_try_page(): Rename "locked" to "is_s_latched", per Heikki's request.

  buf_flush_batch(): Move the common statement mutex_exit(block_mutex)
  from all if-else if-else branches before the if block.  Remove the
  redundant test (buf_pool->init_flush[flush_type] == FALSE) that was
  apparently copied from buf_flush_write_complete().

  buf_flush_write_block_low(): Note why it is safe not to hold buf_pool_mutex
  or block_mutex.  Enumerate the assumptions in debug assertions.
  ------------------------------------------------------------------------
  r3874 | marko | 2009-01-09 23:09:06 +0200 (Fri, 09 Jan 2009) | 4 lines

  branches/zip: Add comments related to Issue #155.

  buf_flush_try_page(): Note why it is safe to access bpage without
  holding buf_pool_mutex or block_mutex.
  ------------------------------------------------------------------------
  r3875 | marko | 2009-01-09 23:15:12 +0200 (Fri, 09 Jan 2009) | 11 lines

  branches/zip: Non-functional change: Tighten debug assertions and
  remove dead code.

  buf_flush_ready_for_flush(), buf_flush_try_page(): Assert that
  flush_type is one of BUF_FLUSH_LRU or BUF_FLUSH_LIST.  The flush_type
  comes from buf_flush_batch(), which already asserts this.  The
  assertion holds for all calls in the source code.

  buf_flush_try_page(): Remove the dead case BUF_FLUSH_SINGLE_PAGE
  of switch (flush_type).
  ------------------------------------------------------------------------
  r3879 | marko | 2009-01-12 12:46:44 +0200 (Mon, 12 Jan 2009) | 14 lines

  branches/zip: Simplify the flushing of dirty pages from the buffer pool.

  buf_flush_try_page(): Rename to buf_flush_page(), and change the
  return type to void.  Replace the parameters space, offset with bpage,
  and remove the second page hash lookup.  Note and assert that both
  buf_pool_mutex and block_mutex must now be held upon entering the
  function.  They will still be released by this function.

  buf_flush_try_neighbors(): Replace buf_flush_try_page() with
  buf_flush_page().  Make the logic easier to follow by not negating the
  precondition of buf_flush_page().

  rb://73 approved by Sunny Bains.  This is related to Issue #157.
  ------------------------------------------------------------------------
  r3880 | marko | 2009-01-12 13:24:37 +0200 (Mon, 12 Jan 2009) | 2 lines

  branches/zip: buf_flush_page(): Fix a comment that should have been fixed
  in r3879.  Spotted by Sunny.
  ------------------------------------------------------------------------
  r3881 | marko | 2009-01-12 14:25:22 +0200 (Mon, 12 Jan 2009) | 2 lines

  branches/zip: buf_page_get_newest_modification(): Use the block mutex
  instead of the buffer pool mutex.  This is related to Issue #157.
  ------------------------------------------------------------------------
  r3882 | marko | 2009-01-12 14:40:08 +0200 (Mon, 12 Jan 2009) | 3 lines

  branches/zip: struct mtr_struct: Remove the unused field magic_n
  unless UNIV_DEBUG is defined.  mtr->magic_n is only assigned to
  and checked in UNIV_DEBUG builds.
  ------------------------------------------------------------------------
  r3883 | marko | 2009-01-12 14:48:59 +0200 (Mon, 12 Jan 2009) | 1 line

  branches/zip: Non-functional change: Use ut_d when assigning to mtr->state.
  ------------------------------------------------------------------------
  r3884 | marko | 2009-01-12 18:56:11 +0200 (Mon, 12 Jan 2009) | 16 lines

  branches/zip: Non-functional change: Add some debug assertions and comments.

  buf_page_t: Note that the LRU fields are protected by buf_pool_mutex
  only, not block->mutex or buf_pool_zip_mutex.

  buf_page_get_freed_page_clock(): Note that this is sometimes invoked
  without mutex protection.

  buf_pool_get_oldest_modification(): Note that the result may be out of
  date.

  buf_page_get_LRU_position(), buf_page_is_old(): Assert that the buffer
  pool mutex is being held.

  buf_page_release(): Assert that dirty blocks are in the flush list.
  ------------------------------------------------------------------------
  r3896 | marko | 2009-01-13 09:30:26 +0200 (Tue, 13 Jan 2009) | 2 lines

  branches/zip: buf_flush_try_neighbors(): Fix a bug
  that was introduced in r3879 (rb://73).
  ------------------------------------------------------------------------
  r3900 | marko | 2009-01-13 10:32:24 +0200 (Tue, 13 Jan 2009) | 1 line

  branches/zip: Fix some comments to say buf_pool_mutex.
  ------------------------------------------------------------------------
  r3907 | marko | 2009-01-13 11:54:01 +0200 (Tue, 13 Jan 2009) | 3 lines

  branches/zip: row_merge_create_temporary_table(): On error,
  row_create_table_for_mysql() already frees new_table.
  Do not attempt to free it again.
  ------------------------------------------------------------------------
  r3908 | marko | 2009-01-13 12:34:32 +0200 (Tue, 13 Jan 2009) | 1 line

  branches/zip: Enable HASH_ASSERT_OWNED independently of UNIV_SYNC_DEBUG.
  ------------------------------------------------------------------------
  r3914 | marko | 2009-01-13 21:46:22 +0200 (Tue, 13 Jan 2009) | 37 lines

  branches/zip: In hash table lookups, assert that the traversed items
  satisfy some conditions when UNIV_DEBUG is defined.

  HASH_SEARCH(): New parameter: ASSERTION. All users will pass an appropriate
  ut_ad() or nothing.

  dict_table_add_to_columns(): Assert that the table being added to the data
  dictionary cache is not already being pointed to by the name_hash and
  id_hash tables.

  HASH_SEARCH_ALL(): New macro, for use in dict_table_add_to_columns().

  dict_mem_table_free(): Set ut_d(table->cached = FALSE), so that we can
  check ut_ad(table->cached) when traversing the hash tables, as in
  HASH_SEARCH(name_hash, dict_sys->table_hash, ...) and
  HASH_SEARCH(id_hash, dict_sys->table_id_hash, ...).

  dict_table_get_low(), dict_table_get_on_id_low(): Assert
  ut_ad(!table || table->cached).

  fil_space_get_by_id(): Check ut_ad(space->magic_n == FIL_SPACE_MAGIC_N)
  in HASH_SEARCH(hash, fil_system->spaces, ...).

  fil_space_get_by_name(): Check ut_ad(space->magic_n == FIL_SPACE_MAGIC_N)
  in HASH_SEARCH(name_hash, fil_system->name_hash, ...).

  buf_buddy_block_free(): Check that the blocks are in valid state in
  HASH_SEARCH(hash, buf_pool->zip_hash, ...).

  buf_page_hash_get(): Check that the blocks are in valid state in
  HASH_SEARCH(hash, buf_pool->page_hash, ...).

  get_share(), free_share(): Check ut_ad(share->use_count > 0) in
  HASH_SEARCH(table_name_hash, innobase_open_tables, ...).

  This was posted as rb://75 for tracking down errors similar to Issue #153.
  ------------------------------------------------------------------------
  r3931 | marko | 2009-01-14 16:06:22 +0200 (Wed, 14 Jan 2009) | 26 lines

  branches/zip: Merge revisions 3601:3930 from branches/5.1:

    ------------------------------------------------------------------------
    r3911 | sunny | 2009-01-13 14:15:24 +0200 (Tue, 13 Jan 2009) | 13 lines

    branches/5.1: Fix Bug#38187 Error 153 when creating savepoints
    InnoDB previously treated savepoints as a stack e.g.,
      SAVEPOINT a;
      SAVEPOINT b;
      SAVEPOINT c;
      SAVEPOINT b; <- This would delete b and c.

    This fix changes the behavior to:
      SAVEPOINT a;
      SAVEPOINT b;
      SAVEPOINT c;
      SAVEPOINT b; <- Does not delete savepoint c
    ------------------------------------------------------------------------
    r3930 | marko | 2009-01-14 15:51:30 +0200 (Wed, 14 Jan 2009) | 4 lines

    branches/5.1: dict_load_table(): If dict_load_indexes() fails,
    invoke dict_table_remove_from_cache() instead of dict_mem_table_free(),
    so that the data dictionary will not point to freed data.
    (Bug #42075, Issue #153, rb://76 approved by Heikki Tuuri)
    ------------------------------------------------------------------------
  ------------------------------------------------------------------------
2009-01-14 14:25:45 +00:00

1473 lines
40 KiB
C

/******************************************************
The database buffer buf_pool flush algorithm
(c) 1995-2001 Innobase Oy
Created 11/11/1995 Heikki Tuuri
*******************************************************/
#include "buf0flu.h"
#ifdef UNIV_NONINL
#include "buf0flu.ic"
#include "trx0sys.h"
#endif
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "page0zip.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "srv0srv.h"
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
/* out: TRUE if ok */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
/**********************************************************************
Adds a dirty page to buf_pool->flush_rbt and reports which page sorts
immediately before it. The tree is ordered on the
<oldest_modification, space, offset> key. The caller must hold
buf_pool_mutex. */
static
buf_page_t*
buf_flush_insert_in_flush_rbt(
/*==========================*/
				/* out: the page that precedes bpage
				in the tree, or NULL if there is none */
	buf_page_t*	bpage)	/* in: page to insert */
{
	const ib_rbt_node_t*	inserted;
	const ib_rbt_node_t*	before;
	buf_page_t*		pred;

	ut_ad(buf_pool_mutex_own());

	/* The address of the page pointer is passed as both the key
	and the value of the new node. */
	inserted = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
	ut_a(inserted != NULL);

	/* Look up the node that sorts just before the new one. */
	before = rbt_prev(buf_pool->flush_rbt, inserted);

	if (before == NULL) {

		return(NULL);
	}

	pred = *rbt_value(buf_page_t*, before);
	ut_a(pred != NULL);

	return(pred);
}
/*************************************************************
Removes a page from buf_pool->flush_rbt. The caller must hold
buf_pool_mutex. In debug builds it is asserted that the page was
actually found and deleted. */
static
void
buf_flush_delete_from_flush_rbt(
/*============================*/
	buf_page_t*	bpage)	/* in: page to remove from the tree */
{
	ibool	deleted;

	ut_ad(buf_pool_mutex_own());

	/* The call must stay outside ut_ad(), which is a debug-only
	assertion. */
	deleted = rbt_delete(buf_pool->flush_rbt, &bpage);

	ut_ad(deleted);
}
/*********************************************************************
Compare two modified blocks in the buffer pool. The key for comparison
is:
key = <oldest_modification, space, offset>
This comparison is used to maintain ordering of blocks in the
buf_pool->flush_rbt.
Note that for the purpose of flush_rbt, we only need to order blocks
on the oldest_modification. The other two fields are used to uniquely
identify the blocks.
Each field is compared explicitly rather than by subtracting and
casting to int: the difference of two unsigned values does not
necessarily fit in an int, and a truncated difference could carry the
wrong sign, which would break the ordering of the tree. */
static
int
buf_flush_block_cmp(
/*================*/
				/* out:
				< 0 if b2 < b1,
				0 if b2 == b1,
				> 0 if b2 > b1 */
	const void*	p1,	/* in: pointer to a buf_page_t*
				(block1) */
	const void*	p2)	/* in: pointer to a buf_page_t*
				(block2) */
{
	const buf_page_t*	b1;
	const buf_page_t*	b2;

	ut_ad(p1 != NULL);
	ut_ad(p2 != NULL);

	/* The tree nodes store pointers to the control blocks, so the
	arguments are pointers to those pointers. */
	b1 = *(const buf_page_t* const*) p1;
	b2 = *(const buf_page_t* const*) p2;

	ut_ad(b1 != NULL);
	ut_ad(b2 != NULL);

	ut_ad(b1->in_flush_list);
	ut_ad(b2->in_flush_list);

	if (b2->oldest_modification > b1->oldest_modification) {

		return(1);
	}

	if (b2->oldest_modification < b1->oldest_modification) {

		return(-1);
	}

	/* If oldest_modification is same then decide on the space. */
	if (b2->space != b1->space) {

		return(b2->space > b1->space ? 1 : -1);
	}

	/* Or else decide ordering on the offset field. */
	if (b2->offset != b1->offset) {

		return(b2->offset > b1->offset ? 1 : -1);
	}

	return(0);
}
/************************************************************************
Creates buf_pool->flush_rbt, the red-black tree that speeds up sorted
insertions into the flush_list during recovery. Call this at the start
of recovery, before any page has been read or written. */
UNIV_INTERN
void
buf_flush_init_flush_rbt(void)
/*==========================*/
{
	buf_pool_mutex_enter();

	/* Each node holds one buf_page_t pointer; the nodes are
	ordered by buf_flush_block_cmp(). */
	buf_pool->flush_rbt = rbt_create(
		sizeof(buf_page_t*), buf_flush_block_cmp);

	buf_pool_mutex_exit();
}
/************************************************************************
Releases buf_pool->flush_rbt and resets the pointer to NULL. In debug
builds the flush list is validated first. */
UNIV_INTERN
void
buf_flush_free_flush_rbt(void)
/*==========================*/
{
	buf_pool_mutex_enter();

#if defined(UNIV_DEBUG) || defined(UNIV_BUF_DEBUG)
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	rbt_free(buf_pool->flush_rbt);

	/* Other functions in this file test flush_rbt against NULL to
	decide whether the tree must be maintained. */
	buf_pool->flush_rbt = NULL;

	buf_pool_mutex_exit();
}
/************************************************************************
Puts a newly modified block at the head of the flush list. When the
recovery-time flush_rbt exists, the work is delegated to
buf_flush_insert_sorted_into_flush_list() so that the tree is updated
as well. The caller must hold buf_pool_mutex. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_block_t*	block)	/* in/out: block which is modified */
{
	ut_ad(buf_pool_mutex_own());

	/* The current head of the list, if any, must not be newer
	than the block being added. */
	ut_ad(!UT_LIST_GET_FIRST(buf_pool->flush_list)
	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)
		  ->oldest_modification
		  <= block->page.oldest_modification));

	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		/* We are in recovery: the sorted insert also keeps
		the red-black tree up to date. */
		buf_flush_insert_sorted_into_flush_list(block);
	} else {
		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
		ut_ad(block->page.in_LRU_list);
		ut_ad(block->page.in_page_hash);
		ut_ad(!block->page.in_zip_hash);
		ut_ad(!block->page.in_flush_list);

		ut_d(block->page.in_flush_list = TRUE);

		UT_LIST_ADD_FIRST(list, buf_pool->flush_list,
				  &block->page);

#if defined(UNIV_DEBUG) || defined(UNIV_BUF_DEBUG)
		ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
	}
}
/************************************************************************
Inserts a modified block into the flush list at its sorted position.
Recovery uses this function because the modifications it applies do
not necessarily arrive in lsn order. The caller must hold
buf_pool_mutex. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_block_t*	block)	/* in/out: block which is modified */
{
	/* The list element after which the block is linked in;
	NULL means that the block becomes the first element. */
	buf_page_t*	after	= NULL;

	ut_ad(buf_pool_mutex_own());
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	ut_ad(block->page.in_LRU_list);
	ut_ad(block->page.in_page_hash);
	ut_ad(!block->page.in_zip_hash);
	ut_ad(!block->page.in_flush_list);

	/* This flag must be set before the tree insert, because the
	comparison function asserts on it. */
	ut_d(block->page.in_flush_list = TRUE);

	if (buf_pool->flush_rbt != NULL) {
		/* The normal case: the tree yields the predecessor
		directly. */
		after = buf_flush_insert_in_flush_rbt(&block->page);
	} else {
		/* A very rare boundary case: the recovery thread has
		already freed the flush_rbt before the io-handler
		thread hooked up the last page in the flush_list.
		Fall back to a linear scan of the list. */
		buf_page_t*	cur;

		for (cur = UT_LIST_GET_FIRST(buf_pool->flush_list);
		     cur && cur->oldest_modification
		     > block->page.oldest_modification;
		     cur = UT_LIST_GET_NEXT(list, cur)) {

			ut_ad(cur->in_flush_list);
			after = cur;
		}
	}

	if (after != NULL) {
		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
				     after, &block->page);
	} else {
		UT_LIST_ADD_FIRST(list, buf_pool->flush_list,
				  &block->page);
	}

#if defined(UNIV_DEBUG) || defined(UNIV_BUF_DEBUG)
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
/************************************************************************
Tells whether a file page can be replaced right away, i.e., whether
the transition FILE_PAGE => NOT_USED is allowed. A page qualifies when
it is clean, has no pending i/o and is not buffer-fixed. If the block
is not a file page at all, an error is printed to stderr and FALSE is
returned. */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
				/* out: TRUE if can replace
				immediately */
	buf_page_t*	bpage)	/* in: buffer control block, must be
				buf_page_in_file(bpage) and in the
				LRU list */
{
	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_LRU_list);

	if (UNIV_UNLIKELY(!buf_page_in_file(bpage))) {
		/* A block in this state should not be in the LRU
		list: report it and dump the control block. */
		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Error: buffer block state %lu"
			" in the LRU list!\n",
			(ulong) buf_page_get_state(bpage));
		ut_print_buf(stderr, bpage, sizeof(buf_page_t));
		putc('\n', stderr);

		return(FALSE);
	}

	return(bpage->oldest_modification == 0
	       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
	       && bpage->buf_fix_count == 0);
}
/************************************************************************
Returns TRUE if the block is modified and ready for flushing. A block
is ready when it is dirty (oldest_modification != 0) and has no i/o
pending. For an LRU flush the block must in addition not be
buffer-fixed. The caller must hold buf_pool_mutex and the block
mutex. */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
					/* out: TRUE if can flush
					immediately */
	buf_page_t*	bpage,		/* in: buffer control block,
					must be
					buf_page_in_file(bpage) */
	enum buf_flush	flush_type)	/* in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST */
{
	ut_a(buf_page_in_file(bpage));
	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	/* Both alternatives must be compared against flush_type: a
	bare BUF_FLUSH_LIST operand is a nonzero constant and would
	make the assertion always pass. */
	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (bpage->oldest_modification != 0
	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {

		ut_ad(bpage->in_flush_list);

		if (flush_type != BUF_FLUSH_LRU) {

			return(TRUE);

		} else if (bpage->buf_fix_count == 0) {

			/* If we are flushing the LRU list, to avoid
			deadlocks we require the block not to be
			bufferfixed, and hence not latched. */

			return(TRUE);
		}
	}

	return(FALSE);
}
/************************************************************************
Takes a block off the flush list of modified blocks and resets its
oldest_modification to 0. A dirty compressed-only page
(BUF_BLOCK_ZIP_DIRTY) is additionally switched to BUF_BLOCK_ZIP_PAGE
and handed to buf_LRU_insert_zip_clean(). If the recovery-time
flush_rbt exists, the block is deleted from it too. The caller must
hold buf_pool_mutex and the block mutex. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
	buf_page_t*	bpage)	/* in: pointer to the block in question */
{
	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_flush_list);

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_FILE_PAGE:
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
		break;

	case BUF_BLOCK_ZIP_DIRTY:
		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
		buf_LRU_insert_zip_clean(bpage);
		break;

	case BUF_BLOCK_ZIP_PAGE:
		/* clean compressed pages should not be on the
		flush list */
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		/* None of these states is valid for a block on the
		flush list. */
		ut_error;
		return;
	}

	/* While the flush_rbt is active it must mirror the list. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_delete_from_flush_rbt(bpage);
	}

	/* The flag may only be cleared after the deletion from the
	flush_rbt, because the comparison function asserts on
	in_flush_list. */
	ut_d(bpage->in_flush_list = FALSE);

	bpage->oldest_modification = 0;

	ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
}
/***********************************************************************
Replaces the control block bpage with dpage at the same position on
the flush_list, and in the flush_rbt if that is active. The contents
of bpage are assumed to have been copied to dpage already. The caller
must hold buf_pool_mutex and the mutex of bpage. */
UNIV_INTERN
void
buf_flush_relocate_on_flush_list(
/*=============================*/
	buf_page_t*	bpage,	/* in/out: control block being moved */
	buf_page_t*	dpage)	/* in/out: destination block */
{
	buf_page_t*	list_prev;		/* predecessor on the
						flush_list */
	buf_page_t*	rbt_prev_page = NULL;	/* predecessor in the
						flush_rbt */

	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));

	ut_ad(bpage->in_flush_list);
	ut_ad(dpage->in_flush_list);

	/* During recovery the tree has to follow the swap as well. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_delete_from_flush_rbt(bpage);
		rbt_prev_page = buf_flush_insert_in_flush_rbt(dpage);
	}

	/* The flag may only be cleared after bpage has left the
	flush_rbt, because the comparison function asserts on
	in_flush_list. */
	ut_d(bpage->in_flush_list = FALSE);

	/* Remember where bpage was before unlinking it. */
	list_prev = UT_LIST_GET_PREV(list, bpage);
	UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);

	if (list_prev == NULL) {
		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, dpage);
	} else {
		ut_ad(list_prev->in_flush_list);
		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
				     list_prev, dpage);
	}

	/* Just an extra check: the predecessor on the flush_list
	should be the same control block as in the flush_rbt. */
	ut_a(!buf_pool->flush_rbt || rbt_prev_page == list_prev);

#if defined(UNIV_DEBUG) || defined(UNIV_BUF_DEBUG)
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
/************************************************************************
Updates the flush bookkeeping once the write of a block has finished:
the block is removed from the flush list, the pending-flush counter of
its flush type is decremented, and the no_flush event of that type is
set when no flush of the type remains pending or is being
initialized. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
	buf_page_t*	bpage)	/* in: pointer to the block in question */
{
	enum buf_flush	type;

	ut_ad(bpage);

	buf_flush_remove(bpage);

	type = buf_page_get_flush_type(bpage);
	buf_pool->n_flush[type]--;

	if (type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to
		be moved to the free list */
		buf_LRU_make_block_old(bpage);

		buf_pool->LRU_flush_ended++;
	}

	/* fprintf(stderr, "n pending flush %lu\n",
	buf_pool->n_flush[type]); */

	if (buf_pool->n_flush[type] != 0
	    || buf_pool->init_flush[type] != FALSE) {
		/* Writes of this type are still pending, or a batch
		is still being initialized. */
		return;
	}

	/* The running flush batch has ended */
	os_event_set(buf_pool->no_flush[type]);
}
/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
byte* write_buf;
ulint len;
ulint len2;
ulint i;
if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
os_aio_simulated_wake_handler_threads();
return;
}
mutex_enter(&(trx_doublewrite->mutex));
/* Write first to doublewrite buffer blocks. We use synchronous
aio and thus know that file write has been completed when the
control returns. */
if (trx_doublewrite->first_free == 0) {
mutex_exit(&(trx_doublewrite->mutex));
return;
}
for (i = 0; i < trx_doublewrite->first_free; i++) {
const buf_block_t* block;
block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
|| block->page.zip.data) {
/* No simple validate for compressed pages exists. */
continue;
}
if (UNIV_UNLIKELY
(memcmp(block->frame + (FIL_PAGE_LSN + 4),
block->frame + (UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
4))) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: ERROR: The page to be written"
" seems corrupt!\n"
"InnoDB: The lsn fields do not match!"
" Noticed in the buffer pool\n"
"InnoDB: before posting to the"
" doublewrite buffer.\n");
}
if (!block->check_index_page_at_flush) {
} else if (page_is_comp(block->frame)) {
if (UNIV_UNLIKELY
(!page_simple_validate_new(block->frame))) {
corrupted_page:
buf_page_print(block->frame, 0);
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Apparent corruption of an"
" index page n:o %lu in space %lu\n"
"InnoDB: to be written to data file."
" We intentionally crash server\n"
"InnoDB: to prevent corrupt data"
" from ending up in data\n"
"InnoDB: files.\n",
(ulong) buf_block_get_page_no(block),
(ulong) buf_block_get_space(block));
ut_error;
}
} else if (UNIV_UNLIKELY
(!page_simple_validate_old(block->frame))) {
goto corrupted_page;
}
}
/* increment the doublewrite flushed pages counter */
srv_dblwr_pages_written+= trx_doublewrite->first_free;
srv_dblwr_writes++;
len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
write_buf = trx_doublewrite->write_buf;
i = 0;
fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
trx_doublewrite->block1, 0, len,
(void*) write_buf, NULL);
for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
len2 += UNIV_PAGE_SIZE, i++) {
const buf_block_t* block = (buf_block_t*)
trx_doublewrite->buf_block_arr[i];
if (UNIV_LIKELY(!block->page.zip.data)
&& UNIV_LIKELY(buf_block_get_state(block)
== BUF_BLOCK_FILE_PAGE)
&& UNIV_UNLIKELY
(memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
write_buf + len2
+ (UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: ERROR: The page to be written"
" seems corrupt!\n"
"InnoDB: The lsn fields do not match!"
" Noticed in the doublewrite block1.\n");
}
}
if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
goto flush;
}
len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
* UNIV_PAGE_SIZE;
write_buf = trx_doublewrite->write_buf
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
trx_doublewrite->block2, 0, len,
(void*) write_buf, NULL);
for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
len2 += UNIV_PAGE_SIZE, i++) {
const buf_block_t* block = (buf_block_t*)
trx_doublewrite->buf_block_arr[i];
if (UNIV_LIKELY(!block->page.zip.data)
&& UNIV_LIKELY(buf_block_get_state(block)
== BUF_BLOCK_FILE_PAGE)
&& UNIV_UNLIKELY
(memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
write_buf + len2
+ (UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: ERROR: The page to be"
" written seems corrupt!\n"
"InnoDB: The lsn fields do not match!"
" Noticed in"
" the doublewrite block2.\n");
}
}
flush:
/* Now flush the doublewrite buffer data to disk */
fil_flush(TRX_SYS_SPACE);
/* We know that the writes have been flushed to disk now
and in recovery we will find them in the doublewrite buffer
blocks. Next do the writes to the intended positions. */
for (i = 0; i < trx_doublewrite->first_free; i++) {
const buf_block_t* block = (buf_block_t*)
trx_doublewrite->buf_block_arr[i];
ut_a(buf_page_in_file(&block->page));
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, buf_page_get_space(&block->page),
buf_page_get_zip_size(&block->page),
buf_page_get_page_no(&block->page), 0,
buf_page_get_zip_size(&block->page),
(void*)block->page.zip.data,
(void*)block);
/* Increment the counter of I/O operations used
for selecting LRU policy. */
buf_LRU_stat_inc_io();
continue;
}
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
block->frame
+ (UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
4))) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: ERROR: The page to be written"
" seems corrupt!\n"
"InnoDB: The lsn fields do not match!"
" Noticed in the buffer pool\n"
"InnoDB: after posting and flushing"
" the doublewrite buffer.\n"
"InnoDB: Page buf fix count %lu,"
" io fix %lu, state %lu\n",
(ulong)block->page.buf_fix_count,
(ulong)buf_block_get_io_fix(block),
(ulong)buf_block_get_state(block));
}
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, buf_block_get_space(block), 0,
buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
/* Increment the counter of I/O operations used
for selecting LRU policy. */
buf_LRU_stat_inc_io();
}
/* Wake possible simulated aio thread to actually post the
writes to the operating system */
os_aio_simulated_wake_handler_threads();
/* Wait that all async writes to tablespaces have been posted to
the OS */
os_aio_wait_until_no_pending_writes();
/* Now we flush the data to disk (for example, with fsync) */
fil_flush_file_spaces(FIL_TABLESPACE);
/* We can now reuse the doublewrite memory buffer: */
trx_doublewrite->first_free = 0;
mutex_exit(&(trx_doublewrite->mutex));
}
/************************************************************************
Posts a buffer page for writing by copying it into the next free slot
of the doublewrite memory buffer. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes() and retries until a free slot
is available. If the copy fills the last slot, the buffer is flushed
before returning. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_page_t*	bpage)	/* in: buffer block to write */
{
	ulint	zip_size;
	ulint	slot_offset;

	/* Leave this loop holding the doublewrite mutex with at least
	one free slot; while the buffer is full, release the mutex,
	flush the buffered writes and check again. */
	for (;;) {
		mutex_enter(&trx_doublewrite->mutex);

		ut_a(buf_page_in_file(bpage));

		if (trx_doublewrite->first_free
		    < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
			break;
		}

		mutex_exit(&trx_doublewrite->mutex);
		buf_flush_buffered_writes();
	}

	zip_size = buf_page_get_zip_size(bpage);

	/* Byte offset of the first free slot within write_buf */
	slot_offset = UNIV_PAGE_SIZE * trx_doublewrite->first_free;

	if (UNIV_UNLIKELY(zip_size)) {
		/* Compressed page: copy zip_size bytes and zero-fill
		the remainder of the slot. */
		memcpy(trx_doublewrite->write_buf + slot_offset,
		       bpage->zip.data, zip_size);
		memset(trx_doublewrite->write_buf + slot_offset + zip_size,
		       0, UNIV_PAGE_SIZE - zip_size);
	} else {
		/* Uncompressed page: copy the whole frame. */
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		memcpy(trx_doublewrite->write_buf + slot_offset,
		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
	}

	/* Remember which control block occupies the slot */
	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
	    < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		/* There is still room for more pages */
		mutex_exit(&trx_doublewrite->mutex);
		return;
	}

	/* The page we just posted filled the buffer: flush it now */
	mutex_exit(&trx_doublewrite->mutex);
	buf_flush_buffered_writes();
}
/************************************************************************
Initializes a page for writing to the tablespace: stamps the newest
modification lsn and the checksum(s) into the page image that will be
written. For a compressed page the stamps go to page_zip->data; for an
uncompressed page they go to the header and trailer of page. When
srv_use_checksums is not set, BUF_NO_CHECKSUM_MAGIC is stored in place
of each checksum. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
	byte*		page,		/* in/out: page */
	void*		page_zip_,	/* in/out: compressed page, or NULL */
	ib_uint64_t	newest_lsn)	/* in: newest modification lsn
					to the page */
{
	byte*	trailer;
	ulint	checksum;

	ut_ad(page);

	if (page_zip_) {
		page_zip_des_t*	page_zip	= page_zip_;
		ulint		zip_size	= page_zip_get_size(page_zip);

		ut_ad(zip_size);
		ut_ad(ut_is_2pow(zip_size));
		ut_ad(zip_size <= UNIV_PAGE_SIZE);

		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
		case FIL_PAGE_TYPE_ALLOCATED:
		case FIL_PAGE_INODE:
		case FIL_PAGE_IBUF_BITMAP:
		case FIL_PAGE_TYPE_FSP_HDR:
		case FIL_PAGE_TYPE_XDES:
			/* These are essentially uncompressed pages:
			the leading zip_size bytes of the page are
			what gets written. */
			memcpy(page_zip->data, page, zip_size);
			/* fall through */
		case FIL_PAGE_TYPE_ZBLOB:
		case FIL_PAGE_TYPE_ZBLOB2:
		case FIL_PAGE_INDEX:
			mach_write_ull(page_zip->data + FIL_PAGE_LSN,
				       newest_lsn);
			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);

			/* The checksum is computed only after the lsn
			fields above have been written. */
			if (srv_use_checksums) {
				checksum = page_zip_calc_checksum(
					page_zip->data, zip_size);
			} else {
				checksum = BUF_NO_CHECKSUM_MAGIC;
			}

			mach_write_to_4(page_zip->data
					+ FIL_PAGE_SPACE_OR_CHKSUM,
					checksum);
			return;
		}

		/* Unknown page type: dump both images and crash */
		ut_print_timestamp(stderr);
		fputs("  InnoDB: ERROR: The compressed page to be written"
		      " seems corrupt:", stderr);
		ut_print_buf(stderr, page, zip_size);
		fputs("\nInnoDB: Possibly older version of the page:", stderr);
		ut_print_buf(stderr, page_zip->data, zip_size);
		putc('\n', stderr);
		ut_error;
	}

	trailer = page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM;

	/* Write the newest modification lsn to the page header and trailer */
	mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);
	mach_write_ull(trailer, newest_lsn);

	/* Store the new formula checksum */
	checksum = srv_use_checksums
		? buf_calc_page_new_checksum(page)
		: BUF_NO_CHECKSUM_MAGIC;
	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);

	/* The first 4 bytes of the end lsn field are overwritten with
	the old formula checksum. It depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, so it has to be calculated after the
	new formula checksum has been stored. */
	checksum = srv_use_checksums
		? buf_calc_page_old_checksum(page)
		: BUF_NO_CHECKSUM_MAGIC;
	mach_write_to_4(trailer, checksum);
}
/************************************************************************
Does an asynchronous write of a buffer page: forces the log to disk up
to the page's newest modification, prepares the page image (lsn and
checksum fields) and then either posts the page to the doublewrite
buffer or issues the write directly with fil_io().
NOTE: in simulated aio and also when the doublewrite buffer is used,
we must call buf_flush_buffered_writes after we have posted a batch of
writes! */
static
void
buf_flush_write_block_low(
/*======================*/
	buf_page_t*	bpage)	/* in: buffer block to write */
{
	ulint	zip_size	= buf_page_get_zip_size(bpage);
	page_t*	frame		= NULL;
#ifdef UNIV_LOG_DEBUG
	static ibool	univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

	ut_ad(buf_page_in_file(bpage));

	/* Neither buf_pool_mutex nor the block mutex is held here.
	Accessing bpage is safe all the same: it is io_fixed and
	oldest_modification != 0, so it cannot be relocated in the
	buffer pool or removed from flush_list or LRU_list. */
	ut_ad(!buf_pool_mutex_own());
	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
	ut_ad(bpage->oldest_modification != 0);

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
	ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs("Warning: cannot force log to disk if"
		      " UNIV_LOG_DEBUG is defined!\n"
		      "Crash recovery will not work!\n",
		      stderr);
	}
#else
	/* Force the log to the disk before writing the modified block */
	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_FILE_PAGE:
		/* The image to write is the compressed copy if there
		is one, else the uncompressed frame. */
		frame = bpage->zip.data;

		if (frame == NULL) {
			frame = ((buf_block_t*) bpage)->frame;
		}

		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
					   bpage->zip.data
					   ? &bpage->zip : NULL,
					   bpage->newest_modification);
		break;

	case BUF_BLOCK_ZIP_DIRTY:
		frame = bpage->zip.data;

		if (UNIV_LIKELY(srv_use_checksums)) {
			/* The stored checksum must match the page */
			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
			     == page_zip_calc_checksum(frame, zip_size));
		}

		mach_write_ull(frame + FIL_PAGE_LSN,
			       bpage->newest_modification);
		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
		break;

	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		/* None of these states is valid for a page being flushed */
		ut_error;
		break;
	}

	if (srv_use_doublewrite_buf && trx_doublewrite) {
		buf_flush_post_to_doublewrite_buf(bpage);
		return;
	}

	/* No doublewrite buffer in use: post the write directly */
	fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
	       FALSE, buf_page_get_space(bpage), zip_size,
	       buf_page_get_page_no(bpage), 0,
	       zip_size ? zip_size : UNIV_PAGE_SIZE,
	       frame, bpage);
}
/************************************************************************
Writes a flushable page asynchronously from the buffer pool to a file.
The function io-fixes the page for writing, records the flush type,
increments buf_pool->n_flush[flush_type] (resetting the no_flush event
when the counter leaves zero), s-locks the block of an uncompressed
page, releases both mutexes and finally calls
buf_flush_write_block_low().
NOTE: in simulated aio we must call
os_aio_simulated_wake_handler_threads after we have posted a batch of
writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be
held upon entering this function, and they will be released by this
function. */
static
void
buf_flush_page(
/*===========*/
buf_page_t* bpage, /* in: buffer control block */
enum buf_flush flush_type) /* in: BUF_FLUSH_LRU
or BUF_FLUSH_LIST */
{
mutex_t* block_mutex;
ibool is_uncompressed;
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
ut_ad(buf_pool_mutex_own());
ut_ad(buf_page_in_file(bpage));
block_mutex = buf_page_get_mutex(bpage);
ut_ad(mutex_own(block_mutex));
ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
/* Mark the page as being written and remember which kind of flush
is writing it. */
buf_page_set_io_fix(bpage, BUF_IO_WRITE);
buf_page_set_flush_type(bpage, flush_type);
/* The first pending write of this flush type resets the event that
signals "no flush of this type is running". */
if (buf_pool->n_flush[flush_type] == 0) {
os_event_reset(buf_pool->no_flush[flush_type]);
}
buf_pool->n_flush[flush_type]++;
/* Only pages in state BUF_BLOCK_FILE_PAGE are cast to buf_block_t
and have their block lock taken below. */
is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));
switch (flush_type) {
ibool is_s_latched;
case BUF_FLUSH_LIST:
/* If the simulated aio thread is not running, we must
not wait for any latch, as we may end up in a deadlock:
if buf_fix_count == 0, then we know we need not wait */
/* Despite its name, is_s_latched records whether
buf_fix_count was zero; for an uncompressed page that is
exactly the case in which the s-lock is taken right here,
before the mutexes are released. */
is_s_latched = (bpage->buf_fix_count == 0);
if (is_s_latched && is_uncompressed) {
rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
BUF_IO_WRITE);
}
mutex_exit(block_mutex);
buf_pool_mutex_exit();
/* Even though bpage is not protected by any mutex at
this point, it is safe to access bpage, because it is
io_fixed and oldest_modification != 0. Thus, it
cannot be relocated in the buffer pool or removed from
flush_list or LRU_list. */
if (!is_s_latched) {
/* The page was buffer-fixed, so the s-lock request may
have to wait: flush the buffered writes first, and only
then request the lock (uncompressed pages only). */
buf_flush_buffered_writes();
if (is_uncompressed) {
rw_lock_s_lock_gen(&((buf_block_t*) bpage)
->lock, BUF_IO_WRITE);
}
}
break;
case BUF_FLUSH_LRU:
/* VERY IMPORTANT:
Because any thread may call the LRU flush, even when owning
locks on pages, to avoid deadlocks, we must make sure that the
s-lock is acquired on the page without waiting: this is
accomplished because buf_flush_ready_for_flush() must hold,
and that requires the page not to be bufferfixed. */
if (is_uncompressed) {
rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
BUF_IO_WRITE);
}
/* Note that the s-latch is acquired before releasing the
buf_pool mutex: this ensures that the latch is acquired
immediately. */
mutex_exit(block_mutex);
buf_pool_mutex_exit();
break;
default:
/* Any other flush type is a caller error */
ut_error;
}
/* Even though bpage is not protected by any mutex at this
point, it is safe to access bpage, because it is io_fixed and
oldest_modification != 0. Thus, it cannot be relocated in the
buffer pool or removed from flush_list or LRU_list. */
#ifdef UNIV_DEBUG
if (buf_debug_prints) {
fprintf(stderr,
"Flushing %u space %u page %u\n",
flush_type, bpage->space, bpage->offset);
}
#endif /* UNIV_DEBUG */
/* Both mutexes have been released; prepare the page image and
post the write. */
buf_flush_write_block_low(bpage);
}
/***************************************************************
Flushes to disk all flushable pages within the flush area, i.e. the
aligned range of page numbers around the given page (or only the page
itself when the LRU list is short). The range is clipped to the size
of the tablespace. */
static
ulint
buf_flush_try_neighbors(
/*====================*/
					/* out: number of pages flushed */
	ulint		space,		/* in: space id */
	ulint		offset,		/* in: page offset */
	enum buf_flush	flush_type)	/* in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST */
{
	ulint	range_start;
	ulint	range_end;	/* exclusive upper bound */
	ulint	page_no;
	ulint	n_flushed	= 0;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush
		any block except from the end of the LRU list */
		range_start = offset;
		range_end = offset + 1;
	} else {
		/* When flushed, dirty blocks are searched in
		neighborhoods of this size, and flushed along with the
		original page. */
		ulint	area_size = ut_min(BUF_READ_AHEAD_AREA,
					   buf_pool->curr_size / 16);

		range_start = (offset / area_size) * area_size;
		range_end = range_start + area_size;
	}

	/* Never look beyond the end of the tablespace */
	if (range_end > fil_space_get_size(space)) {
		range_end = fil_space_get_size(space);
	}

	buf_pool_mutex_enter();

	for (page_no = range_start; page_no < range_end; page_no++) {
		mutex_t*	block_mutex;
		buf_page_t*	bpage = buf_page_hash_get(space, page_no);

		if (bpage == NULL) {
			/* The page is not in the buffer pool */
			continue;
		}

		ut_a(buf_page_in_file(bpage));

		/* We avoid flushing 'non-old' blocks in an LRU flush,
		because the flushed blocks are soon freed */
		if (flush_type == BUF_FLUSH_LRU
		    && page_no != offset
		    && !buf_page_is_old(bpage)) {
			continue;
		}

		block_mutex = buf_page_get_mutex(bpage);
		mutex_enter(block_mutex);

		/* Neighbors other than the requested page are flushed
		only when their buf fix count is zero: then we know
		that we probably can latch the page without a
		semaphore wait. Semaphore waits are expensive because
		we must flush the doublewrite buffer before we start
		waiting. */
		if (!buf_flush_ready_for_flush(bpage, flush_type)
		    || (page_no != offset && bpage->buf_fix_count)) {
			mutex_exit(block_mutex);
			continue;
		}

		/* buf_flush_page() releases both the block mutex and
		the buffer pool mutex; reacquire the latter for the
		next iteration. */
		buf_flush_page(bpage, flush_type);
		ut_ad(!mutex_own(block_mutex));
		n_flushed++;

		buf_pool_mutex_enter();
	}

	buf_pool_mutex_exit();

	return(n_flushed);
}
/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!

Only one batch of a given type runs at a time: if a batch of the same
type is being initialized or still has writes pending, the function
returns ULINT_UNDEFINED without doing anything. Before returning, a
batch that did run calls buf_flush_buffered_writes() and adds the
number of queued pages to srv_buf_pool_flushed. */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
					/* out: number of blocks for which the
					write request was queued;
					ULINT_UNDEFINED if there was a flush
					of the same type already running */
	enum buf_flush	flush_type,	/* in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
					then the caller must not own any
					latches on pages */
	ulint		min_n,		/* in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/* in the case BUF_FLUSH_LIST all
					blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
{
	buf_page_t*	bpage;
	ulint		page_count = 0;
	ulint		space;
	ulint		offset;

	ut_ad((flush_type == BUF_FLUSH_LRU)
	      || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
	buf_pool_mutex_enter();

	if ((buf_pool->n_flush[flush_type] > 0)
	    || (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		buf_pool_mutex_exit();

		return(ULINT_UNDEFINED);
	}

	buf_pool->init_flush[flush_type] = TRUE;

	for (;;) {
flush_next:
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!bpage
			    || bpage->oldest_modification >= lsn_limit) {
				/* We have flushed enough */

				break;
			}
			ut_ad(bpage->in_flush_list);
		}

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		/* The loop is pre-tested so that an empty LRU list
		(bpage == NULL) is treated as "nothing to flush"
		instead of being dereferenced. */
		while (bpage != NULL) {
			mutex_t*block_mutex = buf_page_get_mutex(bpage);
			ibool	ready;

			ut_a(buf_page_in_file(bpage));

			mutex_enter(block_mutex);
			ready = buf_flush_ready_for_flush(bpage, flush_type);
			mutex_exit(block_mutex);

			if (ready) {
				space = buf_page_get_space(bpage);
				offset = buf_page_get_page_no(bpage);

				/* buf_flush_try_neighbors() acquires
				the buffer pool mutex itself */
				buf_pool_mutex_exit();

				/* Try to flush also all the neighbors */
				page_count += buf_flush_try_neighbors(
					space, offset, flush_type);

				buf_pool_mutex_enter();
				goto flush_next;

			} else if (flush_type == BUF_FLUSH_LRU) {
				bpage = UT_LIST_GET_PREV(LRU, bpage);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				bpage = UT_LIST_GET_PREV(list, bpage);
				ut_ad(!bpage || bpage->in_flush_list);
			}
		}

		/* If we could not find anything to flush, leave the loop */

		break;
	}

	buf_pool->init_flush[flush_type] = FALSE;

	if (buf_pool->n_flush[flush_type] == 0) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	buf_pool_mutex_exit();

	/* Push out whatever was posted during this batch */
	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
		     || flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	return(page_count);
}
/**********************************************************************
Blocks the calling thread on the buf_pool->no_flush event of the given
flush type, i.e. until a flush batch of that type has ended. */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
	enum buf_flush	type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);

	os_event_wait(buf_pool->no_flush[type]);
}
/**********************************************************************
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list. The count starts from the length of the free list
and adds the blocks found ready for replacement while scanning a
bounded number of blocks from the end of the LRU list. */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
			/* out: number of blocks which should be flushed
			from the end of the LRU list */
{
	buf_page_t*	bpage;
	ulint		n_replaceable;
	ulint		n_scanned;

	buf_pool_mutex_enter();

	/* Every block in the free list is replaceable as such */
	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	/* Walk the LRU list backwards from its end until the margin
	is reached, the scan limit is hit or the list runs out. */
	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), n_scanned = 0;
	     bpage != NULL
	     && n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
	     + BUF_FLUSH_EXTRA_MARGIN
	     && n_scanned < BUF_LRU_FREE_SEARCH_LEN;
	     n_scanned++, bpage = UT_LIST_GET_PREV(LRU, bpage)) {

		mutex_t*	block_mutex = buf_page_get_mutex(bpage);

		mutex_enter(block_mutex);

		if (buf_flush_ready_for_replace(bpage)) {
			n_replaceable++;
		}

		mutex_exit(block_mutex);
	}

	buf_pool_mutex_exit();

	if (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN) {
		/* Too few replaceable blocks: ask for enough flushes
		to reach the margin plus the extra margin */
		return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
		       - n_replaceable);
	}

	return(0);
}
/*************************************************************************
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(void)
/*=======================*/
{
ulint n_to_flush;
ulint n_flushed;
n_to_flush = buf_flush_LRU_recommendation();
if (n_to_flush > 0) {
n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
if (n_flushed == ULINT_UNDEFINED) {
/* There was an LRU type flush batch already running;
let us wait for it to end */
buf_flush_wait_batch_end(BUF_FLUSH_LRU);
}
}
}
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************
Validates the flush list: checks the list links, that every block in
the list is a file page with a nonzero oldest_modification, and that
oldest_modification does not increase from the first block towards the
last. When buf_pool->flush_rbt is non-NULL, also checks that the
red-black tree contains exactly the same blocks in the same order.
Any violation fails an assertion. This function does not acquire
buf_pool_mutex itself; buf_flush_validate() acquires it around the
call. */
static
ibool
buf_flush_validate_low(void)
/*========================*/
		/* out: TRUE if ok */
{
	buf_page_t*		bpage;
	const ib_rbt_node_t*	rnode = NULL;

	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list);

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* If we are in recovery mode i.e.: flush_rbt != NULL
	then each block in the flush_list must also be present
	in the flush_rbt. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		rnode = rbt_first(buf_pool->flush_rbt);
	}

	while (bpage != NULL) {
		const ib_uint64_t	om = bpage->oldest_modification;

		ut_ad(bpage->in_flush_list);
		ut_a(buf_page_in_file(bpage));
		ut_a(om > 0);

		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
			/* Declared before the first statement of the
			block, like every other local in this file, so
			that the code remains valid C89. */
			buf_page_t*	rpage;

			ut_a(rnode);
			rpage = *rbt_value(buf_page_t*, rnode);

			/* The tree and the list must visit the same
			blocks in the same order */
			ut_a(rpage);
			ut_a(rpage == bpage);
			rnode = rbt_next(buf_pool->flush_rbt, rnode);
		}

		bpage = UT_LIST_GET_NEXT(list, bpage);

		/* The next block must not have a larger
		oldest_modification than the current one */
		ut_a(!bpage || om >= bpage->oldest_modification);
	}

	/* By this time we must have exhausted the traversal of
	flush_rbt (if active) as well. */
	ut_a(rnode == NULL);

	return(TRUE);
}
/**********************************************************************
Validates the flush list while holding the buffer pool mutex. */
UNIV_INTERN
ibool
buf_flush_validate(void)
/*====================*/
		/* out: TRUE if ok */
{
	ibool	is_valid;

	buf_pool_mutex_enter();

	is_valid = buf_flush_validate_low();

	buf_pool_mutex_exit();

	return(is_valid);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */