MDEV-22871: Reduce InnoDB buf_pool.page_hash contention

The rw_lock_s_lock() calls for the buf_pool.page_hash became a
clear bottleneck after MDEV-15053 reduced the contention on
buf_pool.mutex. We will replace that use of rw_lock_t with a
special implementation that is optimized for memory bus traffic.

The hash_table_locks instrumentation will be removed.

buf_pool_t::page_hash: Use a special implementation whose API is
compatible with hash_table_t, and store the custom rw-locks
directly in buf_pool.page_hash.array, intentionally sharing
cache lines with the hash table pointers.
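
For illustration only (not part of the patch): a minimal standalone sketch of the index arithmetic behind this layout, using the ELEMENTS_PER_LATCH value of 1023 that is introduced in buf0buf.h below. Every 1024th slot of the array holds a latch; the 1023 slots after it are the hash cells that the latch protects, so the latch and its cells share cache lines.

#include <cassert>
#include <cstddef>

/* Sketch of the padded page_hash layout; pad() and the latch mask
mirror buf_pool_t::page_hash_table further below. */
static constexpr size_t ELEMENTS_PER_LATCH= 1023; /* one less than a power of 2 */

/* Convert a raw hash cell index into its slot in the padded array. */
static size_t pad(size_t h) { return 1 + h / ELEMENTS_PER_LATCH + h; }

/* Slot of the latch that guards padded slot i. */
static size_t latch_slot(size_t i) { return i & ~ELEMENTS_PER_LATCH; }

int main()
{
  assert(pad(0) == 1);                   /* cell 0 follows the latch in slot 0 */
  assert(latch_slot(pad(0)) == 0);
  assert(pad(1022) == 1023);             /* last cell guarded by the latch in slot 0 */
  assert(latch_slot(pad(1022)) == 0);
  assert(pad(1023) == 1025);             /* cell 1023 belongs to the latch in slot 1024 */
  assert(latch_slot(pad(1023)) == 1024);
  return 0;
}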

rw_lock: A low-level rw-lock implementation based on std::atomic<uint32_t>
where read_trylock() becomes a simple fetch_add(1).
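
As a sketch of the lock-word encoding (illustration only; the WRITER and WRITER_WAITING values are copied from the new rw_lock.h added below): readers are counted in the low 30 bits, so an uncontended read_trylock() is a single fetch_add(1), and a failed attempt has to be retracted with a matching fetch_sub(1) before retrying.

#include <atomic>
#include <cassert>
#include <cstdint>

int main()
{
  constexpr uint32_t WRITER= 1U << 31;         /* exclusive lock granted */
  constexpr uint32_t WRITER_WAITING= 1U << 30; /* write_lock_wait() pending */
  constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;

  std::atomic<uint32_t> word{0};

  /* read_trylock(): one fetch_add(1); it succeeds as long as neither
  of the two writer bits is set. */
  uint32_t l= word.fetch_add(1, std::memory_order_acquire);
  assert(!(l & WRITER_PENDING));                /* S latch acquired */
  assert(word.load() == 1);                     /* one reader counted */

  /* A writer announces itself (write_lock_wait_start()); a later
  read_trylock() fails and the reader retracts its increment. */
  word.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
  l= word.fetch_add(1, std::memory_order_acquire);
  assert(l & WRITER_PENDING);                   /* S latch refused */
  word.fetch_sub(1, std::memory_order_relaxed); /* retract before retrying */

  word.fetch_sub(1, std::memory_order_release); /* read_unlock() */
  return 0;
}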

buf_pool_t::page_hash_latch: The subclass of rw_lock used for the page_hash.

buf_pool_t::page_hash_latch::read_lock(): Assert that buf_pool.mutex
is not being held by the caller.

buf_pool_t::page_hash_latch::write_lock() may be called while not holding
buf_pool.mutex. buf_pool_t::watch_set() is such a caller.

buf_pool_t::page_hash_latch::read_lock_wait(),
page_hash_latch::write_lock_wait(): The spin loops.
These will obey the global parameters innodb_sync_spin_loops and
innodb_spin_wait_delay.

buf_pool_t::freed_page_hash: A singly linked list of copies of
buf_pool.page_hash that ever existed. The fact that we never
free any buf_pool.page_hash.array guarantees that all
page_hash_latch that ever existed will remain valid until shutdown.
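
A minimal model of that linking trick (illustration only; the type names here are placeholders, not InnoDB code): slot 1 of each retired array doubles as the "next" pointer, so the retired tables form a list without any extra allocation, which is how write_lock_all_page_hash() below walks them.

#include <cstddef>

struct cell { void *node; };    /* stands in for hash_cell_t */
struct table { cell *array; };  /* stands in for page_hash_table */

int main()
{
  cell a1[4]= {}, a2[4]= {};
  table t1{a1}, t2{a2};
  table *freed= nullptr;        /* stands in for freed_page_hash */

  /* resize_hash() prepends the table that it is retiring: */
  t1.array[1].node= freed; freed= &t1;
  t2.array[1].node= freed; freed= &t2;

  /* write_lock_all_page_hash()-style traversal of the retired tables: */
  int n= 0;
  for (table *t= freed; t; t= static_cast<table*>(t->array[1].node))
    n++;

  return n == 2 ? 0 : 1;        /* both retired tables were visited */
}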

buf_pool_t::resize_hash(): Replaces buf_pool_resize_hash().
Prepend a shallow copy of the old page_hash to freed_page_hash.

buf_pool_t::page_hash_table::n_cells: Declare as Atomic_relaxed.

buf_pool_t::page_hash_table::lock(): Explain what prevents a
race condition with buf_pool_t::resize_hash().
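
In outline, the pattern is validate-after-acquire. Here is a sketch of it (not the patch itself; std::shared_mutex and the fixed-size latch array are stand-ins for page_hash_latch and the padded array): the caller reads n_cells, acquires the latch derived from that value, and then re-reads n_cells; because the resizer write-locks every latch before swapping the tables, an unchanged n_cells proves that the correct latch is held, and otherwise the caller releases it and retries.

#include <atomic>
#include <cstddef>
#include <shared_mutex>

struct resizable_hash_sketch
{
  std::atomic<size_t> n_cells{1021};
  std::shared_mutex latches[4];                  /* placeholder latch array */

  std::shared_mutex &latch_for(size_t fold, size_t n)
  { return latches[(fold % n) % 4]; }

  /* Acquire the latch guarding fold, tolerating a concurrent resize. */
  std::shared_mutex &lock_shared(size_t fold)
  {
    for (;;)
    {
      size_t n= n_cells.load(std::memory_order_relaxed);
      std::shared_mutex &latch= latch_for(fold, n);
      latch.lock_shared();
      /* The resizer holds every latch exclusively while it changes
      n_cells, so if n_cells is still n, we hold the right latch. */
      if (n == n_cells.load(std::memory_order_relaxed))
        return latch;
      latch.unlock_shared();                     /* resized under us: retry */
    }
  }
};

int main()
{
  resizable_hash_sketch h;
  std::shared_mutex &latch= h.lock_shared(123);
  latch.unlock_shared();
  return 0;
}
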
Marko Mäkelä 2020-06-18 13:38:30 +03:00
parent cfd3d70ccb
commit 5155a300fa
23 changed files with 478 additions and 327 deletions

View file

@ -17,7 +17,6 @@ wait/synch/sxlock/innodb/dict_table_stats
wait/synch/sxlock/innodb/fil_space_latch
wait/synch/sxlock/innodb/fts_cache_init_rw_lock
wait/synch/sxlock/innodb/fts_cache_rw_lock
wait/synch/sxlock/innodb/hash_table_locks
wait/synch/sxlock/innodb/index_online_log
wait/synch/sxlock/innodb/index_tree_rw_lock
wait/synch/sxlock/innodb/trx_i_s_cache_lock

View file

@ -486,7 +486,6 @@ insert into test.sanity values
("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_OPTIMIZE_FULLTEXT_ONLY"),
("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_CLEANERS"),
("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_CLEANER_DISABLED_DEBUG"),
("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_HASH_LOCKS"),
("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PAGE_SIZE"),
("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PRINT_ALL_DEADLOCKS"),
("JUNK: GLOBAL-ONLY", "I_S.SESSION_VARIABLES", "INNODB_PURGE_BATCH_SIZE"),

View file

@ -1,24 +0,0 @@
select @@global.innodb_page_hash_locks between 1 and 1024;
@@global.innodb_page_hash_locks between 1 and 1024
1
select @@global.innodb_page_hash_locks;
@@global.innodb_page_hash_locks
64
select @@session.innodb_page_hash_locks;
ERROR HY000: Variable 'innodb_page_hash_locks' is a GLOBAL variable
show global variables like 'innodb_page_hash_locks';
Variable_name Value
innodb_page_hash_locks 64
show session variables like 'innodb_page_hash_locks';
Variable_name Value
innodb_page_hash_locks 64
select * from information_schema.global_variables where variable_name='innodb_page_hash_locks';
VARIABLE_NAME VARIABLE_VALUE
INNODB_PAGE_HASH_LOCKS 64
select * from information_schema.session_variables where variable_name='innodb_page_hash_locks';
VARIABLE_NAME VARIABLE_VALUE
INNODB_PAGE_HASH_LOCKS 64
set global innodb_page_hash_locks=1;
ERROR HY000: Variable 'innodb_page_hash_locks' is a read only variable
set @@session.innodb_page_hash_locks='some';
ERROR HY000: Variable 'innodb_page_hash_locks' is a read only variable

View file

@ -380,15 +380,6 @@
VARIABLE_COMMENT Deprecated parameter with no effect.
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 64
@@ -1513,7 +1513,7 @@
SESSION_VALUE NULL
DEFAULT_VALUE 16
VARIABLE_SCOPE GLOBAL
-VARIABLE_TYPE BIGINT UNSIGNED
+VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2
NUMERIC_MIN_VALUE 1
NUMERIC_MAX_VALUE 1024
@@ -1525,7 +1525,7 @@
SESSION_VALUE NULL
DEFAULT_VALUE 16384

View file

@ -1509,18 +1509,6 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_PAGE_HASH_LOCKS
SESSION_VALUE NULL
DEFAULT_VALUE 64
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BIGINT UNSIGNED
VARIABLE_COMMENT Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2
NUMERIC_MIN_VALUE 1
NUMERIC_MAX_VALUE 1024
NUMERIC_BLOCK_SIZE 0
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_PAGE_SIZE
SESSION_VALUE NULL
DEFAULT_VALUE 16384

View file

@ -1,24 +0,0 @@
--source include/have_innodb.inc
--source include/have_debug.inc
#
# exists as global only
#
select @@global.innodb_page_hash_locks between 1 and 1024;
select @@global.innodb_page_hash_locks;
--error ER_INCORRECT_GLOBAL_LOCAL_VAR
select @@session.innodb_page_hash_locks;
show global variables like 'innodb_page_hash_locks';
show session variables like 'innodb_page_hash_locks';
--disable_warnings
select * from information_schema.global_variables where variable_name='innodb_page_hash_locks';
select * from information_schema.session_variables where variable_name='innodb_page_hash_locks';
--enable_warnings
#
# show that it's read-only
#
--error ER_INCORRECT_GLOBAL_LOCAL_VAR
set global innodb_page_hash_locks=1;
--error ER_INCORRECT_GLOBAL_LOCAL_VAR
set @@session.innodb_page_hash_locks='some';

View file

@ -1085,15 +1085,15 @@ fail:
buf_block_t* block = buf_pool.block_from_ahi(rec);
if (!ahi_latch) {
rw_lock_t* hash_lock = buf_pool.hash_lock_get(
page_hash_latch* hash_lock = buf_pool.hash_lock_get(
block->page.id());
rw_lock_s_lock(hash_lock);
hash_lock->read_lock();
if (block->page.state() == BUF_BLOCK_REMOVE_HASH) {
/* Another thread is just freeing the block
from the LRU list of the buffer pool: do not
try to access this page. */
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
goto fail;
}
@ -1104,7 +1104,7 @@ fail:
DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED);
buf_block_buf_fix_inc(block, __FILE__, __LINE__);
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
block->page.set_accessed();
buf_page_make_young_if_needed(&block->page);

View file

@ -555,8 +555,8 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
return false;
}
rw_lock_t * hash_lock = buf_pool.hash_lock_get_low(fold);
rw_lock_x_lock(hash_lock);
page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
if (bpage->can_relocate()) {
/* Relocate the compressed page. */
@ -567,7 +567,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
memcpy(dst, src, size);
bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
buf_buddy_mem_invalid(
reinterpret_cast<buf_buddy_free_t*>(src), i);
@ -578,7 +578,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
return(true);
}
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
return(false);
}

View file

@ -40,6 +40,7 @@ Created 11/5/1995 Heikki Tuuri
#include <string.h>
#ifndef UNIV_INNOCHECKSUM
#include "my_cpu.h"
#include "mem0mem.h"
#include "btr0btr.h"
#include "fil0fil.h"
@ -278,6 +279,47 @@ the read requests for the whole area.
*/
#ifndef UNIV_INNOCHECKSUM
void page_hash_latch::read_lock_wait()
{
auto l= read_lock_yield();
/* First, try busy spinning for a while. */
for (auto spin= srv_n_spin_wait_rounds; spin--; )
{
if (l & WRITER_PENDING)
ut_delay(srv_spin_wait_delay);
if (read_trylock())
return;
l= read_lock_yield();
}
/* Fall back to yielding to other threads. */
for (;;)
{
if (l & WRITER_PENDING)
os_thread_yield();
if (read_trylock())
return;
l= read_lock_yield();
}
}
void page_hash_latch::write_lock_wait()
{
write_lock_wait_start();
/* First, try busy spinning for a while. */
for (auto spin= srv_n_spin_wait_rounds; spin--; )
{
if (write_lock_poll())
return;
ut_delay(srv_spin_wait_delay);
}
/* Fall back to yielding to other threads. */
do
os_thread_yield();
while (!write_lock_poll());
}
/** Value in microseconds */
constexpr int WAIT_FOR_READ= 100;
constexpr int WAIT_FOR_WRITE= 100;
@ -1441,6 +1483,15 @@ static void buf_block_free_mutexes(buf_block_t* block)
ut_d(ut_free(block->debug_latch));
}
/** Create the hash table.
@param n the lower bound of n_cells */
void buf_pool_t::page_hash_table::create(ulint n)
{
n_cells= ut_find_prime(n);
array= static_cast<hash_cell_t*>
(ut_zalloc_nokey(pad(n_cells) * sizeof *array));
}
/** Create the buffer pool.
@return whether the creation failed */
bool buf_pool_t::create()
@ -1517,16 +1568,7 @@ bool buf_pool_t::create()
n_chunks_new= n_chunks;
/* Number of locks protecting page_hash must be a power of two */
srv_n_page_hash_locks= my_round_up_to_next_power(static_cast<uint32_t>
(srv_n_page_hash_locks));
ut_a(srv_n_page_hash_locks != 0);
ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
page_hash.create(2 * curr_size);
for (auto i= srv_n_page_hash_locks; i--; )
rw_lock_create(hash_table_locks_key, &page_hash_latches[i],
SYNC_BUF_PAGE_HASH);
zip_hash.create(2 * curr_size);
last_printout_time= time(NULL);
@ -1604,9 +1646,14 @@ void buf_pool_t::close()
ut_free(chunks);
chunks= nullptr;
for (auto i= srv_n_page_hash_locks; i--; )
rw_lock_free(&page_hash_latches[i]);
page_hash.free();
while (page_hash_table *old_page_hash= freed_page_hash)
{
freed_page_hash= static_cast<page_hash_table*>
(old_page_hash->array[1].node);
old_page_hash->free();
UT_DELETE(old_page_hash);
}
zip_hash.free();
io_buf.close();
@ -1632,8 +1679,8 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
}
const page_id_t id(block->page.id());
rw_lock_t* hash_lock = hash_lock_get(id);
rw_lock_x_lock(hash_lock);
page_hash_latch* hash_lock = hash_lock_get(id);
hash_lock->write_lock();
if (block->page.can_relocate()) {
memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
@ -1722,13 +1769,13 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
ut_ad(new_block->lock_hash_val == lock_rec_hash(
id.space(), id.page_no()));
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
/* free block */
ut_d(block->page.set_state(BUF_BLOCK_MEMORY));
buf_LRU_block_free_non_file_page(block);
} else {
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
buf_LRU_block_free_non_file_page(new_block);
}
@ -1922,30 +1969,39 @@ inline bool buf_pool_t::withdraw_blocks()
}
/** resize page_hash and zip_hash */
static void buf_pool_resize_hash()
inline void buf_pool_t::resize_hash()
{
hash_table_t new_hash;
new_hash.create(2 * buf_pool.curr_size);
page_hash_table *new_page_hash= UT_NEW_NOKEY(page_hash_table());
new_page_hash->create(2 * buf_pool.curr_size);
new_page_hash->write_lock_all();
for (ulint i= 0; i < buf_pool.page_hash.n_cells; i++)
for (auto i= page_hash.pad(page_hash.n_cells); i--; )
{
while (buf_page_t *bpage= static_cast<buf_page_t*>
(HASH_GET_FIRST(&buf_pool.page_hash, i)))
static_assert(!((page_hash_table::ELEMENTS_PER_LATCH + 1) &
page_hash_table::ELEMENTS_PER_LATCH),
"must be one less than a power of 2");
if (!(i & page_hash_table::ELEMENTS_PER_LATCH))
{
ut_ad(reinterpret_cast<page_hash_latch*>
(&page_hash.array[i])->is_write_locked());
continue;
}
while (buf_page_t *bpage= static_cast<buf_page_t*>
(page_hash.array[i].node))
{
buf_page_t *prev_bpage= bpage;
ut_ad(bpage->in_page_hash);
bpage= static_cast<buf_page_t*>(HASH_GET_NEXT(hash, prev_bpage));
const ulint fold= prev_bpage->id().fold();
HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, fold, prev_bpage);
HASH_INSERT(buf_page_t, hash, &new_hash, fold, prev_bpage);
const ulint fold= bpage->id().fold();
HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
HASH_INSERT(buf_page_t, hash, new_page_hash, fold, bpage);
}
}
std::swap(buf_pool.page_hash.array, new_hash.array);
buf_pool.page_hash.n_cells= new_hash.n_cells;
new_hash.free();
buf_pool.page_hash.array[1].node= freed_page_hash;
std::swap(buf_pool.page_hash, *new_page_hash);
freed_page_hash= new_page_hash;
/* recreate zip_hash */
hash_table_t new_hash;
new_hash.create(2 * buf_pool.curr_size);
for (ulint i= 0; i < buf_pool.zip_hash.n_cells; i++)
@ -1953,11 +2009,9 @@ static void buf_pool_resize_hash()
while (buf_page_t *bpage= static_cast<buf_page_t*>
(HASH_GET_FIRST(&buf_pool.zip_hash, i)))
{
buf_page_t *prev_bpage= bpage;
bpage= static_cast<buf_page_t*>(HASH_GET_NEXT(hash, prev_bpage));
const ulint fold= BUF_POOL_ZIP_FOLD_BPAGE(prev_bpage);
HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, prev_bpage);
HASH_INSERT(buf_page_t, hash, &new_hash, fold, prev_bpage);
const ulint fold= BUF_POOL_ZIP_FOLD_BPAGE(bpage);
HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
HASH_INSERT(buf_page_t, hash, &new_hash, fold, bpage);
}
}
@ -1967,6 +2021,49 @@ static void buf_pool_resize_hash()
}
inline void buf_pool_t::page_hash_table::write_lock_all()
{
for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
{
reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
if (!n)
break;
}
}
inline void buf_pool_t::page_hash_table::write_unlock_all()
{
for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
{
reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
if (!n)
break;
}
}
inline void buf_pool_t::write_lock_all_page_hash()
{
ut_ad(mutex_own(&mutex));
page_hash.write_lock_all();
for (page_hash_table *old_page_hash= freed_page_hash; old_page_hash;
old_page_hash= static_cast<page_hash_table*>
(old_page_hash->array[1].node))
old_page_hash->write_lock_all();
}
inline void buf_pool_t::write_unlock_all_page_hash()
{
page_hash.write_unlock_all();
for (page_hash_table *old_page_hash= freed_page_hash; old_page_hash;
old_page_hash= static_cast<page_hash_table*>
(old_page_hash->array[1].node))
old_page_hash->write_unlock_all();
}
/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
inline void buf_pool_t::resize()
{
@ -2131,8 +2228,7 @@ withdraw_retry:
resizing.store(true, std::memory_order_relaxed);
mutex_enter(&mutex);
for (auto i= srv_n_page_hash_locks; i--; )
rw_lock_x_lock(&page_hash_latches[i]);
write_lock_all_page_hash();
chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
@ -2278,13 +2374,12 @@ calc_buf_pool_size:
if the new size is too different */
if (!warning && new_size_too_diff) {
buf_resize_status("Resizing hash table");
buf_pool_resize_hash();
resize_hash();
ib::info() << "hash tables were resized";
}
mutex_exit(&mutex);
for (auto i= srv_n_page_hash_locks; i--; )
rw_lock_x_unlock(&page_hash_latches[i]);
write_unlock_all_page_hash();
UT_DELETE(chunk_map_old);
@ -2390,7 +2485,7 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
const ulint fold= bpage->id().fold();
ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
ut_ad(mutex_own(&buf_pool.mutex));
ut_ad(rw_lock_own(buf_pool.hash_lock_get(bpage->id()), RW_LOCK_X));
ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
ut_a(bpage->io_fix() == BUF_IO_NONE);
ut_a(!bpage->buf_fix_count());
ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
@ -2443,11 +2538,11 @@ relocated, and reacquired.
@return a buffer pool block corresponding to id
@retval nullptr if the block was not present, and a watch was installed */
inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
rw_lock_t **hash_lock)
page_hash_latch **hash_lock)
{
const ulint fold= id.fold();
ut_ad(*hash_lock == hash_lock_get_low(fold));
ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X));
ut_ad(*hash_lock == page_hash.lock_get(fold));
ut_ad((*hash_lock)->is_write_locked());
retry:
if (buf_page_t *bpage= page_hash_get_low(id, fold))
@ -2460,7 +2555,7 @@ retry:
return nullptr;
}
rw_lock_x_unlock(*hash_lock);
(*hash_lock)->write_unlock();
/* Allocate a watch[] and then try to insert it into the page_hash. */
mutex_enter(&mutex);
@ -2484,18 +2579,18 @@ retry:
w->set_state(BUF_BLOCK_ZIP_PAGE);
w->id_= id;
*hash_lock= hash_lock_get_low(fold);
rw_lock_x_lock(*hash_lock);
*hash_lock= page_hash.lock_get(fold);
(*hash_lock)->write_lock();
mutex_exit(&mutex);
buf_page_t *bpage= page_hash_get_low(id, fold);
if (UNIV_LIKELY_NULL(bpage))
{
rw_lock_x_unlock(*hash_lock);
(*hash_lock)->write_unlock();
mutex_enter(&mutex);
w->set_state(BUF_BLOCK_NOT_USED);
*hash_lock= hash_lock_get_low(fold);
rw_lock_x_lock(*hash_lock);
*hash_lock= page_hash.lock_get(fold);
(*hash_lock)->write_lock();
mutex_exit(&mutex);
goto retry;
}
@ -2533,7 +2628,7 @@ void buf_page_free(const page_id_t page_id,
buf_pool.stat.n_page_gets++;
const ulint fold= page_id.fold();
rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold);
page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
buf_block_t *block= reinterpret_cast<buf_block_t*>
(buf_pool.page_hash_get_low(page_id, fold));
@ -2544,7 +2639,7 @@ void buf_page_free(const page_id_t page_id,
{
/* FIXME: if block!=NULL, convert to BUF_BLOCK_FILE_PAGE,
but avoid buf_zip_decompress() */
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
return;
}
@ -2559,7 +2654,7 @@ void buf_page_free(const page_id_t page_id,
block->page.status= buf_page_t::FREED;
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
}
/** Get read access to a compressed page (usually of type
@ -2581,7 +2676,7 @@ buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
bool discard_attempted= false;
const ulint fold= page_id.fold();
buf_page_t *bpage;
rw_lock_t *hash_lock;
page_hash_latch *hash_lock;
for (;;)
{
@ -2604,13 +2699,13 @@ lookup:
#endif /* UNIV_DEBUG */
}
ut_ad(rw_lock_own(hash_lock, RW_LOCK_S));
ut_ad(hash_lock->is_read_locked());
if (!bpage->zip.data)
{
/* There is no compressed page. */
err_exit:
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
return nullptr;
}
@ -2625,7 +2720,7 @@ err_exit:
if (!discard_attempted)
{
discard_attempted= true;
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
mutex_enter(&buf_pool.mutex);
if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
buf_LRU_free_page(bpage, false);
@ -2645,7 +2740,7 @@ err_exit:
got_block:
bool must_read= bpage->io_fix() == BUF_IO_READ;
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
DBUG_ASSERT(bpage->status != buf_page_t::FREED);
@ -2981,7 +3076,7 @@ loop:
buf_block_t* fix_block;
block = guess;
rw_lock_t* hash_lock = buf_pool.page_hash_lock<false>(fold);
page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold);
if (block) {
@ -3006,14 +3101,14 @@ lookup:
}
if (!block || buf_pool.watch_is_sentinel(block->page)) {
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
block = nullptr;
}
if (UNIV_UNLIKELY(!block)) {
/* Page not in buf_pool: needs to be read from file */
if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
hash_lock = buf_pool.page_hash_lock<true>(fold);
hash_lock = buf_pool.page_hash.lock<true>(fold);
if (buf_page_t *bpage= buf_pool.watch_set(
page_id, &hash_lock)) {
@ -3021,13 +3116,13 @@ lookup:
increment the fix count to make
sure that no state change takes place. */
bpage->fix();
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
block = reinterpret_cast<buf_block_t*>(bpage);
fix_block = block;
goto got_block;
}
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
}
switch (mode) {
@ -3121,7 +3216,7 @@ lookup:
}
fix_block->fix();
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
got_block:
switch (mode) {
@ -3212,9 +3307,9 @@ evict_from_pool:
buf_block_init_low(block);
mutex_enter(&buf_pool.mutex);
hash_lock = buf_pool.hash_lock_get_low(fold);
hash_lock = buf_pool.page_hash.lock_get(fold);
rw_lock_x_lock(hash_lock);
hash_lock->write_lock();
/* Buffer-fixing prevents the page_hash from changing. */
ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
@ -3228,7 +3323,7 @@ evict_from_pool:
This should be extremely unlikely, for example,
if buf_page_get_zip() was invoked. */
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
buf_LRU_block_free_non_file_page(block);
mutex_exit(&buf_pool.mutex);
@ -3276,7 +3371,7 @@ evict_from_pool:
UNIV_MEM_INVALID(bpage, sizeof *bpage);
mutex_exit(&buf_pool.mutex);
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
buf_pool.n_pend_unzip++;
access_time = block->page.is_accessed();
@ -3312,9 +3407,6 @@ evict_from_pool:
ut_ad(block == fix_block);
ut_ad(fix_block->page.buf_fix_count());
ut_ad(!rw_lock_own_flagged(hash_lock,
RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
@ -3336,8 +3428,8 @@ evict_from_pool:
if (buf_LRU_free_page(&fix_block->page, true)) {
space->release_for_io();
hash_lock = buf_pool.hash_lock_get_low(fold);
rw_lock_x_lock(hash_lock);
hash_lock = buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
mutex_exit(&buf_pool.mutex);
/* We may set the watch, as it would have
been set if the page were not in the
@ -3346,7 +3438,7 @@ evict_from_pool:
mode == BUF_GET_IF_IN_POOL_OR_WATCH
? buf_pool.watch_set(page_id, &hash_lock)
: buf_pool.page_hash_get_low(page_id, fold));
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
if (block != NULL) {
/* Either the page has been read in or
@ -3467,9 +3559,6 @@ get_latch:
buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
}
ut_ad(!rw_lock_own_flagged(hash_lock,
RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
return(fix_block);
}
@ -3558,17 +3647,17 @@ buf_page_optimistic_get(
return FALSE;
}
rw_lock_t *hash_lock = buf_pool.hash_lock_get(block->page.id());
rw_lock_s_lock(hash_lock);
page_hash_latch *hash_lock = buf_pool.hash_lock_get(block->page.id());
hash_lock->read_lock();
if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
|| block->page.io_fix() != BUF_IO_NONE)) {
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
return(FALSE);
}
buf_block_buf_fix_inc(block, file, line);
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
const bool first_access = block->page.set_accessed();
@ -3645,7 +3734,7 @@ buf_page_try_get_func(
ut_ad(mtr);
ut_ad(mtr->is_active());
rw_lock_t *hash_lock;
page_hash_latch *hash_lock;
buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
page_id.fold(),
&hash_lock);
@ -3653,13 +3742,13 @@ buf_page_try_get_func(
return nullptr;
if (bpage->state() != BUF_BLOCK_FILE_PAGE)
{
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
return nullptr;
}
buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
buf_block_buf_fix_inc(block, file, line);
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX;
if (!rw_lock_s_lock_nowait(&block->lock, file, line))
@ -3770,8 +3859,8 @@ buf_page_create(fil_space_t *space, uint32_t offset,
/* The block must be put to the LRU list */
buf_LRU_add_block(&block->page, false);
rw_lock_t *hash_lock= buf_pool.hash_lock_get(page_id);
rw_lock_x_lock(hash_lock);
page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
block->page.set_state(BUF_BLOCK_FILE_PAGE);
ut_d(block->page.in_page_hash= true);
HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
@ -3783,7 +3872,7 @@ buf_page_create(fil_space_t *space, uint32_t offset,
the block. */
block->page.set_io_fix(BUF_IO_READ);
rw_lock_x_lock(&block->lock);
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
/* buf_pool.mutex may be released and reacquired by
buf_buddy_alloc(). We must defer this operation until
@ -3801,7 +3890,7 @@ buf_page_create(fil_space_t *space, uint32_t offset,
rw_lock_x_unlock(&block->lock);
}
else
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
mutex_exit(&buf_pool.mutex);
@ -3954,10 +4043,10 @@ static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
void buf_pool_t::corrupted_evict(buf_page_t *bpage)
{
const page_id_t id(bpage->id());
rw_lock_t *hash_lock= hash_lock_get(id);
page_hash_latch *hash_lock= hash_lock_get(id);
mutex_enter(&mutex);
rw_lock_x_lock(hash_lock);
hash_lock->write_lock();
ut_ad(bpage->io_fix() == BUF_IO_READ);
ut_ad(!bpage->oldest_modification());

View file

@ -141,7 +141,7 @@ caller needs to free the page to the free list
@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
this case the block is already returned to the buddy allocator. */
static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
rw_lock_t *hash_lock, bool zip);
page_hash_latch *hash_lock, bool zip);
/** Free a block to buf_pool */
static void buf_LRU_block_free_hashed_page(buf_block_t *block)
@ -1160,8 +1160,8 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
bpage->can_relocate() from changing due to a concurrent
execution of buf_page_get_low(). */
const ulint fold = id.fold();
rw_lock_t* hash_lock = buf_pool.hash_lock_get_low(fold);
rw_lock_x_lock(hash_lock);
page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
if (UNIV_UNLIKELY(!bpage->can_relocate())) {
/* Do not free buffer fixed and I/O-fixed blocks. */
@ -1178,7 +1178,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
} else if (bpage->oldest_modification()
&& bpage->state() != BUF_BLOCK_FILE_PAGE) {
func_exit:
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
return(false);
} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
@ -1201,10 +1201,6 @@ func_exit:
return(true);
}
/* buf_LRU_block_remove_hashed() releases the hash_lock */
ut_ad(!rw_lock_own_flagged(hash_lock,
RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
then it was a compressed page with an uncompressed frame and
we are interested in freeing only the uncompressed frame.
@ -1215,7 +1211,7 @@ func_exit:
if (UNIV_LIKELY_NULL(b)) {
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
rw_lock_x_lock(hash_lock);
hash_lock->write_lock();
ut_ad(!buf_pool.page_hash_get_low(id, fold));
ut_ad(b->zip_size());
@ -1301,7 +1297,7 @@ func_exit:
decompressing the block while we release
hash_lock. */
b->set_io_fix(BUF_IO_PIN);
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
}
mutex_exit(&buf_pool.mutex);
@ -1405,10 +1401,10 @@ caller needs to free the page to the free list
@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
this case the block is already returned to the buddy allocator. */
static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
rw_lock_t *hash_lock, bool zip)
page_hash_latch *hash_lock, bool zip)
{
ut_ad(mutex_own(&buf_pool.mutex));
ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
ut_ad(hash_lock->is_write_locked());
ut_a(bpage->io_fix() == BUF_IO_NONE);
ut_a(!bpage->buf_fix_count());
@ -1501,7 +1497,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
#ifdef UNIV_DEBUG
UT_LIST_REMOVE(buf_pool.zip_clean, bpage);
#endif /* UNIV_DEBUG */
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
buf_pool_mutex_exit_forbid();
buf_buddy_free(bpage->zip.data, bpage->zip_size());
@ -1542,7 +1538,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
and by the time we'll release it in the caller we'd
have inserted the compressed only descriptor in the
page_hash. */
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
if (zip && bpage->zip.data) {
/* Free the compressed page. */
@ -1578,20 +1574,15 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
@param id page identifier
@param hash_lock buf_pool.page_hash latch (will be released here) */
void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
rw_lock_t *hash_lock)
page_hash_latch *hash_lock)
{
while (bpage->buf_fix_count())
{
/* Wait for other threads to release the fix count
before releasing the bpage from LRU list. */
ut_delay(1);
}
(void) LF_BACKOFF();
if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true))
buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
/* buf_LRU_block_remove_hashed() releases hash_lock */
ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
}
/** Update buf_pool.LRU_old_ratio.

View file

@ -53,7 +53,7 @@ that the block has been replaced with the real block.
@param watch sentinel */
inline void buf_pool_t::watch_remove(buf_page_t *watch)
{
ut_ad(rw_lock_own(hash_lock_get(watch->id()), RW_LOCK_X));
ut_ad(hash_lock_get(watch->id())->is_write_locked());
ut_a(watch_is_sentinel(*watch));
if (watch->buf_fix_count())
{
@ -125,14 +125,14 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
/* We must acquire hash_lock this early to prevent
a race condition with buf_pool_t::watch_remove() */
rw_lock_t *hash_lock= buf_pool.hash_lock_get_low(fold);
rw_lock_x_lock(hash_lock);
page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
{
/* The page is already in the buffer pool. */
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
if (block)
{
rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
@ -160,7 +160,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
ut_ad(!block->page.in_page_hash);
ut_d(block->page.in_page_hash= true);
HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
/* The block must be put to the LRU list, to the old blocks */
buf_LRU_add_block(bpage, true/* to old blocks */);
@ -184,7 +184,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
}
else
{
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
/* The compressed page must be allocated before the
control block (bpage), in order to avoid the
@ -193,7 +193,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
bool lru= false;
void *data= buf_buddy_alloc(zip_size, &lru);
rw_lock_x_lock(hash_lock);
hash_lock->write_lock();
/* If buf_buddy_alloc() allocated storage from the LRU list,
it released and reacquired buf_pool.mutex. Thus, we must
@ -205,7 +205,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
{
/* The block was added by some other thread. */
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
buf_buddy_free(data, zip_size);
goto func_exit;
}
@ -234,7 +234,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
ut_d(bpage->in_page_hash= true);
HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
bpage->set_io_fix(BUF_IO_READ);
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
/* The block must be put to the LRU list, to the old blocks.
The zip size is already set into the page zip */
@ -253,7 +253,6 @@ func_exit_no_mutex:
if (mode == BUF_READ_IBUF_PAGES_ONLY)
ibuf_mtr_commit(&mtr);
ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
ut_ad(!bpage || bpage->in_file());
return bpage;
@ -426,10 +425,10 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
for (page_id_t i= low; i < high; ++i)
{
const ulint fold= i.fold();
rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold);
const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
if (found && !--count)
goto read_ahead;
}
@ -620,7 +619,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
for (page_id_t i= low; i != high_1; ++i)
{
const ulint fold= i.fold();
rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold);
page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
if (i == page_id)
{
@ -632,7 +631,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
if (!bpage)
{
hard_fail:
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
space->release();
return 0;
}
@ -673,7 +672,7 @@ hard_fail:
else if (!bpage)
{
failed:
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
if (--count)
continue;
space->release();
@ -694,7 +693,7 @@ failed:
prev_accessed= accessed;
if (fail)
goto failed;
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
}
/* If we got this far, read-ahead can be sensible: do it */

View file

@ -590,8 +590,7 @@ static PSI_rwlock_info all_innodb_rwlocks[] = {
PSI_RWLOCK_KEY(trx_purge_latch),
PSI_RWLOCK_KEY(index_tree_rw_lock),
PSI_RWLOCK_KEY(index_online_log),
PSI_RWLOCK_KEY(dict_table_stats),
PSI_RWLOCK_KEY(hash_table_locks)
PSI_RWLOCK_KEY(dict_table_stats)
};
# endif /* UNIV_PFS_RWLOCK */
@ -19500,11 +19499,6 @@ static MYSQL_SYSVAR_ULONG(buffer_pool_chunk_size, srv_buf_pool_chunk_unit,
128 * 1024 * 1024, 1024 * 1024, LONG_MAX, 1024 * 1024);
#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
static MYSQL_SYSVAR_ULONG(page_hash_locks, srv_n_page_hash_locks,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
"Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2",
NULL, NULL, 64, 1, MAX_PAGE_HASH_LOCKS, 0);
static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
"Number of pages reserved in doublewrite buffer for batch flushing",
@ -20393,7 +20387,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(merge_threshold_set_all_debug),
#endif /* UNIV_DEBUG */
#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
MYSQL_SYSVAR(page_hash_locks),
MYSQL_SYSVAR(doublewrite_batch_size),
#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
MYSQL_SYSVAR(status_output),

View file

@ -70,9 +70,6 @@ struct fil_addr_t;
#define BUF_EVICT_IF_IN_POOL 20 /*!< evict a clean block if found */
/* @} */
#define MAX_PAGE_HASH_LOCKS 1024 /*!< The maximum number of
page_hash locks */
# ifdef UNIV_DEBUG
extern my_bool buf_disable_resize_buffer_pool_debug; /*!< if TRUE, resizing
buffer pool is not allowed. */
@ -1605,47 +1602,9 @@ public:
}
/** Get the page_hash latch for a page */
rw_lock_t *hash_lock_get(const page_id_t id) const
page_hash_latch *hash_lock_get(const page_id_t id) const
{
return hash_lock_get_low(id.fold());
}
/** Get a page_hash latch. */
rw_lock_t *hash_lock_get_low(ulint fold) const
{
return page_hash_latches +
ut_2pow_remainder(page_hash.calc_hash(fold),
ulint{srv_n_page_hash_locks});
}
private:
/** Get a page_hash latch. */
rw_lock_t *hash_lock_get_low(ulint fold, ulint n_cells) const
{
return page_hash_latches +
ut_2pow_remainder(ut_hash_ulint(fold, n_cells),
ulint{srv_n_page_hash_locks});
}
public:
/** Acquire a page_hash bucket latch, tolerating concurrent resize()
@tparam exclusive whether the latch is to be acquired exclusively
@param fold hash bucket key */
template<bool exclusive> rw_lock_t *page_hash_lock(ulint fold)
{
for (;;)
{
auto n_cells= page_hash.n_cells;
rw_lock_t *latch= hash_lock_get_low(fold, n_cells);
if (exclusive)
rw_lock_x_lock(latch);
else
rw_lock_s_lock(latch);
if (UNIV_LIKELY(n_cells == page_hash.n_cells))
return latch;
if (exclusive)
rw_lock_x_unlock(latch);
else
rw_lock_s_unlock(latch);
}
return page_hash.lock_get(id.fold());
}
/** Look up a block descriptor.
@ -1656,9 +1615,7 @@ public:
buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
{
ut_ad(id.fold() == fold);
ut_ad(mutex_own(&mutex) ||
rw_lock_own_flagged(hash_lock_get_low(fold),
RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
ut_ad(mutex_own(&mutex) || page_hash.lock_get(fold)->is_locked());
buf_page_t *bpage;
/* Look for the page in the hash table */
HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
@ -1676,17 +1633,14 @@ private:
@retval nullptr if no block was found; !lock || !*lock will also hold */
template<bool exclusive,bool watch>
buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
rw_lock_t **hash_lock)
page_hash_latch **hash_lock)
{
ut_ad(hash_lock || !exclusive);
rw_lock_t *latch= page_hash_lock<exclusive>(fold);
page_hash_latch *latch= page_hash.lock<exclusive>(fold);
buf_page_t *bpage= page_hash_get_low(page_id, fold);
if (!bpage || watch_is_sentinel(*bpage))
{
if (exclusive)
rw_lock_x_unlock(latch);
else
rw_lock_s_unlock(latch);
latch->release<exclusive>();
if (hash_lock)
*hash_lock= nullptr;
return watch ? bpage : nullptr;
@ -1697,10 +1651,8 @@ private:
if (hash_lock)
*hash_lock= latch; /* to be released by the caller */
else if (exclusive)
rw_lock_x_unlock(latch);
else
rw_lock_s_unlock(latch);
latch->release<exclusive>();
return bpage;
}
public:
@ -1713,7 +1665,7 @@ public:
@retval nullptr if no block was found; !lock || !*lock will also hold */
template<bool exclusive>
buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
rw_lock_t **hash_lock)
page_hash_latch **hash_lock)
{ return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }
/** @return whether the buffer pool contains a page
@ -1730,9 +1682,7 @@ public:
@return whether bpage a sentinel for a buffer pool watch */
bool watch_is_sentinel(const buf_page_t &bpage)
{
ut_ad(mutex_own(&mutex) ||
rw_lock_own_flagged(hash_lock_get(bpage.id()),
RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
ut_ad(mutex_own(&mutex) || hash_lock_get(bpage.id())->is_locked());
ut_ad(bpage.in_file());
if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
@ -1754,11 +1704,11 @@ public:
bool watch_occurred(const page_id_t id)
{
const ulint fold= id.fold();
rw_lock_t *hash_lock= page_hash_lock<false>(fold);
page_hash_latch *hash_lock= page_hash.lock<false>(fold);
/* The page must exist because watch_set() increments buf_fix_count. */
buf_page_t *bpage= page_hash_get_low(id, fold);
const bool is_sentinel= watch_is_sentinel(*bpage);
rw_lock_s_unlock(hash_lock);
hash_lock->read_unlock();
return !is_sentinel;
}
@ -1769,7 +1719,8 @@ public:
@param hash_lock exclusively held page_hash latch
@return a buffer pool block corresponding to id
@retval nullptr if the block was not present, and a watch was installed */
inline buf_page_t *watch_set(const page_id_t id, rw_lock_t **hash_lock);
inline buf_page_t *watch_set(const page_id_t id,
page_hash_latch **hash_lock);
/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
@ -1777,7 +1728,7 @@ public:
void watch_unset(const page_id_t id)
{
const ulint fold= id.fold();
rw_lock_t *hash_lock= page_hash_lock<true>(fold);
page_hash_latch *hash_lock= page_hash.lock<true>(fold);
/* The page must exist because watch_set() increments buf_fix_count. */
buf_page_t *watch= page_hash_get_low(id, fold);
if (watch->unfix() == 0 && watch_is_sentinel(*watch))
@ -1786,7 +1737,7 @@ public:
ut_ad(watch->in_page_hash);
ut_d(watch->in_page_hash= false);
HASH_DELETE(buf_page_t, hash, &page_hash, fold, watch);
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
// Now that the watch is detached from page_hash, release it to watch[].
mutex_enter(&mutex);
/* It is possible that watch_remove() already removed the watch. */
@ -1799,7 +1750,7 @@ public:
mutex_exit(&mutex);
}
else
rw_lock_x_unlock(hash_lock);
hash_lock->write_unlock();
}
/** Remove the sentinel block for the watch before replacing it with a
@ -1872,11 +1823,92 @@ public:
/** read-ahead request size in pages */
Atomic_counter<uint32_t> read_ahead_area;
/** Hash table with singly-linked overflow lists. @see hash_table_t */
struct page_hash_table
{
/** Number of array[] elements per page_hash_latch.
Must be one less than a power of 2. */
static constexpr size_t ELEMENTS_PER_LATCH= 1023;
/** number of payload elements in array[] */
Atomic_relaxed<ulint> n_cells;
/** the hash array, with pad(n_cells) elements */
hash_cell_t *array;
/** Create the hash table.
@param n the lower bound of n_cells */
void create(ulint n);
/** Free the hash table. */
void free() { ut_free(array); array= nullptr; }
/** @return the index of an array element */
ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
/** @return raw array index converted to padded index */
static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; }
private:
/** @return the hash value before any ELEMENTS_PER_LATCH padding */
static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
/** @return the index of an array element */
static ulint calc_hash(ulint fold, ulint n_cells)
{
return pad(hash(fold, n_cells));
}
/** Get a page_hash latch. */
page_hash_latch *lock_get(ulint fold, ulint n) const
{
static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
"must be one less than a power of 2");
return reinterpret_cast<page_hash_latch*>
(&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
}
public:
/** Get a page_hash latch. */
page_hash_latch *lock_get(ulint fold) const
{ return lock_get(fold, n_cells); }
/** Acquire an array latch, tolerating concurrent buf_pool_t::resize()
@tparam exclusive whether the latch is to be acquired exclusively
@param fold hash bucket key */
template<bool exclusive> page_hash_latch *lock(ulint fold)
{
for (;;)
{
auto n= n_cells;
page_hash_latch *latch= lock_get(fold, n);
latch->acquire<exclusive>();
/* Our latch prevents n_cells from changing. */
if (UNIV_LIKELY(n == n_cells))
return latch;
/* Retry, because buf_pool_t::resize_hash() affected us. */
latch->release<exclusive>();
}
}
/** Exclusively acquire all latches */
inline void write_lock_all();
/** Release all latches */
inline void write_unlock_all();
};
private:
/** Former page_hash that has been deleted during resize();
singly-linked list via freed_page_hash->array[1] */
page_hash_table *freed_page_hash;
/** Lock all page_hash, also freed_page_hash. */
inline void write_lock_all_page_hash();
/** Release all page_hash, also freed_page_hash. */
inline void write_unlock_all_page_hash();
/** Resize page_hash and zip_hash. */
inline void resize_hash();
public:
/** Hash table of file pages (buf_page_t::in_file() holds),
indexed by page_id_t. Protected by both mutex and page_hash_latches[]. */
hash_table_t page_hash;
/** Latches protecting page_hash */
mutable rw_lock_t page_hash_latches[MAX_PAGE_HASH_LOCKS];
indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). */
page_hash_table page_hash;
/** map of block->frame to buf_block_t blocks that belong
to buf_buddy_alloc(); protected by buf_pool.mutex */
@ -2103,6 +2135,19 @@ private:
/** The InnoDB buffer pool */
extern buf_pool_t buf_pool;
inline void page_hash_latch::read_lock()
{
ut_ad(!mutex_own(&buf_pool.mutex));
if (!read_trylock())
read_lock_wait();
}
inline void page_hash_latch::write_lock()
{
if (!write_trylock())
write_lock_wait();
}
inline void buf_page_t::add_buf_fix_count(uint32_t count)
{
ut_ad(mutex_own(&buf_pool.mutex));
@ -2129,15 +2174,15 @@ inline void buf_page_t::set_state(buf_page_state state)
if (!in_file()) break;
/* fall through */
case BUF_BLOCK_FILE_PAGE:
ut_ad(rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X));
ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
break;
case BUF_BLOCK_NOT_USED:
if (!in_file()) break;
/* fall through */
case BUF_BLOCK_ZIP_PAGE:
ut_ad((this >= &buf_pool.watch[0] &&
this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]) ||
rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X));
ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
(this >= &buf_pool.watch[0] &&
this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
break;
}
#endif
@ -2159,7 +2204,7 @@ inline void buf_page_t::set_corrupt_id()
break;
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_FILE_PAGE:
ut_ad(rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X));
ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
break;
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_MEMORY:

View file

@ -153,7 +153,7 @@ buf_LRU_stat_update();
@param id page identifier
@param hash_lock buf_pool.page_hash latch (will be released here) */
void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
rw_lock_t *hash_lock)
page_hash_latch *hash_lock)
MY_ATTRIBUTE((nonnull));
#ifdef UNIV_DEBUG

View file

@ -192,10 +192,43 @@ extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX];
#include "ut0mutex.h"
#include "sync0rw.h"
#include "rw_lock.h"
typedef ib_mutex_t BufPoolMutex;
typedef ib_mutex_t FlushListMutex;
typedef rw_lock_t BPageLock;
class page_hash_latch : public rw_lock
{
public:
/** Wait for a shared lock */
void read_lock_wait();
/** Wait for an exclusive lock */
void write_lock_wait();
/** Acquire a shared lock */
inline void read_lock();
/** Acquire an exclusive lock */
inline void write_lock();
/** Acquire a lock */
template<bool exclusive> void acquire()
{
if (exclusive)
write_lock();
else
read_lock();
}
/** Release a lock */
template<bool exclusive> void release()
{
if (exclusive)
write_unlock();
else
read_unlock();
}
};
#endif /* !UNIV_INNOCHECKSUM */
#endif /* buf0types.h */

View file

@ -33,8 +33,6 @@ struct hash_cell_t{
};
typedef void* hash_node_t;
#define hash_calc_hash(FOLD, TABLE) (TABLE)->calc_hash(FOLD)
/*******************************************************************//**
Inserts a struct to a hash table. */
@ -145,7 +143,7 @@ Gets the next struct in a hash chain, NULL if none. */
Looks for a struct in a hash table. */
#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
{\
(DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
(DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \
HASH_ASSERT_VALID(DATA);\
\
while ((DATA) != NULL) {\

View file

@ -0,0 +1,106 @@
/*****************************************************************************
Copyright (c) 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*****************************************************************************/
#pragma once
#include <atomic>
#include "my_dbug.h"
/** Simple read-write lock based on std::atomic */
class rw_lock
{
/** The lock word */
std::atomic<uint32_t> lock;
protected:
/** Available lock */
static constexpr uint32_t UNLOCKED= 0;
/** Flag to indicate that write_lock() is being held */
static constexpr uint32_t WRITER= 1 << 31;
/** Flag to indicate that write_lock_wait() is pending */
static constexpr uint32_t WRITER_WAITING= 1 << 30;
/** Flag to indicate that write_lock() or write_lock_wait() is pending */
static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
/** Yield a read lock request due to a conflict with a write lock.
@return the lock value */
uint32_t read_lock_yield()
{
uint32_t l= lock.fetch_sub(1, std::memory_order_relaxed);
DBUG_ASSERT(l & ~WRITER_PENDING);
return l;
}
/** Start waiting for an exclusive lock. */
void write_lock_wait_start()
{ lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
/** Wait for an exclusive lock.
@return whether the exclusive lock was acquired */
bool write_lock_poll()
{
auto l= WRITER_WAITING;
if (lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
std::memory_order_relaxed))
return true;
if (!(l & WRITER_WAITING))
/* write_lock() must have succeeded for another thread */
write_lock_wait_start();
return false;
}
public:
/** Default constructor */
rw_lock() : lock(UNLOCKED) {}
/** Release a shared lock */
void read_unlock()
{
IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(1, std::memory_order_release);
DBUG_ASSERT(l & ~WRITER_PENDING); /* at least one read lock */
DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
}
/** Release an exclusive lock */
void write_unlock()
{
IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
DBUG_ASSERT(l & WRITER); /* the write lock must have existed */
}
/** Try to acquire a shared lock.
@return whether the lock was acquired */
bool read_trylock()
{ return !(lock.fetch_add(1, std::memory_order_acquire) & WRITER_PENDING); }
/** Try to acquire an exclusive lock.
@return whether the lock was acquired */
bool write_trylock()
{
auto l= UNLOCKED;
return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
std::memory_order_relaxed);
}
/** @return whether an exclusive lock is being held by any thread */
bool is_write_locked() const
{ return !!(lock.load(std::memory_order_relaxed) & WRITER); }
/** @return whether a shared lock is being held by any thread */
bool is_read_locked() const
{
auto l= lock.load(std::memory_order_relaxed);
return (l & ~WRITER_PENDING) && !(l & WRITER);
}
/** @return whether any lock is being held by any thread */
bool is_locked() const
{ return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; }
};

View file

@ -333,8 +333,6 @@ extern const ulint srv_buf_pool_min_size;
extern const ulint srv_buf_pool_def_size;
/** Requested buffer pool chunk size */
extern ulong srv_buf_pool_chunk_unit;
/** Number of locks to protect buf_pool.page_hash */
extern ulong srv_n_page_hash_locks;
/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/
extern ulong srv_LRU_scan_depth;
/** Whether or not to flush neighbors of a block */

View file

@ -226,22 +226,8 @@ rw_lock_lock_word_decr(
caused by concurrent executions of
rw_lock_s_lock(). */
#if 1 /* FIXME: MDEV-22871 Spurious contention between rw_lock_s_lock() */
/* When the number of concurrently executing threads
exceeds the number of available processor cores,
multiple buf_pool.page_hash S-latch requests would
conflict here, mostly in buf_page_get_low(). We should
implement a simpler rw-lock where the S-latch
acquisition would be a simple fetch_add(1) followed by
either an optional load() loop to wait for the X-latch
to be released, or a fetch_sub(1) and a retry.
For now, we work around the problem with a delay in
this loop. It helped a little on some systems and was
reducing performance on others. */
(void) LF_BACKOFF();
#endif
/* Note: unlike this implementation, rw_lock::read_lock()
allows concurrent calls without a spin loop */
}
/* A real conflict was detected. */

View file

@ -126,7 +126,6 @@ extern mysql_pfs_key_t index_tree_rw_lock_key;
extern mysql_pfs_key_t index_online_log_key;
extern mysql_pfs_key_t dict_table_stats_key;
extern mysql_pfs_key_t trx_sys_rw_lock_key;
extern mysql_pfs_key_t hash_table_locks_key;
#endif /* UNIV_PFS_RWLOCK */
/** Prints info of the sync system.

View file

@ -207,9 +207,6 @@ const ulint srv_buf_pool_min_size = 5 * 1024 * 1024;
const ulint srv_buf_pool_def_size = 128 * 1024 * 1024;
/** Requested buffer pool chunk size */
ulong srv_buf_pool_chunk_unit;
/** innodb_page_hash_locks (a debug-only parameter);
number of locks to protect buf_pool.page_hash */
ulong srv_n_page_hash_locks = 64;
/** innodb_lru_scan_depth; number of blocks scanned in LRU flush batch */
ulong srv_LRU_scan_depth;
/** innodb_flush_neighbors; whether or not to flush neighbors of a block */

View file

@ -777,7 +777,7 @@ LatchDebug::check_order(
case SYNC_POOL:
case SYNC_POOL_MANAGER:
case SYNC_RECV_WRITER:
case SYNC_BUF_PAGE_HASH:
basic_check(latches, level, level);
break;
@ -825,14 +825,6 @@ LatchDebug::check_order(
basic_check(latches, level, level - 1);
break;
case SYNC_BUF_PAGE_HASH:
/* Multiple page_hash locks are only allowed during
buf_pool.validate() and that is where buf_pool mutex is already
held. */
/* Fall through */
case SYNC_REC_LOCK:
if (find(latches, SYNC_LOCK_SYS) != 0) {
@ -1453,9 +1445,6 @@ sync_latch_meta_init()
LATCH_ADD_RWLOCK(DICT_TABLE_STATS, SYNC_INDEX_TREE,
dict_table_stats_key);
LATCH_ADD_RWLOCK(HASH_TABLE_RW_LOCK, SYNC_BUF_PAGE_HASH,
hash_table_locks_key);
LATCH_ADD_MUTEX(SYNC_DEBUG_MUTEX, SYNC_NO_ORDER_CHECK,
PFS_NOT_INSTRUMENTED);

View file

@ -102,7 +102,6 @@ mysql_pfs_key_t buf_block_debug_latch_key;
# endif /* UNIV_DEBUG */
mysql_pfs_key_t dict_operation_lock_key;
mysql_pfs_key_t dict_table_stats_key;
mysql_pfs_key_t hash_table_locks_key;
mysql_pfs_key_t index_tree_rw_lock_key;
mysql_pfs_key_t index_online_log_key;
mysql_pfs_key_t fil_space_latch_key;