mariadb/storage/innobase/btr/btr0sea.cc
2025-02-11 20:29:43 +01:00

2090 lines
59 KiB
C++

/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*****************************************************************************/
/********************************************************************//**
@file btr/btr0sea.cc
The index tree adaptive search
Created 2/17/1996 Heikki Tuuri
*************************************************************************/
#include "btr0sea.h"
#ifdef BTR_CUR_HASH_ADAPT
#include "buf0buf.h"
#include "buf0lru.h"
#include "page0page.h"
#include "page0cur.h"
#include "btr0cur.h"
#include "btr0pcur.h"
#include "btr0btr.h"
#include "srv0mon.h"
#include "log.h"
#ifdef UNIV_SEARCH_PERF_STAT
/** Number of successful adaptive hash index lookups */
ulint btr_search_n_succ;
/** Number of failed adaptive hash index lookups */
ulint btr_search_n_hash_fail;
#endif /* UNIV_SEARCH_PERF_STAT */
#ifdef UNIV_PFS_RWLOCK
mysql_pfs_key_t btr_search_latch_key;
#endif /* UNIV_PFS_RWLOCK */
/** The adaptive hash index */
btr_sea btr_search;
struct ahi_node {
/** CRC-32C of the rec prefix */
uint32_t fold;
/** pointer to next record in the hash bucket chain, or nullptr */
ahi_node *next;
/** B-tree index leaf page record */
const rec_t *rec;
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
/** block containing rec, or nullptr */
buf_block_t *block;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
};
inline void btr_sea::partition::init() noexcept
{
latch.SRW_LOCK_INIT(btr_search_latch_key);
blocks_mutex.init();
UT_LIST_INIT(blocks, &buf_page_t::list);
}
inline void btr_sea::partition::clear() noexcept
{
#ifndef SUX_LOCK_GENERIC
ut_ad(latch.is_write_locked());
ut_ad(blocks_mutex.is_locked());
#endif
if (buf_block_t *b= spare)
{
spare= nullptr;
MEM_MAKE_ADDRESSABLE(b->page.frame, srv_page_size);
buf_pool.free_block(b);
}
table.free();
while (buf_page_t *b= UT_LIST_GET_FIRST(blocks))
{
UT_LIST_REMOVE(blocks, b);
ut_ad(b->free_offset);
b->hash= nullptr;
MEM_MAKE_ADDRESSABLE(b->frame, srv_page_size);
buf_pool.free_block(reinterpret_cast<buf_block_t*>(b));
}
}
inline void btr_sea::partition::free() noexcept
{
if (table.array)
{
ut_d(latch.wr_lock(SRW_LOCK_CALL));
ut_d(blocks_mutex.wr_lock());
clear();
ut_d(blocks_mutex.wr_unlock());
ut_d(latch.wr_unlock());
}
latch.destroy();
blocks_mutex.destroy();
}
inline void btr_sea::partition::alloc(ulint hash_size) noexcept
{
table.create(hash_size);
}
void btr_sea::create() noexcept
{
for (partition &part : parts)
part.init();
if (enabled)
enable();
}
void btr_sea::alloc(ulint hash_size) noexcept
{
hash_size/= n_parts;
for (ulong i= 0; i < n_parts; ++i)
parts[i].alloc(hash_size);
}
inline void btr_sea::clear() noexcept
{
for (ulong i= 0; i < n_parts; ++i)
parts[i].clear();
}
void btr_sea::free() noexcept
{
for (partition &part : parts)
part.free();
}
/** If the number of records on the page divided by this parameter
would have been successfully accessed using a hash index, the index
is then built on the page, assuming the global limit has been reached */
static constexpr unsigned BTR_SEARCH_PAGE_BUILD_LIMIT= 16;
/** The global limit for consecutive potentially successful hash searches,
before hash index building is started */
static constexpr uint8_t BTR_SEARCH_BUILD_LIMIT= 100;
/** Determine the number of accessed key fields.
@param n_bytes_fields number of complete fields | incomplete_bytes << 16
@return number of complete or incomplete fields */
inline size_t btr_search_get_n_fields(ulint n_bytes_fields) noexcept
{
return uint16_t(n_bytes_fields) + (n_bytes_fields >= 1U << 16);
}
/** Determine the number of accessed key fields.
@param cursor b-tree cursor
@return number of complete or incomplete fields */
inline size_t btr_search_get_n_fields(const btr_cur_t *cursor) noexcept
{
return btr_search_get_n_fields(cursor->n_bytes_fields);
}
/** Compute a hash value of a record in a page.
@tparam comp whether ROW_FORMAT=REDUNDANT is not being used
@param rec index record
@param index index tree
@param n_bytes_fields bytes << 16 | number of complete fields
@return CRC-32C of the record prefix */
template<bool comp>
static uint32_t rec_fold(const rec_t *rec, const dict_index_t &index,
uint32_t n_bytes_fields) noexcept
{
ut_ad(page_rec_is_leaf(rec));
ut_ad(page_rec_is_user_rec(rec));
ut_ad(!rec_is_metadata(rec, index));
ut_ad(index.n_uniq <= index.n_core_fields);
size_t n_f= btr_search_get_n_fields(n_bytes_fields);
ut_ad(n_f > 0);
ut_ad(n_f <= index.n_core_fields);
ut_ad(comp == index.table->not_redundant());
size_t n;
if (comp)
{
const byte *nulls= rec - REC_N_NEW_EXTRA_BYTES;
const byte *lens;
if (rec_get_status(rec) == REC_STATUS_INSTANT)
{
ulint n_fields= index.n_core_fields + rec_get_n_add_field(nulls) + 1;
ut_ad(n_fields <= index.n_fields);
const ulint n_nullable= index.get_n_nullable(n_fields);
ut_ad(n_nullable <= index.n_nullable);
lens= --nulls - UT_BITS_IN_BYTES(n_nullable);
}
else
lens= --nulls - index.n_core_null_bytes;
byte null_mask= 1;
n= 0;
const dict_field_t *field= index.fields;
size_t len;
do
{
const dict_col_t *col= field->col;
if (col->is_nullable())
{
const int is_null{*nulls & null_mask};
#if defined __GNUC__ && !defined __clang__
# pragma GCC diagnostic push
# if __GNUC__ < 12 || defined WITH_UBSAN
# pragma GCC diagnostic ignored "-Wconversion"
# endif
#endif
null_mask<<= 1;
#if defined __GNUC__ && !defined __clang__
# pragma GCC diagnostic pop
#endif
if (UNIV_UNLIKELY(!null_mask))
null_mask= 1, nulls--;
if (is_null)
{
len= 0;
continue;
}
}
len= field->fixed_len;
if (!len)
{
len= *lens--;
if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col))
{
len<<= 8;
len|= *lens--;
ut_ad(!(len & 0x4000));
len&= 0x3fff;
}
}
n+= len;
}
while (field++, --n_f);
if (size_t n_bytes= n_bytes_fields >> 16)
n+= std::min(n_bytes, len) - len;
}
else
{
const size_t n_bytes= n_bytes_fields >> 16;
ut_ad(n_f <= rec_get_n_fields_old(rec));
if (rec_get_1byte_offs_flag(rec))
{
n= rec_1_get_field_end_info(rec, n_f - 1) & ~REC_1BYTE_SQL_NULL_MASK;
if (!n_bytes);
else if (!uint16_t(n_bytes_fields))
n= std::min(n_bytes, n);
else
{
size_t len= n - (rec_1_get_field_end_info(rec, n_f - 2) &
~REC_1BYTE_SQL_NULL_MASK);
n+= std::min(n_bytes, n - len) - len;
}
}
else
{
n= rec_2_get_field_end_info(rec, n_f - 1) & ~REC_2BYTE_SQL_NULL_MASK;
ut_ad(n < REC_2BYTE_EXTERN_MASK); /* keys never are BLOBs */
if (!n_bytes);
else if (!uint16_t(n_bytes_fields))
n= std::min(n_bytes, n);
else
{
size_t len= n - (rec_2_get_field_end_info(rec, n_f - 2) &
~REC_2BYTE_SQL_NULL_MASK);
n+= std::min(n_bytes, n - len) - len;
}
}
}
return my_crc32c(uint32_t(ut_fold_ull(index.id)), rec, n);
}
static uint32_t rec_fold(const rec_t *rec, const dict_index_t &index,
uint32_t n_bytes_fields, ulint comp) noexcept
{
return comp
? rec_fold<true>(rec, index, n_bytes_fields)
: rec_fold<false>(rec, index, n_bytes_fields);
}
void btr_sea::partition::prepare_insert() noexcept
{
/* spare may be consumed by insert() or clear() */
if (!spare)
{
buf_block_t *block= buf_block_alloc();
blocks_mutex.wr_lock();
if (!spare && btr_search.enabled)
{
MEM_NOACCESS(block->page.frame, srv_page_size);
spare= block;
block= nullptr;
}
blocks_mutex.wr_unlock();
if (block)
buf_pool.free_block(block);
}
}
/** Clear the AHI reference count on all tables.
@param tables list of tables */
static void
btr_search_disable(const UT_LIST_BASE_NODE_T(dict_table_t)& tables) noexcept
{
for (dict_table_t *table= UT_LIST_GET_FIRST(dict_sys.table_LRU); table;
table = UT_LIST_GET_NEXT(table_LRU, table))
for (dict_index_t *index= dict_table_get_first_index(table); index;
index= dict_table_get_next_index(index))
index->search_info.ref_count= 0;
}
/** Lazily free detached metadata when removing the last reference. */
ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index)
{
ut_ad(index->freed());
dict_table_t *table= index->table;
table->autoinc_mutex.wr_lock();
/* Perform the skipped steps of dict_index_remove_from_cache_low(). */
UT_LIST_REMOVE(table->freed_indexes, index);
index->lock.free();
dict_mem_index_free(index);
if (!UT_LIST_GET_LEN(table->freed_indexes) &&
!UT_LIST_GET_LEN(table->indexes))
{
ut_ad(!table->id);
table->autoinc_mutex.wr_unlock();
table->autoinc_mutex.destroy();
dict_mem_table_free(table);
return;
}
table->autoinc_mutex.wr_unlock();
}
/** Disable the adaptive hash search system and empty the index. */
void btr_sea::disable() noexcept
{
dict_sys.freeze(SRW_LOCK_CALL);
for (ulong i= 0; i < n_parts; i++)
{
parts[i].latch.wr_lock(SRW_LOCK_CALL);
parts[i].blocks_mutex.wr_lock();
}
if (enabled)
{
enabled= false;
btr_search_disable(dict_sys.table_LRU);
btr_search_disable(dict_sys.table_non_LRU);
dict_sys.unfreeze();
/* Set all block->index= nullptr. */
buf_pool.clear_hash_index();
clear();
}
else
dict_sys.unfreeze();
for (ulong i= 0; i < n_parts; i++)
{
parts[i].latch.wr_unlock();
parts[i].blocks_mutex.wr_unlock();
}
}
/** Enable the adaptive hash search system.
@param resize whether buf_pool_t::resize() is the caller */
void btr_sea::enable(bool resize) noexcept
{
if (!resize)
{
mysql_mutex_lock(&buf_pool.mutex);
bool changed= srv_buf_pool_old_size != srv_buf_pool_size;
mysql_mutex_unlock(&buf_pool.mutex);
if (changed)
return;
}
for (ulong i= 0; i < n_parts; i++)
{
parts[i].latch.wr_lock(SRW_LOCK_CALL);
parts[i].blocks_mutex.wr_lock();
}
if (!parts[0].table.array)
{
enabled= true;
alloc(buf_pool_get_curr_size() / sizeof(void *) / 64);
}
ut_ad(enabled);
for (ulong i= 0; i < n_parts; i++)
{
parts[i].blocks_mutex.wr_unlock();
parts[i].latch.wr_unlock();
}
}
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
# define ha_insert_for_fold(p,f,b,d) (p).insert(f,d,b)
#else
# define ha_insert_for_fold(p,f,b,d) (p).insert(f,d)
#endif
ATTRIBUTE_NOINLINE
/** Update a hash node reference when it has been unsuccessfully used in a
search which could have succeeded with the used hash parameters. This can
happen because when building a hash index for a page, we do not check
what happens at page boundaries, and therefore there can be misleading
hash nodes. Also, collisions in the fold value can lead to misleading
references. This function lazily fixes these imperfections in the hash
index.
@param cursor B-tree cursor
@param block cursor block
@param left_bytes_fields AHI paramaters */
static void btr_search_update_hash_ref(const btr_cur_t &cursor,
buf_block_t *block,
uint32_t left_bytes_fields) noexcept
{
ut_ad(block == cursor.page_cur.block);
#ifdef UNIV_SEARCH_PERF_STAT
btr_search_n_hash_fail++;
#endif /* UNIV_SEARCH_PERF_STAT */
dict_index_t *const index= cursor.index();
ut_ad(block->page.id().space() == index->table->space_id);
btr_sea::partition &part= btr_search.get_part(index->id);
part.prepare_insert();
part.latch.wr_lock(SRW_LOCK_CALL);
if (ut_d(const dict_index_t *block_index=) block->index)
{
ut_ad(block_index == index);
ut_ad(btr_search.enabled);
uint32_t bytes_fields{block->ahi_left_bytes_fields};
if (bytes_fields != left_bytes_fields)
goto skip;
if (UNIV_UNLIKELY(index->search_info.left_bytes_fields !=
left_bytes_fields))
goto skip;
bytes_fields&= ~buf_block_t::LEFT_SIDE;
const rec_t *rec= cursor.page_cur.rec;
uint32_t fold;
if (page_is_comp(block->page.frame))
{
switch (rec - block->page.frame) {
case PAGE_NEW_INFIMUM:
case PAGE_NEW_SUPREMUM:
goto skip;
default:
fold= rec_fold<true>(rec, *index, bytes_fields);
}
}
else
{
switch (rec - block->page.frame) {
case PAGE_OLD_INFIMUM:
case PAGE_OLD_SUPREMUM:
goto skip;
default:
fold= rec_fold<false>(rec, *index, bytes_fields);
}
}
ha_insert_for_fold(part, fold, block, rec);
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
}
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
else
ut_a(!block->n_pointers);
# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
skip:
part.latch.wr_unlock();
}
/** Updates the search info of an index about hash successes.
@param cursor freshly positioned cursor
@return AHI parameters
@retval 0 if the adaptive hash index should not be rebuilt */
static uint32_t btr_search_info_update_hash(const btr_cur_t &cursor) noexcept
{
ut_ad(cursor.flag == BTR_CUR_HASH_FAIL ||
cursor.flag == BTR_CUR_HASH_ABORT ||
cursor.flag == BTR_CUR_BINARY);
dict_index_t *const index= cursor.index();
buf_block_t *const block= cursor.page_cur.block;
ut_ad(block->page.lock.have_any());
ut_d(const uint32_t state= block->page.state());
ut_ad(state >= buf_page_t::UNFIXED);
ut_ad(!block->page.is_read_fixed(state));
switch (uintptr_t(btr_cur_get_rec(&cursor) - block->page.frame)) {
case PAGE_OLD_INFIMUM:
case PAGE_OLD_SUPREMUM:
case PAGE_NEW_INFIMUM:
case PAGE_NEW_SUPREMUM:
/* The adaptive hash index only includes user records. */
return 0;
}
const dict_index_t *const block_index= block->index;
uint16_t n_hash_helps{block->n_hash_helps};
const uint16_t n_uniq=
uint16_t(index->n_uniq ? index->n_uniq : index->n_fields);
dict_index_t::ahi &info= index->search_info;
uint32_t left_bytes_fields{info.left_bytes_fields};
uint8_t n_hash_potential= info.n_hash_potential;
uint32_t ret;
if (!n_hash_potential)
{
info.left_bytes_fields= left_bytes_fields= buf_block_t::LEFT_SIDE | 1;
info.hash_analysis_reset();
increment_potential:
if (n_hash_potential < BTR_SEARCH_BUILD_LIMIT)
info.n_hash_potential= ++n_hash_potential;
if (n_hash_helps)
goto got_help;
goto no_help;
}
else if (uint16_t(left_bytes_fields) >= n_uniq && cursor.up_match >= n_uniq)
/* The search would have succeeded using the recommended prefix */
goto increment_potential;
else
{
const bool left_side{!!(left_bytes_fields & buf_block_t::LEFT_SIDE)};
const int info_cmp=
int(uint16_t((left_bytes_fields & ~buf_block_t::LEFT_SIDE) >> 16) |
int{uint16_t(left_bytes_fields)} << 16);
const int low_cmp = int(cursor.low_match << 16 | cursor.low_bytes);
const int up_cmp = int(cursor.up_match << 16 | cursor.up_bytes);
if (left_side == (info_cmp > low_cmp) && left_side == (info_cmp <= up_cmp))
goto increment_potential;
const int cmp= up_cmp - low_cmp;
static_assert(buf_block_t::LEFT_SIDE == 1U << 31, "");
left_bytes_fields= (cmp >= 0) << 31;
if (left_bytes_fields)
{
if (cursor.up_match >= n_uniq)
left_bytes_fields|= n_uniq;
else if (cursor.low_match < cursor.up_match)
left_bytes_fields|= uint32_t(cursor.low_match + 1);
else
{
left_bytes_fields|= cursor.low_match;
left_bytes_fields|= uint32_t(cursor.low_bytes + 1) << 16;
}
}
else
{
if (cursor.low_match >= n_uniq)
left_bytes_fields|= n_uniq;
else if (cursor.low_match > cursor.up_match)
left_bytes_fields|= uint32_t(cursor.up_match + 1);
else
{
left_bytes_fields|= cursor.up_match;
left_bytes_fields|= uint32_t(cursor.up_bytes + 1) << 16;
}
}
/* We have to set a new recommendation; skip the hash analysis for a
while to avoid unnecessary CPU time usage when there is no chance
for success */
info.hash_analysis_reset();
info.left_bytes_fields= left_bytes_fields;
info.n_hash_potential= cmp != 0;
if (cmp == 0)
goto no_help;
}
ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
ut_ad(btr_cur_get_page(&cursor) == page_align(btr_cur_get_rec(&cursor)));
ut_ad(page_is_leaf(btr_cur_get_page(&cursor)));
if (!n_hash_helps)
{
no_help:
info.last_hash_succ= false;
block->n_hash_helps= 1;
ret= 0;
}
else
{
got_help:
const uint32_t ahi_left_bytes_fields= block->ahi_left_bytes_fields;
ret= left_bytes_fields;
info.last_hash_succ=
block_index && ahi_left_bytes_fields == left_bytes_fields;
if (n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)
{
const auto n_recs= page_get_n_recs(block->page.frame);
if (n_hash_helps / 2 > n_recs)
goto func_exit;
if (n_hash_helps >= n_recs / BTR_SEARCH_PAGE_BUILD_LIMIT &&
(!block_index || left_bytes_fields != ahi_left_bytes_fields))
goto func_exit;
}
if (++n_hash_helps)
block->n_hash_helps= n_hash_helps;
ret= 0;
}
func_exit:
if (!block_index);
else if (UNIV_UNLIKELY(block_index != index))
{
ut_ad(block_index->id == index->id);
btr_search_drop_page_hash_index(block, nullptr);
}
else if (cursor.flag == BTR_CUR_HASH_FAIL)
btr_search_update_hash_ref(cursor, block, left_bytes_fields);
return ret;
}
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
/** Maximum number of records in a page */
constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
void btr_sea::partition::insert(uint32 fold, const rec_t *rec,
buf_block_t *block) noexcept
#else
void btr_sea::partition::insert(uint32_t fold, const rec_t *rec) noexcept
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
{
#ifndef SUX_LOCK_GENERIC
ut_ad(latch.is_write_locked());
#endif
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(block->page.frame == page_align(rec));
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
ut_ad(btr_search.enabled);
ahi_node **prev= table.cell_get(fold)->
search(&ahi_node::next, [fold](const ahi_node *node)
{ return !node || node->fold == fold; });
ahi_node *node= *prev;
if (node)
{
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
buf_block_t *prev_block= node->block;
if (prev_block != block)
{
ut_a(prev_block->page.frame == page_align(node->rec));
ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
ut_a(block->n_pointers++ < MAX_N_POINTERS);
node->block= block;
}
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
node->rec= rec;
return;
}
/* We have to allocate a new chain node */
{
blocks_mutex.wr_lock();
buf_page_t *last= UT_LIST_GET_LAST(blocks);
if (last && last->free_offset < srv_page_size - sizeof *node)
{
node= reinterpret_cast<ahi_node*>(last->frame + last->free_offset);
#if defined __GNUC__ && !defined __clang__
# pragma GCC diagnostic push
# if __GNUC__ < 12 || defined WITH_UBSAN
# pragma GCC diagnostic ignored "-Wconversion"
# endif
#endif
last->free_offset+= sizeof *node;
#if defined __GNUC__ && !defined __clang__
# pragma GCC diagnostic pop
#endif
MEM_MAKE_ADDRESSABLE(node, sizeof *node);
}
else
{
last= &spare.load()->page;
if (!last)
{
blocks_mutex.wr_unlock();
return;
}
spare= nullptr;
ut_ad(last->state() == buf_page_t::MEMORY);
ut_ad(!reinterpret_cast<buf_block_t*>(last)->index);
ut_ad(!reinterpret_cast<buf_block_t*>(last)->n_pointers);
UT_LIST_ADD_LAST(blocks, last);
last->free_offset= sizeof *node;
node= reinterpret_cast<ahi_node*>(last->frame);
MEM_UNDEFINED(last->frame, srv_page_size);
MEM_MAKE_ADDRESSABLE(node, sizeof *node);
MEM_NOACCESS(node + 1, srv_page_size - sizeof *node);
}
blocks_mutex.wr_unlock();
}
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(block->n_pointers++ < MAX_N_POINTERS);
node->block= block;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
node->rec= rec;
node->fold= fold;
node->next= nullptr;
*prev= node;
}
buf_block_t *btr_sea::partition::cleanup_after_erase(ahi_node *erase) noexcept
{
ut_ad(btr_search.enabled);
#ifndef SUX_LOCK_GENERIC
ut_ad(latch.is_write_locked());
#endif
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(erase->block->page.frame == page_align(erase->rec));
ut_a(erase->block->n_pointers-- < MAX_N_POINTERS);
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
blocks_mutex.wr_lock();
buf_page_t *last= UT_LIST_GET_LAST(blocks);
const ahi_node *const top= reinterpret_cast<ahi_node*>
(last->frame + last->free_offset - sizeof *top);
if (erase != top)
{
/* Shrink the allocation by replacing the erased element with the top. */
*erase= *top;
ahi_node **prev= table.cell_get(top->fold)->
search(&ahi_node::next, [top](const ahi_node *n) { return n == top; });
*prev= erase;
}
buf_block_t *freed= nullptr;
#if defined __GNUC__ && !defined __clang__
# pragma GCC diagnostic push
# if __GNUC__ < 12 || defined WITH_UBSAN
# pragma GCC diagnostic ignored "-Wconversion"
# endif
#endif
/* We may be able to shrink or free the last block */
if (!(last->free_offset-= uint16_t(sizeof *erase)))
#if defined __GNUC__ && !defined __clang__
# pragma GCC diagnostic pop
#endif
{
if (spare)
{
freed= reinterpret_cast<buf_block_t*>(last);
MEM_MAKE_ADDRESSABLE(last->frame, srv_page_size);
}
else
spare= reinterpret_cast<buf_block_t*>(last);
UT_LIST_REMOVE(blocks, last);
}
else
MEM_NOACCESS(last->frame + last->free_offset, sizeof *erase);
blocks_mutex.wr_unlock();
return freed;
}
__attribute__((nonnull))
/** Delete all pointers to a page.
@param part hash table partition
@param fold CRC-32C value
@param page page of a record to be deleted */
static void ha_remove_all_nodes_to_page(btr_sea::partition &part,
uint32_t fold, const page_t *page)
noexcept
{
hash_cell_t *cell= part.table.cell_get(fold);
const uintptr_t page_size{srv_page_size};
rewind:
ahi_node **prev=
cell->search(&ahi_node::next, [page,page_size](const ahi_node *node)
{ return !node || (uintptr_t(node->rec) ^ uintptr_t(page)) < page_size; });
if (ahi_node *node= *prev)
{
*prev= node->next;
node->next= nullptr;
if (buf_block_t *block= part.cleanup_after_erase(node))
buf_pool.free_block(block);
/* The deletion may compact the heap of nodes and move other nodes! */
goto rewind;
}
/* Check that all nodes really got deleted */
ut_ad(!cell->find(&ahi_node::next, [page](const ahi_node* node)
{ return page_align(node->rec) == page; }));
}
inline bool btr_sea::partition::erase(uint32_t fold, const rec_t *rec) noexcept
{
#ifndef SUX_LOCK_GENERIC
ut_ad(latch.is_write_locked());
#endif
ut_ad(btr_search.enabled);
ahi_node **prev= table.cell_get(fold)->
search(&ahi_node::next, [rec](const ahi_node *node)
{ return !node || node->rec == rec; });
if (ahi_node *node= *prev)
{
*prev= node->next;
node->next= nullptr;
buf_block_t *block= cleanup_after_erase(node);
latch.wr_unlock();
if (block)
buf_pool.free_block(block);
return true;
}
latch.wr_unlock();
return false;
}
__attribute__((nonnull))
/** Looks for an element when we know the pointer to the data and
updates the pointer to data if found.
@param table hash table
@param fold folded value of the searched data
@param data pointer to the data
@param new_data new pointer to the data
@return whether the element was found */
static bool ha_search_and_update_if_found(hash_table_t *table, uint32_t fold,
const rec_t *data,
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
/** block containing new_data */
buf_block_t *new_block,
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
const rec_t *new_data) noexcept
{
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(new_block->page.frame == page_align(new_data));
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
ut_ad(btr_search.enabled);
if (ahi_node *node= table->cell_get(fold)->
find(&ahi_node::next, [data](const ahi_node *node)
{ return node->rec == data; }))
{
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
if (node->block != new_block)
{
ut_a(node->block->n_pointers-- < MAX_N_POINTERS);
ut_a(new_block->n_pointers++ < MAX_N_POINTERS);
node->block= new_block;
}
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
node->rec= new_data;
return true;
}
return false;
}
#if !defined UNIV_AHI_DEBUG && !defined UNIV_DEBUG
# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
ha_search_and_update_if_found(table,fold,data,new_data)
#endif
/** Clear the adaptive hash index on all pages in the buffer pool. */
inline void buf_pool_t::clear_hash_index() noexcept
{
ut_ad(!resizing);
ut_ad(!btr_search.enabled);
std::set<dict_index_t*> garbage;
for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; )
{
for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size;
block != end; block++)
{
dict_index_t *index= block->index;
/* We can clear block->index and block->n_pointers when
holding all AHI latches exclusively; see the comments in buf0buf.h */
if (!index)
{
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(!block->n_pointers);
# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
continue;
}
ut_d(const auto s= block->page.state());
/* Another thread may have set the state to
REMOVE_HASH in buf_LRU_block_remove_hashed().
The state change in buf_pool_t::realloc() is not observable
here, because in that case we would have !block->index.
In the end, the entire adaptive hash index will be removed. */
ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH);
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
block->n_pointers= 0;
# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
block->index= nullptr;
if (index->freed())
garbage.emplace(index);
else
index->search_info.ref_count= 0;
}
}
for (dict_index_t *index : garbage)
btr_search_lazy_free(index);
}
/** Get a buffer block from an adaptive hash index pointer.
This function does not return if the block is not identified.
@param ptr pointer to within a page frame
@return pointer to block, never NULL */
inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const noexcept
{
chunk_t::map *chunk_map = chunk_t::map_ref;
ut_ad(chunk_t::map_ref == chunk_t::map_reg);
ut_ad(!resizing);
chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr);
ut_a(it != chunk_map->begin());
chunk_t *chunk= it == chunk_map->end()
? chunk_map->rbegin()->second
: (--it)->second;
const size_t offs= size_t(ptr - chunk->blocks->page.frame) >>
srv_page_size_shift;
ut_a(offs < chunk->size);
buf_block_t *block= &chunk->blocks[offs];
/* buf_pool_t::chunk_t::init() invokes buf_block_init() so that
block[n].frame == block->page.frame + n * srv_page_size. Check it. */
ut_ad(block->page.frame == page_align(ptr));
/* Read the state of the block without holding hash_lock.
A state transition to REMOVE_HASH is possible during
this execution. */
ut_ad(block->page.state() >= buf_page_t::REMOVE_HASH);
return block;
}
/** Fold a prefix given as the number of fields of a tuple.
@param tuple index record
@param cursor B-tree cursor
@return CRC-32C of the record prefix */
static uint32_t dtuple_fold(const dtuple_t *tuple, const btr_cur_t *cursor)
{
ut_ad(tuple);
ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
ut_ad(dtuple_check_typed(tuple));
const bool comp= cursor->index()->table->not_redundant();
uint32_t fold= uint32_t(ut_fold_ull(cursor->index()->id));
const size_t n_fields= uint16_t(cursor->n_bytes_fields);
for (unsigned i= 0; i < n_fields; i++)
{
const dfield_t *field= dtuple_get_nth_field(tuple, i);
const void *data= dfield_get_data(field);
size_t len= dfield_get_len(field);
if (len == UNIV_SQL_NULL)
{
if (UNIV_UNLIKELY(!comp))
{
len= dtype_get_sql_null_size(dfield_get_type(field), 0);
data= field_ref_zero;
}
else
continue;
}
fold= my_crc32c(fold, data, len);
}
if (size_t n_bytes= cursor->n_bytes_fields >> 16)
{
const dfield_t *field= dtuple_get_nth_field(tuple, n_fields);
const void *data= dfield_get_data(field);
size_t len= dfield_get_len(field);
if (len == UNIV_SQL_NULL)
{
if (UNIV_UNLIKELY(!comp))
{
len= dtype_get_sql_null_size(dfield_get_type(field), 0);
data= field_ref_zero;
}
else
return fold;
}
fold= my_crc32c(fold, data, std::min(n_bytes, len));
}
return fold;
}
/** Tries to guess the right search position based on the hash search info
of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
and the function returns TRUE, then cursor->up_match and cursor->low_match
both have sensible values.
@param[in,out] index index
@param[in] tuple logical record
@param[in] ge false=PAGE_CUR_LE, true=PAGE_CUR_GE
@param[in] latch_mode BTR_SEARCH_LEAF, ...
@param[out] cursor tree cursor
@param[in] mtr mini-transaction
@return whether the search succeeded */
TRANSACTIONAL_TARGET
bool
btr_search_guess_on_hash(
dict_index_t* index,
const dtuple_t* tuple,
bool ge,
btr_latch_mode latch_mode,
btr_cur_t* cursor,
mtr_t* mtr) noexcept
{
ut_ad(mtr->is_active());
ut_ad(index->is_btree());
ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
ut_ad(cursor->flag == BTR_CUR_BINARY);
if ((tuple->info_bits & REC_INFO_MIN_REC_FLAG))
return false;
if (!index->search_info.last_hash_succ ||
!index->search_info.n_hash_potential)
{
ahi_unusable:
if (!index->table->is_temporary() && btr_search.enabled)
cursor->flag= BTR_CUR_HASH_ABORT;
return false;
}
ut_ad(!index->table->is_temporary());
static_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH}, "");
static_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH}, "");
cursor->n_bytes_fields= index->search_info.left_bytes_fields &
~buf_block_t::LEFT_SIDE;
if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor))
goto ahi_unusable;
const index_id_t index_id= index->id;
#ifdef UNIV_SEARCH_PERF_STAT
index->search_info.n_hash_succ++;
#endif
const uint32_t fold= dtuple_fold(tuple, cursor);
cursor->fold= fold;
btr_sea::partition &part= btr_search.get_part(*index);
part.latch.rd_lock(SRW_LOCK_CALL);
if (!btr_search.enabled)
{
ahi_release_and_fail:
part.latch.rd_unlock();
fail:
#ifdef UNIV_SEARCH_PERF_STAT
++index->search_info.n_hash_fail;
if (index->search_info.n_hash_succ > 0)
--index->search_info.n_hash_succ;
#endif /* UNIV_SEARCH_PERF_STAT */
index->search_info.last_hash_succ= false;
return false;
}
const ahi_node *node= part.table.cell_get(fold)->
find(&ahi_node::next, [fold](const ahi_node* node)
{ return node->fold == fold; });
if (!node)
{
cursor->flag= BTR_CUR_HASH_FAIL;
goto ahi_release_and_fail;
}
const rec_t *rec= node->rec;
buf_block_t *block= buf_pool.block_from_ahi(rec);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
ut_a(block == node->block);
#endif
{
buf_pool_t::hash_chain &chain=
buf_pool.page_hash.cell_get(block->page.id().fold());
/* We must hold the cell latch while attempting to acquire
block->page.lock, because buf_LRU_block_remove_hashed() assumes
that block->page.can_relocate() will not cease to hold. */
transactional_shared_lock_guard<page_hash_latch> g
{buf_pool.page_hash.lock_get(chain)};
if (latch_mode == BTR_SEARCH_LEAF
? !block->page.lock.s_lock_try()
: !block->page.lock.x_lock_try())
goto ahi_release_and_fail;
}
const uint32_t state{block->page.state()};
if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED))
{
ut_ad(state == buf_page_t::REMOVE_HASH);
block_and_ahi_release_and_fail:
if (latch_mode == BTR_SEARCH_LEAF)
block->page.lock.s_unlock();
else
block->page.lock.x_unlock();
cursor->flag= BTR_CUR_HASH_FAIL;
goto ahi_release_and_fail;
}
ut_ad(!block->page.is_read_fixed(state));
ut_ad(!block->page.is_write_fixed(state) || latch_mode == BTR_SEARCH_LEAF);
const dict_index_t *block_index= block->index;
if (index != block_index && index_id == block_index->id)
{
ut_a(block_index->freed());
cursor->flag= BTR_CUR_HASH_FAIL;
goto block_and_ahi_release_and_fail;
}
/* We successfully validated the state of the block and that it
actually belongs to our index. Now it is safe to release part.latch.
Because we are holding block->page.lock, the page cannot be
modified or evicted (buf_page_t::can_relocate() will not hold) while
we validate the guessed rec. */
part.latch.rd_unlock();
block->page.fix();
buf_page_make_young_if_needed(&block->page);
static_assert(ulint{MTR_MEMO_PAGE_S_FIX} == ulint{BTR_SEARCH_LEAF}, "");
static_assert(ulint{MTR_MEMO_PAGE_X_FIX} == ulint{BTR_MODIFY_LEAF}, "");
++buf_pool.stat.n_page_gets;
mtr->memo_push(block, mtr_memo_type_t(latch_mode));
ut_ad(page_rec_is_user_rec(rec));
ut_ad(page_is_leaf(block->page.frame));
btr_cur_position(index, const_cast<rec_t*>(rec), block, cursor);
const ulint comp{page_is_comp(block->page.frame)};
if (UNIV_LIKELY(comp != 0))
switch (rec_get_status(rec)) {
case REC_STATUS_INSTANT:
case REC_STATUS_ORDINARY:
break;
default:
mismatch:
mtr->release_last_page();
cursor->flag= BTR_CUR_HASH_FAIL;
goto fail;
}
/* Check the validity of the guess within the page */
if (index_id != btr_page_get_index_id(block->page.frame) ||
cursor->check_mismatch(*tuple, ge, comp))
goto mismatch;
uint8_t n_hash_potential= index->search_info.n_hash_potential;
if (n_hash_potential++ < BTR_SEARCH_BUILD_LIMIT)
index->search_info.n_hash_potential= n_hash_potential;
index->search_info.last_hash_succ= true;
cursor->flag= BTR_CUR_HASH;
#ifdef UNIV_SEARCH_PERF_STAT
btr_search_n_succ++;
#endif
return true;
}
/** The maximum number of rec_fold() values to allocate in stack.
The actual maximum number of records per page is 8189, limited by
the 13-bit heap number field in the record header. */
static constexpr size_t REC_FOLD_IN_STACK= 128;
/** Drop any adaptive hash index entries that point to an index page.
@param block latched block containing index page, or a buffer-unfixed
index page or a block in state BUF_BLOCK_REMOVE_HASH
@param not_garbage drop only if the index is set and NOT this
@param folds work area for REC_FOLD_IN_STACK rec_fold() values */
static void btr_search_drop_page_hash_index(buf_block_t *block,
const dict_index_t *not_garbage,
uint32_t *folds) noexcept
{
retry:
dict_index_t *index= block->index;
if (!index || index == not_garbage)
return;
ut_d(const auto state= block->page.state());
ut_ad(state == buf_page_t::REMOVE_HASH || state >= buf_page_t::UNFIXED);
ut_ad(state == buf_page_t::REMOVE_HASH ||
!(~buf_page_t::LRU_MASK & state) || block->page.lock.have_any());
ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
ut_ad(page_is_leaf(block->page.frame));
/* We must not dereference block->index here, because it could be freed
if (!index->table->get_ref_count() && !dict_sys.frozen()).
Determine the ahi_slot based on the block contents. */
const index_id_t index_id= btr_page_get_index_id(block->page.frame);
btr_sea::partition &part= btr_search.get_part(index_id);
part.latch.rd_lock(SRW_LOCK_CALL);
index= block->index;
if (!index)
{
unlock_and_return:
part.latch.rd_unlock();
return;
}
ut_ad(btr_search.enabled);
bool holding_x= index->freed();
if (holding_x)
{
part.latch.rd_unlock();
part.latch.wr_lock(SRW_LOCK_CALL);
if (index != block->index)
{
part.latch.wr_unlock();
goto retry;
}
}
else if (not_garbage != nullptr)
{
ut_ad(!index || index == not_garbage ||
not_garbage == reinterpret_cast<dict_index_t*>(-1));
goto unlock_and_return;
}
assert_block_ahi_valid(block);
ut_ad(!index->table->is_temporary());
ut_ad(block->page.id().space() == index->table->space_id);
ut_a(index_id == index->id);
const uint32_t left_bytes_fields= block->ahi_left_bytes_fields;
/* NOTE: block->ahi_left_bytes_fields may change after we release part.latch,
as we might only hold block->page.lock.s_lock()! */
if (!holding_x)
part.latch.rd_unlock();
const uint32_t n_bytes_fields= left_bytes_fields & ~buf_block_t::LEFT_SIDE;
ut_ad(n_bytes_fields);
const page_t *const page= block->page.frame;
size_t n_folds= 0;
const rec_t *rec;
if (page_is_comp(page))
{
rec= page_rec_next_get<true>(page, page + PAGE_NEW_INFIMUM);
if (rec && rec_is_metadata(rec, true))
{
ut_ad(index->is_instant());
rec= page_rec_next_get<true>(page, rec);
}
while (rec && rec != page + PAGE_NEW_SUPREMUM)
{
next_not_redundant:
folds[n_folds]= rec_fold<true>(rec, *index, n_bytes_fields);
rec= page_rec_next_get<true>(page, rec);
if (!n_folds)
n_folds++;
else if (folds[n_folds] == folds[n_folds - 1]);
else if (++n_folds == REC_FOLD_IN_STACK)
break;
}
}
else
{
rec= page_rec_next_get<false>(page, page + PAGE_OLD_INFIMUM);
if (rec && rec_is_metadata(rec, false))
{
ut_ad(index->is_instant());
rec= page_rec_next_get<false>(page, rec);
}
while (rec && rec != page + PAGE_OLD_SUPREMUM)
{
next_redundant:
folds[n_folds]= rec_fold<false>(rec, *index, n_bytes_fields);
rec= page_rec_next_get<false>(page, rec);
if (!n_folds)
n_folds++;
else if (folds[n_folds] == folds[n_folds - 1]);
else if (++n_folds == REC_FOLD_IN_STACK)
break;
}
}
if (!holding_x)
{
part.latch.wr_lock(SRW_LOCK_CALL);
if (UNIV_UNLIKELY(!block->index))
/* Someone else has meanwhile dropped the hash index */
goto cleanup;
ut_a(block->index == index);
}
if (block->ahi_left_bytes_fields != left_bytes_fields)
{
/* Someone else has meanwhile built a new hash index on the page,
with different parameters */
part.latch.wr_unlock();
goto retry;
}
MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_folds);
while (n_folds)
ha_remove_all_nodes_to_page(part, folds[--n_folds], page);
if (!rec);
else if (page_is_comp(page))
{
if (rec != page + PAGE_NEW_SUPREMUM)
{
holding_x= true;
goto next_not_redundant;
}
}
else if (rec != page + PAGE_OLD_SUPREMUM)
{
holding_x= true;
goto next_redundant;
}
switch (index->search_info.ref_count--) {
case 0:
ut_error;
case 1:
if (index->freed())
btr_search_lazy_free(index);
}
ut_ad(!block->n_pointers);
block->index= nullptr;
MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED);
cleanup:
assert_block_ahi_valid(block);
part.latch.wr_unlock();
}
void btr_search_drop_page_hash_index(buf_block_t *block,
const dict_index_t *not_garbage) noexcept
{
uint32_t folds[REC_FOLD_IN_STACK];
btr_search_drop_page_hash_index(block, not_garbage, folds);
}
void btr_search_drop_page_hash_when_freed(const page_id_t page_id) noexcept
{
mtr_t mtr;
mtr.start();
/* If the caller has a latch on the page, then the caller must be an
x-latch page and it must have already dropped the hash index for the
page. Because of the x-latch that we are possibly holding, we must
(recursively) x-latch it, even though we are only reading. */
if (buf_block_t *block= buf_page_get_gen(page_id, 0, RW_X_LATCH, nullptr,
BUF_PEEK_IF_IN_POOL, &mtr))
{
/* In all our callers, the table handle should be open, or we
should be in the process of dropping the table (preventing eviction). */
ut_d(if (dict_index_t *i= block->index))
ut_ad(i->table->get_ref_count() || dict_sys.locked());
btr_search_drop_page_hash_index(block, nullptr);
}
mtr.commit();
}
/** Build a hash index on a page with the given parameters. If the page already
has a hash index with different parameters, the old hash index is removed.
If index is non-NULL, this function checks if n_fields and n_bytes are
sensible, and does not build a hash index if not.
@param index B-tree index
@param block latched B-tree leaf page
@param part the adaptive search partition
@param left_bytes_fields hash parameters */
static void btr_search_build_page_hash_index(dict_index_t *index,
buf_block_t *block,
btr_sea::partition &part,
const uint32_t left_bytes_fields)
noexcept
{
ut_ad(!index->table->is_temporary());
if (!btr_search.enabled)
return;
ut_ad(block->page.id().space() == index->table->space_id);
ut_ad(page_is_leaf(block->page.frame));
ut_ad(block->page.lock.have_any());
ut_ad(block->page.id().page_no() >= 3);
ut_ad(&part == &btr_search.get_part(*index));
part.latch.rd_lock(SRW_LOCK_CALL);
const dict_index_t *const block_index= block->index;
const bool rebuild= block_index &&
(block_index != index ||
block->ahi_left_bytes_fields != left_bytes_fields);
const bool enabled= btr_search.enabled;
part.latch.rd_unlock();
if (!enabled)
return;
struct{uint32_t fold;uint32_t offset;} fr[REC_FOLD_IN_STACK / 2];
if (rebuild)
btr_search_drop_page_hash_index(block, nullptr, &fr[0].fold);
const uint32_t n_bytes_fields{left_bytes_fields & ~buf_block_t::LEFT_SIDE};
/* Check that the values for hash index build are sensible */
ut_ad(n_bytes_fields);
ut_ad(btr_search_get_n_fields(n_bytes_fields) <=
(index->n_uniq ? index->n_uniq : index->n_fields));
const page_t *const page= block->page.frame;
size_t n_cached= 0;
const rec_t *rec;
if (page_is_comp(page))
{
rec= page_rec_next_get<true>(page, page + PAGE_NEW_INFIMUM);
if (rec && rec_is_metadata(rec, true))
{
ut_ad(index->is_instant());
rec= page_rec_next_get<true>(page, rec);
}
while (rec && rec != page + PAGE_NEW_SUPREMUM)
{
next_not_redundant:
const uint32_t offset= uint32_t(uintptr_t(rec));
fr[n_cached]= {rec_fold<true>(rec, *index, n_bytes_fields), offset};
rec= page_rec_next_get<true>(page, rec);
if (!n_cached)
n_cached= 1;
else if (fr[n_cached - 1].fold == fr[n_cached].fold)
{
if (!(left_bytes_fields & buf_block_t::LEFT_SIDE))
fr[n_cached - 1].offset= offset;
}
else if (++n_cached == array_elements(fr))
break;
}
}
else
{
rec= page_rec_next_get<false>(page, page + PAGE_OLD_INFIMUM);
if (rec && rec_is_metadata(rec, false))
{
ut_ad(index->is_instant());
rec= page_rec_next_get<false>(page, rec);
}
while (rec && rec != page + PAGE_OLD_SUPREMUM)
{
next_redundant:
const uint32_t offset= uint32_t(uintptr_t(rec));
fr[n_cached]= {rec_fold<false>(rec, *index, n_bytes_fields), offset};
rec= page_rec_next_get<false>(page, rec);
if (!n_cached)
n_cached= 1;
else if (fr[n_cached - 1].fold == fr[n_cached].fold)
{
if (!(left_bytes_fields & buf_block_t::LEFT_SIDE))
fr[n_cached - 1].offset= offset;
}
else if (++n_cached == array_elements(fr))
break;
}
}
part.prepare_insert();
part.latch.wr_lock(SRW_LOCK_CALL);
if (!block->index)
{
if (!btr_search.enabled)
goto exit_func;
ut_ad(!block->n_pointers);
index->search_info.ref_count++;
}
else if (block->ahi_left_bytes_fields != left_bytes_fields)
goto exit_func;
block->n_hash_helps= 0;
block->index= index;
block->ahi_left_bytes_fields= left_bytes_fields;
MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
while (n_cached)
{
#if SIZEOF_SIZE_T <= 4
const auto &f= fr[--n_cached];
const rec_t *rec= reinterpret_cast<const rec_t*>(f.offset);
#else
const auto f= fr[--n_cached];
const rec_t *rec= page + (uint32_t(uintptr_t(page)) ^ f.offset);
#endif
ha_insert_for_fold(part, f.fold, block, rec);
}
if (!rec);
else if (page_is_comp(page))
{
if (rec != page + PAGE_NEW_SUPREMUM)
{
part.latch.wr_unlock();
goto next_not_redundant;
}
}
else if (rec != page + PAGE_OLD_SUPREMUM)
{
part.latch.wr_unlock();
goto next_redundant;
}
MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);
exit_func:
assert_block_ahi_valid(block);
part.latch.wr_unlock();
}
void btr_cur_t::search_info_update() const noexcept
{
if (uint32_t left_bytes_fields= btr_search_info_update_hash(*this))
btr_search_build_page_hash_index(index(), page_cur.block,
btr_search.get_part(*index()),
left_bytes_fields);
}
void btr_search_move_or_delete_hash_entries(buf_block_t *new_block,
buf_block_t *block) noexcept
{
ut_ad(block->page.lock.have_x());
ut_ad(new_block->page.lock.have_x());
if (!btr_search.enabled)
return;
dict_index_t *index= block->index, *new_block_index= new_block->index;
assert_block_ahi_valid(block);
assert_block_ahi_valid(new_block);
if (new_block_index)
{
ut_ad(!index || index == new_block_index);
drop_exit:
btr_search_drop_page_hash_index(block, nullptr);
return;
}
if (!index)
return;
btr_sea::partition &part= btr_search.get_part(*index);
part.latch.rd_lock(SRW_LOCK_CALL);
if (index->freed())
{
part.latch.rd_unlock();
goto drop_exit;
}
if (ut_d(dict_index_t *block_index=) block->index)
{
ut_ad(block_index == index);
const uint32_t left_bytes_fields{block->ahi_left_bytes_fields};
ut_ad(left_bytes_fields & ~buf_block_t::LEFT_SIDE);
part.latch.rd_unlock();
btr_search_build_page_hash_index(index, new_block, part,
left_bytes_fields);
return;
}
part.latch.rd_unlock();
}
void btr_search_update_hash_on_delete(btr_cur_t *cursor) noexcept
{
ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
if (!btr_search.enabled)
return;
buf_block_t *block= btr_cur_get_block(cursor);
ut_ad(block->page.lock.have_x());
assert_block_ahi_valid(block);
dict_index_t *index= block->index;
if (!index)
return;
ut_ad(!cursor->index()->table->is_temporary());
if (UNIV_UNLIKELY(index != cursor->index()))
{
btr_search_drop_page_hash_index(block, nullptr);
return;
}
ut_ad(block->page.id().space() == index->table->space_id);
const uint32_t n_bytes_fields=
block->ahi_left_bytes_fields & ~buf_block_t::LEFT_SIDE;
ut_ad(n_bytes_fields);
const rec_t *rec= btr_cur_get_rec(cursor);
uint32_t fold= rec_fold(rec, *index, n_bytes_fields,
page_is_comp(btr_cur_get_page(cursor)));
btr_sea::partition &part= btr_search.get_part(*index);
part.latch.wr_lock(SRW_LOCK_CALL);
assert_block_ahi_valid(block);
if (ut_d(dict_index_t *block_index=) block->index)
{
ut_ad(btr_search.enabled);
ut_ad(block_index == index);
if (part.erase(fold, rec))
{
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
}
else
{
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
}
}
else
part.latch.wr_unlock();
}
void btr_search_update_hash_on_insert(btr_cur_t *cursor, bool reorg) noexcept
{
ut_ad(!cursor->index()->table->is_temporary());
ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
buf_block_t *block= btr_cur_get_block(cursor);
ut_ad(block->page.lock.have_x());
assert_block_ahi_valid(block);
dict_index_t *index= block->index;
if (!index)
return;
ut_ad(block->page.id().space() == index->table->space_id);
const rec_t *rec= btr_cur_get_rec(cursor);
if (UNIV_UNLIKELY(index != cursor->index()))
{
ut_ad(index->id == cursor->index()->id);
drop:
btr_search_drop_page_hash_index(block, nullptr);
return;
}
btr_sea::partition &part= btr_search.get_part(*index);
bool locked= false;
const uint32_t left_bytes_fields{block->ahi_left_bytes_fields};
const uint32_t n_bytes_fields{left_bytes_fields & ~buf_block_t::LEFT_SIDE};
const page_t *const page= block->page.frame;
const rec_t *ins_rec;
const rec_t *next_rec;
uint32_t ins_fold, next_fold= 0, fold;
bool next_is_supremum, rec_valid;
if (!reorg && cursor->flag == BTR_CUR_HASH &&
left_bytes_fields == cursor->n_bytes_fields)
{
part.latch.wr_lock(SRW_LOCK_CALL);
if (!block->index)
goto unlock_exit;
ut_ad(btr_search.enabled);
locked= true;
if (page_is_comp(page))
{
ins_rec= page_rec_next_get<true>(page, rec);
update_on_insert:
/* The adaptive hash index is allowed to be a subset of what is
actually present in the index page. It could happen that there are
several INSERT with the same rec_fold() value, especially if the
record prefix identified by n_bytes_fields is being duplicated
in each INSERT. Therefore, we may fail to find the old rec
(and fail to update the AHI to point to to our ins_rec). */
if (ins_rec &&
ha_search_and_update_if_found(&part.table,
cursor->fold, rec, block, ins_rec))
{
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
}
assert_block_ahi_valid(block);
goto unlock_exit;
}
else
{
ins_rec= page_rec_next_get<false>(page, rec);
goto update_on_insert;
}
}
if (page_is_comp(page))
{
ins_rec= page_rec_next_get<true>(page, rec);
if (UNIV_UNLIKELY(!ins_rec)) goto drop;
next_rec= page_rec_next_get<true>(page, ins_rec);
if (UNIV_UNLIKELY(!next_rec)) goto drop;
ins_fold= rec_fold<true>(ins_rec, *index, n_bytes_fields);
next_is_supremum= next_rec == page + PAGE_NEW_SUPREMUM;
if (!next_is_supremum)
next_fold= rec_fold<true>(next_rec, *index, n_bytes_fields);
rec_valid= rec != page + PAGE_NEW_INFIMUM && !rec_is_metadata(rec, true);
if (rec_valid)
fold= rec_fold<true>(rec, *index, n_bytes_fields);
}
else
{
ins_rec= page_rec_next_get<false>(page, rec);
if (UNIV_UNLIKELY(!ins_rec)) goto drop;
next_rec= page_rec_next_get<false>(page, ins_rec);
if (UNIV_UNLIKELY(!next_rec)) goto drop;
ins_fold= rec_fold<false>(ins_rec, *index, n_bytes_fields);
next_is_supremum= next_rec == page + PAGE_OLD_SUPREMUM;
if (!next_is_supremum)
next_fold= rec_fold<false>(next_rec, *index, n_bytes_fields);
rec_valid= rec != page + PAGE_OLD_INFIMUM && !rec_is_metadata(rec, false);
if (rec_valid)
fold= rec_fold<false>(rec, *index, n_bytes_fields);
}
part.prepare_insert();
if (!rec_valid)
{
if (left_bytes_fields & buf_block_t::LEFT_SIDE)
{
locked= true;
part.latch.wr_lock(SRW_LOCK_CALL);
if (!block->index)
goto unlock_exit;
ha_insert_for_fold(part, ins_fold, block, ins_rec);
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
}
}
else if (fold != ins_fold)
{
if (!locked)
{
locked= true;
part.latch.wr_lock(SRW_LOCK_CALL);
if (!block->index)
goto unlock_exit;
}
if (left_bytes_fields & buf_block_t::LEFT_SIDE)
fold= ins_fold, rec= ins_rec;
ha_insert_for_fold(part, fold, block, rec);
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
}
if (next_is_supremum)
{
if (!(left_bytes_fields & ~buf_block_t::LEFT_SIDE))
{
if (!locked)
{
locked= true;
part.latch.wr_lock(SRW_LOCK_CALL);
if (!block->index)
goto unlock_exit;
}
ha_insert_for_fold(part, ins_fold, block, ins_rec);
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
}
}
else if (ins_fold != next_fold)
{
if (!locked)
{
locked= true;
part.latch.wr_lock(SRW_LOCK_CALL);
if (!block->index)
goto unlock_exit;
}
if (!(left_bytes_fields & ~buf_block_t::LEFT_SIDE))
next_fold= ins_fold, next_rec= ins_rec;
ha_insert_for_fold(part, next_fold, block, next_rec);
MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
}
ut_ad(!locked || index == block->index);
if (locked)
unlock_exit:
part.latch.wr_unlock();
}
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
__attribute__((nonnull))
/** @return whether a range of the cells is valid */
static bool ha_validate(const hash_table_t *table,
ulint start_index, ulint end_index)
{
ut_a(start_index <= end_index);
ut_a(end_index < table->n_cells);
bool ok= true;
for (ulint i= start_index; i <= end_index; i++)
{
for (auto node= static_cast<const ahi_node*>(table->array[i].node); node;
node= node->next)
{
if (table->calc_hash(node->fold) != i) {
sql_print_error("InnoDB: Hash table node fold value " UINT32PF
" does not match the cell number %zu",
node->fold, i);
ok= false;
}
}
}
return ok;
}
/** Lock all search latches in exclusive mode. */
static void btr_search_x_lock_all() noexcept
{
for (ulong i= 0; i < btr_search.n_parts; i++)
btr_search.parts[i].latch.wr_lock(SRW_LOCK_CALL);
}
/** Unlock all search latches from exclusive mode. */
static void btr_search_x_unlock_all() noexcept
{
for (ulong i= 0; i < btr_search.n_parts; i++)
btr_search.parts[i].latch.wr_unlock();
}
/** Validates the search system for given hash table.
@param thd connection, for checking if CHECK TABLE has been killed
@param hash_table_id hash table to validate
@return true if ok */
static bool btr_search_hash_table_validate(THD *thd, ulint hash_table_id)
noexcept
{
ahi_node* node;
bool ok = true;
ulint i;
ulint cell_count;
btr_search_x_lock_all();
if (!btr_search.enabled || (thd && thd_kill_level(thd))) {
func_exit:
btr_search_x_unlock_all();
return ok;
}
/* How many cells to check before temporarily releasing
search latches. */
ulint chunk_size = 10000;
mysql_mutex_lock(&buf_pool.mutex);
btr_sea::partition& part = btr_search.parts[hash_table_id];
cell_count = part.table.n_cells;
for (i = 0; i < cell_count; i++) {
/* We release search latches every once in a while to
give other queries a chance to run. */
if ((i != 0) && ((i % chunk_size) == 0)) {
mysql_mutex_unlock(&buf_pool.mutex);
btr_search_x_unlock_all();
std::this_thread::yield();
btr_search_x_lock_all();
if (!btr_search.enabled
|| (thd && thd_kill_level(thd))) {
goto func_exit;
}
mysql_mutex_lock(&buf_pool.mutex);
ulint curr_cell_count = part.table.n_cells;
if (cell_count != curr_cell_count) {
cell_count = curr_cell_count;
if (i >= cell_count) {
break;
}
}
}
node = static_cast<ahi_node*>(part.table.array[i].node);
for (; node != NULL; node = node->next) {
const buf_block_t* block
= buf_pool.block_from_ahi(node->rec);
index_id_t page_index_id;
if (UNIV_LIKELY(block->page.in_file())) {
/* The space and offset are only valid
for file blocks. It is possible that
the block is being freed
(BUF_BLOCK_REMOVE_HASH, see the
assertion and the comment below) */
const page_id_t id(block->page.id());
if (const buf_page_t* hash_page
= buf_pool.page_hash.get(
id, buf_pool.page_hash.cell_get(
id.fold()))) {
ut_ad(hash_page == &block->page);
goto state_ok;
}
}
/* When a block is being freed,
buf_LRU_search_and_free_block() first removes
the block from buf_pool.page_hash by calling
buf_LRU_block_remove_hashed_page(). Then it
invokes btr_search_drop_page_hash_index(). */
ut_a(block->page.state() == buf_page_t::REMOVE_HASH);
state_ok:
const dict_index_t* index = block->index;
ut_ad(block->page.id().space() == index->table->space_id);
const page_t* page = block->page.frame;
page_index_id = btr_page_get_index_id(page);
const uint32_t fold = rec_fold(
node->rec, *block->index,
block->ahi_left_bytes_fields
& ~buf_block_t::LEFT_SIDE,
page_is_comp(page));
if (node->fold != fold) {
ok = FALSE;
ib::error() << "Error in an adaptive hash"
<< " index pointer to page "
<< block->page.id()
<< ", ptr mem address "
<< reinterpret_cast<const void*>(
node->rec)
<< ", index id " << page_index_id
<< ", node fold " << node->fold
<< ", rec fold " << fold;
ut_ad(0);
}
}
}
for (i = 0; i < cell_count; i += chunk_size) {
/* We release search latches every once in a while to
give other queries a chance to run. */
if (i != 0) {
mysql_mutex_unlock(&buf_pool.mutex);
btr_search_x_unlock_all();
std::this_thread::yield();
btr_search_x_lock_all();
if (!btr_search.enabled
|| (thd && thd_kill_level(thd))) {
goto func_exit;
}
mysql_mutex_lock(&buf_pool.mutex);
ulint curr_cell_count = part.table.n_cells;
if (cell_count != curr_cell_count) {
cell_count = curr_cell_count;
if (i >= cell_count) {
break;
}
}
}
ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
if (!ha_validate(&part.table, i, end_index)) {
ok = false;
}
}
mysql_mutex_unlock(&buf_pool.mutex);
goto func_exit;
}
/** Validates the search system.
@param thd connection, for checking if CHECK TABLE has been killed
@return true if ok */
bool btr_search_validate(THD *thd) noexcept
{
for (ulint i= 0; i < btr_search.n_parts; ++i)
if (!btr_search_hash_table_validate(thd, i))
return false;
return true;
}
# endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
# ifdef UNIV_DEBUG
bool btr_search_check_marked_free_index(const buf_block_t *block) noexcept
{
const index_id_t index_id= btr_page_get_index_id(block->page.frame);
btr_sea::partition &part= btr_search.get_part(index_id);
bool is_freed= false;
part.latch.rd_lock(SRW_LOCK_CALL);
if (dict_index_t *index= block->index)
is_freed= index->freed();
part.latch.rd_unlock();
return is_freed;
}
# endif /* UNIV_DEBUG */
#endif /* BTR_CUR_HASH_ADAPT */