2016-08-12 11:17:45 +03:00
|
|
|
/*****************************************************************************
|
|
|
|
|
2018-10-25 15:04:37 +03:00
|
|
|
Copyright (c) 2016, 2018, Oracle and/or its affiliates. All Rights Reserved.
|
2021-04-13 10:28:13 +03:00
|
|
|
Copyright (c) 2017, 2021, MariaDB Corporation.
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
|
|
Foundation; version 2 of the License.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
2019-05-13 17:47:26 +03:00
|
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
/**************************************************//**
|
|
|
|
@file gis/gis0sea.cc
|
|
|
|
InnoDB R-tree search interfaces
|
|
|
|
|
|
|
|
Created 2014/01/16 Jimmy Yang
|
|
|
|
***********************************************************************/
|
|
|
|
|
|
|
|
#include "fsp0fsp.h"
|
|
|
|
#include "page0page.h"
|
|
|
|
#include "page0cur.h"
|
|
|
|
#include "page0zip.h"
|
|
|
|
#include "gis0rtree.h"
|
|
|
|
#include "btr0cur.h"
|
|
|
|
#include "btr0sea.h"
|
|
|
|
#include "btr0pcur.h"
|
|
|
|
#include "rem0cmp.h"
|
|
|
|
#include "lock0lock.h"
|
|
|
|
#include "ibuf0ibuf.h"
|
|
|
|
#include "trx0trx.h"
|
|
|
|
#include "srv0mon.h"
|
2018-03-11 23:34:23 +02:00
|
|
|
#include "que0que.h"
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "gis0geo.h"
|
|
|
|
|
2015-05-26 10:01:12 +03:00
|
|
|
/** Restore the stored position of a persistent cursor bufferfixing the page */
|
|
|
|
static
|
|
|
|
bool
|
|
|
|
rtr_cur_restore_position(
|
|
|
|
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
|
|
|
|
btr_cur_t* cursor, /*!< in: detached persistent cursor */
|
|
|
|
ulint level, /*!< in: index level */
|
|
|
|
mtr_t* mtr); /*!< in: mtr */
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/*************************************************************//**
|
|
|
|
Pop out used parent path entry, until we find the parent with matching
|
|
|
|
page number */
|
|
|
|
static
|
|
|
|
void
|
|
|
|
rtr_adjust_parent_path(
|
|
|
|
/*===================*/
|
|
|
|
rtr_info_t* rtr_info, /* R-Tree info struct */
|
|
|
|
ulint page_no) /* page number to look for */
|
|
|
|
{
|
|
|
|
while (!rtr_info->parent_path->empty()) {
|
|
|
|
if (rtr_info->parent_path->back().child_no == page_no) {
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
if (rtr_info->parent_path->back().cursor) {
|
|
|
|
btr_pcur_close(
|
|
|
|
rtr_info->parent_path->back().cursor);
|
|
|
|
ut_free(rtr_info->parent_path->back().cursor);
|
|
|
|
}
|
|
|
|
|
|
|
|
rtr_info->parent_path->pop_back();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*************************************************************//**
|
|
|
|
Find the next matching record. This function is used by search
|
|
|
|
or record locating during index delete/update.
|
|
|
|
@return true if there is suitable record found, otherwise false */
|
|
|
|
static
|
|
|
|
bool
|
|
|
|
rtr_pcur_getnext_from_path(
|
|
|
|
/*=======================*/
|
|
|
|
const dtuple_t* tuple, /*!< in: data tuple */
|
|
|
|
page_cur_mode_t mode, /*!< in: cursor search mode */
|
|
|
|
btr_cur_t* btr_cur,/*!< in: persistent cursor; NOTE that the
|
|
|
|
function may release the page latch */
|
|
|
|
ulint target_level,
|
|
|
|
/*!< in: target level */
|
|
|
|
ulint latch_mode,
|
|
|
|
/*!< in: latch_mode */
|
|
|
|
bool index_locked,
|
|
|
|
/*!< in: index tree locked */
|
|
|
|
mtr_t* mtr) /*!< in: mtr */
|
|
|
|
{
|
|
|
|
dict_index_t* index = btr_cur->index;
|
|
|
|
bool found = false;
|
|
|
|
page_cur_t* page_cursor;
|
|
|
|
ulint level = 0;
|
|
|
|
node_visit_t next_rec;
|
|
|
|
rtr_info_t* rtr_info = btr_cur->rtr_info;
|
|
|
|
node_seq_t page_ssn;
|
|
|
|
ulint my_latch_mode;
|
|
|
|
ulint skip_parent = false;
|
|
|
|
bool new_split = false;
|
|
|
|
bool need_parent;
|
|
|
|
bool for_delete = false;
|
|
|
|
bool for_undo_ins = false;
|
|
|
|
|
|
|
|
/* exhausted all the pages to be searched */
|
|
|
|
if (rtr_info->path->empty()) {
|
|
|
|
return(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
ut_ad(dtuple_get_n_fields_cmp(tuple));
|
|
|
|
|
|
|
|
my_latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
|
|
|
|
|
|
|
|
for_delete = latch_mode & BTR_RTREE_DELETE_MARK;
|
|
|
|
for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS;
|
|
|
|
|
|
|
|
/* There should be no insert coming to this function. Only
|
|
|
|
mode with BTR_MODIFY_* should be delete */
|
|
|
|
ut_ad(mode != PAGE_CUR_RTREE_INSERT);
|
|
|
|
ut_ad(my_latch_mode == BTR_SEARCH_LEAF
|
|
|
|
|| my_latch_mode == BTR_MODIFY_LEAF
|
|
|
|
|| my_latch_mode == BTR_MODIFY_TREE
|
|
|
|
|| my_latch_mode == BTR_CONT_MODIFY_TREE);
|
|
|
|
|
|
|
|
/* Whether need to track parent information. Only need so
|
|
|
|
when we do tree altering operations (such as index page merge) */
|
|
|
|
need_parent = ((my_latch_mode == BTR_MODIFY_TREE
|
|
|
|
|| my_latch_mode == BTR_CONT_MODIFY_TREE)
|
|
|
|
&& mode == PAGE_CUR_RTREE_LOCATE);
|
|
|
|
|
|
|
|
if (!index_locked) {
|
|
|
|
ut_ad(latch_mode & BTR_SEARCH_LEAF
|
|
|
|
|| latch_mode & BTR_MODIFY_LEAF);
|
2019-11-14 11:40:33 +02:00
|
|
|
mtr_s_lock_index(index, mtr);
|
2016-08-12 11:17:45 +03:00
|
|
|
} else {
|
2020-06-10 07:43:58 +03:00
|
|
|
ut_ad(mtr->memo_contains_flagged(&index->lock,
|
|
|
|
MTR_MEMO_SX_LOCK
|
|
|
|
| MTR_MEMO_S_LOCK
|
|
|
|
| MTR_MEMO_X_LOCK));
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
2019-02-06 19:50:11 +02:00
|
|
|
const ulint zip_size = index->table->space->zip_size();
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/* Pop each node/page to be searched from "path" structure
|
|
|
|
and do a search on it. Please note, any pages that are in
|
|
|
|
the "path" structure are protected by "page" lock, so tey
|
|
|
|
cannot be shrunk away */
|
|
|
|
do {
|
|
|
|
buf_block_t* block;
|
|
|
|
node_seq_t path_ssn;
|
|
|
|
const page_t* page;
|
|
|
|
ulint rw_latch = RW_X_LATCH;
|
|
|
|
ulint tree_idx;
|
|
|
|
|
|
|
|
mutex_enter(&rtr_info->rtr_path_mutex);
|
|
|
|
next_rec = rtr_info->path->back();
|
|
|
|
rtr_info->path->pop_back();
|
|
|
|
level = next_rec.level;
|
|
|
|
path_ssn = next_rec.seq_no;
|
|
|
|
tree_idx = btr_cur->tree_height - level - 1;
|
|
|
|
|
|
|
|
/* Maintain the parent path info as well, if needed */
|
|
|
|
if (need_parent && !skip_parent && !new_split) {
|
|
|
|
ulint old_level;
|
|
|
|
ulint new_level;
|
|
|
|
|
|
|
|
ut_ad(!rtr_info->parent_path->empty());
|
|
|
|
|
|
|
|
/* Cleanup unused parent info */
|
|
|
|
if (rtr_info->parent_path->back().cursor) {
|
|
|
|
btr_pcur_close(
|
|
|
|
rtr_info->parent_path->back().cursor);
|
|
|
|
ut_free(rtr_info->parent_path->back().cursor);
|
|
|
|
}
|
|
|
|
|
|
|
|
old_level = rtr_info->parent_path->back().level;
|
|
|
|
|
|
|
|
rtr_info->parent_path->pop_back();
|
|
|
|
|
|
|
|
ut_ad(!rtr_info->parent_path->empty());
|
|
|
|
|
|
|
|
/* check whether there is a level change. If so,
|
|
|
|
the current parent path needs to pop enough
|
|
|
|
nodes to adjust to the new search page */
|
|
|
|
new_level = rtr_info->parent_path->back().level;
|
|
|
|
|
|
|
|
if (old_level < new_level) {
|
|
|
|
rtr_adjust_parent_path(
|
|
|
|
rtr_info, next_rec.page_no);
|
|
|
|
}
|
|
|
|
|
|
|
|
ut_ad(!rtr_info->parent_path->empty());
|
|
|
|
|
|
|
|
ut_ad(next_rec.page_no
|
|
|
|
== rtr_info->parent_path->back().child_no);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&rtr_info->rtr_path_mutex);
|
|
|
|
|
|
|
|
skip_parent = false;
|
|
|
|
new_split = false;
|
|
|
|
|
|
|
|
/* Once we have pages in "path", these pages are
|
|
|
|
predicate page locked, so they can't be shrunk away.
|
|
|
|
They also have SSN (split sequence number) to detect
|
|
|
|
splits, so we can directly latch single page while
|
|
|
|
getting them. They can be unlatched if not qualified.
|
|
|
|
One reason for pre-latch is that we might need to position
|
|
|
|
some parent position (requires latch) during search */
|
|
|
|
if (level == 0) {
|
|
|
|
/* S latched for SEARCH_LEAF, and X latched
|
|
|
|
for MODIFY_LEAF */
|
|
|
|
if (my_latch_mode <= BTR_MODIFY_LEAF) {
|
|
|
|
rw_latch = my_latch_mode;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (my_latch_mode == BTR_CONT_MODIFY_TREE
|
|
|
|
|| my_latch_mode == BTR_MODIFY_TREE) {
|
|
|
|
rw_latch = RW_NO_LATCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
} else if (level == target_level) {
|
|
|
|
rw_latch = RW_X_LATCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Release previous locked blocks */
|
|
|
|
if (my_latch_mode != BTR_SEARCH_LEAF) {
|
|
|
|
for (ulint idx = 0; idx < btr_cur->tree_height;
|
|
|
|
idx++) {
|
|
|
|
if (rtr_info->tree_blocks[idx]) {
|
|
|
|
mtr_release_block_at_savepoint(
|
|
|
|
mtr,
|
|
|
|
rtr_info->tree_savepoints[idx],
|
|
|
|
rtr_info->tree_blocks[idx]);
|
|
|
|
rtr_info->tree_blocks[idx] = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (ulint idx = RTR_MAX_LEVELS; idx < RTR_MAX_LEVELS + 3;
|
|
|
|
idx++) {
|
|
|
|
if (rtr_info->tree_blocks[idx]) {
|
|
|
|
mtr_release_block_at_savepoint(
|
|
|
|
mtr,
|
|
|
|
rtr_info->tree_savepoints[idx],
|
|
|
|
rtr_info->tree_blocks[idx]);
|
|
|
|
rtr_info->tree_blocks[idx] = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set up savepoint to record any locks to be taken */
|
|
|
|
rtr_info->tree_savepoints[tree_idx] = mtr_set_savepoint(mtr);
|
|
|
|
|
|
|
|
#ifdef UNIV_RTR_DEBUG
|
2018-07-23 13:31:10 +03:00
|
|
|
ut_ad(!(rw_lock_own_flagged(&btr_cur->page_cur.block->lock,
|
|
|
|
RW_LOCK_FLAG_X | RW_LOCK_FLAG_S))
|
2016-08-12 11:17:45 +03:00
|
|
|
|| my_latch_mode == BTR_MODIFY_TREE
|
|
|
|
|| my_latch_mode == BTR_CONT_MODIFY_TREE
|
|
|
|
|| !page_is_leaf(buf_block_get_frame(
|
|
|
|
btr_cur->page_cur.block)));
|
|
|
|
#endif /* UNIV_RTR_DEBUG */
|
|
|
|
|
|
|
|
dberr_t err = DB_SUCCESS;
|
|
|
|
|
|
|
|
block = buf_page_get_gen(
|
2018-11-22 17:07:35 +02:00
|
|
|
page_id_t(index->table->space_id,
|
2019-02-06 19:50:11 +02:00
|
|
|
next_rec.page_no), zip_size,
|
2016-08-12 11:17:45 +03:00
|
|
|
rw_latch, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err);
|
|
|
|
|
|
|
|
if (block == NULL) {
|
|
|
|
continue;
|
|
|
|
} else if (rw_latch != RW_NO_LATCH) {
|
|
|
|
ut_ad(!dict_index_is_ibuf(index));
|
|
|
|
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
|
|
|
|
}
|
|
|
|
|
|
|
|
rtr_info->tree_blocks[tree_idx] = block;
|
|
|
|
|
|
|
|
page = buf_block_get_frame(block);
|
|
|
|
page_ssn = page_get_ssn_id(page);
|
|
|
|
|
|
|
|
/* If there are splits, push the splitted page.
|
|
|
|
Note that we have SX lock on index->lock, there
|
|
|
|
should not be any split/shrink happening here */
|
|
|
|
if (page_ssn > path_ssn) {
|
2019-11-11 13:36:21 +02:00
|
|
|
uint32_t next_page_no = btr_page_get_next(page);
|
2016-08-12 11:17:45 +03:00
|
|
|
rtr_non_leaf_stack_push(
|
|
|
|
rtr_info->path, next_page_no, path_ssn,
|
|
|
|
level, 0, NULL, 0);
|
|
|
|
|
|
|
|
if (!srv_read_only_mode
|
|
|
|
&& mode != PAGE_CUR_RTREE_INSERT
|
|
|
|
&& mode != PAGE_CUR_RTREE_LOCATE) {
|
|
|
|
ut_ad(rtr_info->thr);
|
|
|
|
lock_place_prdt_page_lock(
|
2020-09-11 15:55:30 +03:00
|
|
|
page_id_t(block->page.id().space(),
|
|
|
|
next_page_no),
|
|
|
|
index,
|
2016-08-12 11:17:45 +03:00
|
|
|
rtr_info->thr);
|
|
|
|
}
|
|
|
|
new_split = true;
|
2018-04-26 16:33:05 +03:00
|
|
|
#if defined(UNIV_GIS_DEBUG)
|
2016-08-12 11:17:45 +03:00
|
|
|
fprintf(stderr,
|
|
|
|
"GIS_DIAG: Splitted page found: %d, %ld\n",
|
|
|
|
static_cast<int>(need_parent), next_page_no);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
page_cursor = btr_cur_get_page_cur(btr_cur);
|
|
|
|
page_cursor->rec = NULL;
|
|
|
|
|
|
|
|
if (mode == PAGE_CUR_RTREE_LOCATE) {
|
2016-09-06 09:43:16 +03:00
|
|
|
if (level == target_level && level == 0) {
|
2016-08-12 11:17:45 +03:00
|
|
|
ulint low_match;
|
|
|
|
|
|
|
|
found = false;
|
|
|
|
|
|
|
|
low_match = page_cur_search(
|
|
|
|
block, index, tuple,
|
|
|
|
PAGE_CUR_LE,
|
|
|
|
btr_cur_get_page_cur(btr_cur));
|
|
|
|
|
|
|
|
if (low_match == dtuple_get_n_fields_cmp(
|
|
|
|
tuple)) {
|
|
|
|
rec_t* rec = btr_cur_get_rec(btr_cur);
|
|
|
|
|
|
|
|
if (!rec_get_deleted_flag(rec,
|
|
|
|
dict_table_is_comp(index->table))
|
|
|
|
|| (!for_delete && !for_undo_ins)) {
|
|
|
|
found = true;
|
|
|
|
btr_cur->low_match = low_match;
|
|
|
|
} else {
|
|
|
|
/* mark we found deleted row */
|
|
|
|
btr_cur->rtr_info->fd_del
|
|
|
|
= true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2016-09-06 09:43:16 +03:00
|
|
|
page_cur_mode_t page_mode = mode;
|
|
|
|
|
|
|
|
if (level == target_level
|
|
|
|
&& target_level != 0) {
|
|
|
|
page_mode = PAGE_CUR_RTREE_GET_FATHER;
|
|
|
|
}
|
2016-08-12 11:17:45 +03:00
|
|
|
found = rtr_cur_search_with_match(
|
2016-09-06 09:43:16 +03:00
|
|
|
block, index, tuple, page_mode,
|
|
|
|
page_cursor, btr_cur->rtr_info);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/* Save the position of parent if needed */
|
|
|
|
if (found && need_parent) {
|
|
|
|
btr_pcur_t* r_cursor =
|
|
|
|
rtr_get_parent_cursor(
|
|
|
|
btr_cur, level, false);
|
|
|
|
|
|
|
|
rec_t* rec = page_cur_get_rec(
|
|
|
|
page_cursor);
|
|
|
|
page_cur_position(
|
|
|
|
rec, block,
|
|
|
|
btr_pcur_get_page_cur(r_cursor));
|
|
|
|
r_cursor->pos_state =
|
|
|
|
BTR_PCUR_IS_POSITIONED;
|
|
|
|
r_cursor->latch_mode = my_latch_mode;
|
|
|
|
btr_pcur_store_position(r_cursor, mtr);
|
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
ulint num_stored =
|
|
|
|
rtr_store_parent_path(
|
|
|
|
block, btr_cur,
|
|
|
|
rw_latch, level, mtr);
|
|
|
|
ut_ad(num_stored > 0);
|
|
|
|
#else
|
|
|
|
rtr_store_parent_path(
|
|
|
|
block, btr_cur, rw_latch,
|
|
|
|
level, mtr);
|
|
|
|
#endif /* UNIV_DEBUG */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
found = rtr_cur_search_with_match(
|
|
|
|
block, index, tuple, mode, page_cursor,
|
|
|
|
btr_cur->rtr_info);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Attach predicate lock if needed, no matter whether
|
|
|
|
there are matched records */
|
|
|
|
if (mode != PAGE_CUR_RTREE_INSERT
|
|
|
|
&& mode != PAGE_CUR_RTREE_LOCATE
|
|
|
|
&& mode >= PAGE_CUR_CONTAIN
|
2018-10-25 15:04:37 +03:00
|
|
|
&& btr_cur->rtr_info->need_prdt_lock) {
|
2016-08-12 11:17:45 +03:00
|
|
|
lock_prdt_t prdt;
|
|
|
|
|
|
|
|
trx_t* trx = thr_get_trx(
|
|
|
|
btr_cur->rtr_info->thr);
|
|
|
|
lock_mutex_enter();
|
|
|
|
lock_init_prdt_from_mbr(
|
|
|
|
&prdt, &btr_cur->rtr_info->mbr,
|
|
|
|
mode, trx->lock.lock_heap);
|
|
|
|
lock_mutex_exit();
|
|
|
|
|
|
|
|
if (rw_latch == RW_NO_LATCH) {
|
|
|
|
rw_lock_s_lock(&(block->lock));
|
|
|
|
}
|
|
|
|
|
|
|
|
lock_prdt_lock(block, &prdt, index, LOCK_S,
|
2018-05-01 01:10:37 +03:00
|
|
|
LOCK_PREDICATE, btr_cur->rtr_info->thr);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
if (rw_latch == RW_NO_LATCH) {
|
|
|
|
rw_lock_s_unlock(&(block->lock));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (found) {
|
|
|
|
if (level == target_level) {
|
|
|
|
page_cur_t* r_cur;;
|
|
|
|
|
|
|
|
if (my_latch_mode == BTR_MODIFY_TREE
|
|
|
|
&& level == 0) {
|
|
|
|
ut_ad(rw_latch == RW_NO_LATCH);
|
|
|
|
|
|
|
|
btr_cur_latch_leaves(
|
MDEV-12266: Change dict_table_t::space to fil_space_t*
InnoDB always keeps all tablespaces in the fil_system cache.
The fil_system.LRU is only for closing file handles; the
fil_space_t and fil_node_t for all data files will remain
in main memory. Between startup to shutdown, they can only be
created and removed by DDL statements. Therefore, we can
let dict_table_t::space point directly to the fil_space_t.
dict_table_t::space_id: A numeric tablespace ID for the corner cases
where we do not have a tablespace. The most prominent examples are
ALTER TABLE...DISCARD TABLESPACE or a missing or corrupted file.
There are a few functional differences; most notably:
(1) DROP TABLE will delete matching .ibd and .cfg files,
even if they were not attached to the data dictionary.
(2) Some error messages will report file names instead of numeric IDs.
There still are many functions that use numeric tablespace IDs instead
of fil_space_t*, and many functions could be converted to fil_space_t
member functions. Also, Tablespace and Datafile should be merged with
fil_space_t and fil_node_t. page_id_t and buf_page_get_gen() could use
fil_space_t& instead of a numeric ID, and after moving to a single
buffer pool (MDEV-15058), buf_pool_t::page_hash could be moved to
fil_space_t::page_hash.
FilSpace: Remove. Only few calls to fil_space_acquire() will remain,
and gradually they should be removed.
mtr_t::set_named_space_id(ulint): Renamed from set_named_space(),
to prevent accidental calls to this slower function. Very few
callers remain.
fseg_create(), fsp_reserve_free_extents(): Take fil_space_t*
as a parameter instead of a space_id.
fil_space_t::rename(): Wrapper for fil_rename_tablespace_check(),
fil_name_write_rename(), fil_rename_tablespace(). Mariabackup
passes the parameter log=false; InnoDB passes log=true.
dict_mem_table_create(): Take fil_space_t* instead of space_id
as parameter.
dict_process_sys_tables_rec_and_mtr_commit(): Replace the parameter
'status' with 'bool cached'.
dict_get_and_save_data_dir_path(): Avoid copying the fil_node_t::name.
fil_ibd_open(): Return the tablespace.
fil_space_t::set_imported(): Replaces fil_space_set_imported().
truncate_t: Change many member function parameters to fil_space_t*,
and remove page_size parameters.
row_truncate_prepare(): Merge to its only caller.
row_drop_table_from_cache(): Assert that the table is persistent.
dict_create_sys_indexes_tuple(): Write SYS_INDEXES.SPACE=FIL_NULL
if the tablespace has been discarded.
row_import_update_discarded_flag(): Remove a constant parameter.
2018-03-27 16:31:10 +03:00
|
|
|
block,
|
2019-09-25 16:08:48 +03:00
|
|
|
BTR_MODIFY_TREE,
|
2016-08-12 11:17:45 +03:00
|
|
|
btr_cur, mtr);
|
|
|
|
}
|
|
|
|
|
|
|
|
r_cur = btr_cur_get_page_cur(btr_cur);
|
|
|
|
|
|
|
|
page_cur_position(
|
|
|
|
page_cur_get_rec(page_cursor),
|
|
|
|
page_cur_get_block(page_cursor),
|
|
|
|
r_cur);
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
btr_cur->low_match = level != 0 ?
|
|
|
|
DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1
|
|
|
|
: btr_cur->low_match;
|
2016-08-12 11:17:45 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Keep the parent path node, which points to
|
|
|
|
last node just located */
|
|
|
|
skip_parent = true;
|
|
|
|
} else {
|
|
|
|
/* Release latch on the current page */
|
|
|
|
ut_ad(rtr_info->tree_blocks[tree_idx]);
|
|
|
|
|
|
|
|
mtr_release_block_at_savepoint(
|
|
|
|
mtr, rtr_info->tree_savepoints[tree_idx],
|
|
|
|
rtr_info->tree_blocks[tree_idx]);
|
|
|
|
rtr_info->tree_blocks[tree_idx] = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
} while (!rtr_info->path->empty());
|
|
|
|
|
|
|
|
const rec_t* rec = btr_cur_get_rec(btr_cur);
|
|
|
|
|
|
|
|
if (page_rec_is_infimum(rec) || page_rec_is_supremum(rec)) {
|
|
|
|
mtr_commit(mtr);
|
|
|
|
mtr_start(mtr);
|
|
|
|
} else if (!index_locked) {
|
|
|
|
mtr_memo_release(mtr, dict_index_get_lock(index),
|
|
|
|
MTR_MEMO_X_LOCK);
|
|
|
|
}
|
|
|
|
|
|
|
|
return(found);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*************************************************************//**
|
|
|
|
Find the next matching record. This function will first exhaust
|
|
|
|
the copied record listed in the rtr_info->matches vector before
|
|
|
|
moving to the next page
|
|
|
|
@return true if there is suitable record found, otherwise false */
|
|
|
|
bool
|
|
|
|
rtr_pcur_move_to_next(
|
|
|
|
/*==================*/
|
|
|
|
const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
|
|
|
|
tuple must be set so that it cannot get
|
|
|
|
compared to the node ptr page number field! */
|
|
|
|
page_cur_mode_t mode, /*!< in: cursor search mode */
|
|
|
|
btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
|
|
|
|
function may release the page latch */
|
|
|
|
ulint level, /*!< in: target level */
|
|
|
|
mtr_t* mtr) /*!< in: mtr */
|
|
|
|
{
|
|
|
|
rtr_info_t* rtr_info = cursor->btr_cur.rtr_info;
|
|
|
|
|
|
|
|
ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
|
|
|
|
|
|
|
|
mutex_enter(&rtr_info->matches->rtr_match_mutex);
|
|
|
|
/* First retrieve the next record on the current page */
|
|
|
|
if (!rtr_info->matches->matched_recs->empty()) {
|
|
|
|
rtr_rec_t rec;
|
|
|
|
rec = rtr_info->matches->matched_recs->back();
|
|
|
|
rtr_info->matches->matched_recs->pop_back();
|
|
|
|
mutex_exit(&rtr_info->matches->rtr_match_mutex);
|
|
|
|
|
|
|
|
cursor->btr_cur.page_cur.rec = rec.r_rec;
|
|
|
|
cursor->btr_cur.page_cur.block = &rtr_info->matches->block;
|
|
|
|
|
|
|
|
DEBUG_SYNC_C("rtr_pcur_move_to_next_return");
|
|
|
|
return(true);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&rtr_info->matches->rtr_match_mutex);
|
|
|
|
|
|
|
|
/* Fetch the next page */
|
|
|
|
return(rtr_pcur_getnext_from_path(tuple, mode, &cursor->btr_cur,
|
|
|
|
level, cursor->latch_mode,
|
|
|
|
false, mtr));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*************************************************************//**
|
|
|
|
Check if the cursor holds record pointing to the specified child page
|
|
|
|
@return true if it is (pointing to the child page) false otherwise */
|
|
|
|
static
|
|
|
|
bool
|
|
|
|
rtr_compare_cursor_rec(
|
|
|
|
/*===================*/
|
|
|
|
dict_index_t* index, /*!< in: index */
|
|
|
|
btr_cur_t* cursor, /*!< in: Cursor to check */
|
|
|
|
ulint page_no, /*!< in: desired child page number */
|
|
|
|
mem_heap_t** heap) /*!< in: memory heap */
|
|
|
|
{
|
|
|
|
const rec_t* rec;
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs* offsets;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
rec = btr_cur_get_rec(cursor);
|
|
|
|
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets = rec_get_offsets(rec, index, NULL, 0, ULINT_UNDEFINED, heap);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
return(btr_node_ptr_get_child_page_no(rec, offsets) == page_no);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**************************************************************//**
|
|
|
|
Initializes and opens a persistent cursor to an index tree. It should be
|
|
|
|
closed with btr_pcur_close. Mainly called by row_search_index_entry() */
|
|
|
|
void
|
|
|
|
rtr_pcur_open_low(
|
|
|
|
/*==============*/
|
|
|
|
dict_index_t* index, /*!< in: index */
|
|
|
|
ulint level, /*!< in: level in the rtree */
|
|
|
|
const dtuple_t* tuple, /*!< in: tuple on which search done */
|
|
|
|
page_cur_mode_t mode, /*!< in: PAGE_CUR_RTREE_LOCATE, ... */
|
|
|
|
ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
|
|
|
|
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
|
|
|
|
const char* file, /*!< in: file name */
|
2017-03-01 08:27:39 +02:00
|
|
|
unsigned line, /*!< in: line where called */
|
2016-08-12 11:17:45 +03:00
|
|
|
mtr_t* mtr) /*!< in: mtr */
|
|
|
|
{
|
|
|
|
btr_cur_t* btr_cursor;
|
|
|
|
ulint n_fields;
|
|
|
|
ulint low_match;
|
|
|
|
rec_t* rec;
|
|
|
|
bool tree_latched = false;
|
|
|
|
bool for_delete = false;
|
|
|
|
bool for_undo_ins = false;
|
|
|
|
|
|
|
|
ut_ad(level == 0);
|
|
|
|
|
|
|
|
ut_ad(latch_mode & BTR_MODIFY_LEAF || latch_mode & BTR_MODIFY_TREE);
|
|
|
|
ut_ad(mode == PAGE_CUR_RTREE_LOCATE);
|
|
|
|
|
|
|
|
/* Initialize the cursor */
|
|
|
|
|
|
|
|
btr_pcur_init(cursor);
|
|
|
|
|
|
|
|
for_delete = latch_mode & BTR_RTREE_DELETE_MARK;
|
|
|
|
for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS;
|
|
|
|
|
|
|
|
cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
|
|
|
|
cursor->search_mode = mode;
|
|
|
|
|
|
|
|
/* Search with the tree cursor */
|
|
|
|
|
|
|
|
btr_cursor = btr_pcur_get_btr_cur(cursor);
|
|
|
|
|
|
|
|
btr_cursor->rtr_info = rtr_create_rtr_info(false, false,
|
|
|
|
btr_cursor, index);
|
|
|
|
|
|
|
|
/* Purge will SX lock the tree instead of take Page Locks */
|
|
|
|
if (btr_cursor->thr) {
|
|
|
|
btr_cursor->rtr_info->need_page_lock = true;
|
|
|
|
btr_cursor->rtr_info->thr = btr_cursor->thr;
|
|
|
|
}
|
|
|
|
|
|
|
|
btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
|
|
|
|
btr_cursor, 0, file, line, mtr);
|
|
|
|
cursor->pos_state = BTR_PCUR_IS_POSITIONED;
|
|
|
|
|
|
|
|
cursor->trx_if_known = NULL;
|
|
|
|
|
|
|
|
low_match = btr_pcur_get_low_match(cursor);
|
|
|
|
|
|
|
|
rec = btr_pcur_get_rec(cursor);
|
|
|
|
|
|
|
|
n_fields = dtuple_get_n_fields(tuple);
|
|
|
|
|
|
|
|
if (latch_mode & BTR_ALREADY_S_LATCHED) {
|
2020-06-10 07:43:58 +03:00
|
|
|
ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
|
2016-08-12 11:17:45 +03:00
|
|
|
tree_latched = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (latch_mode & BTR_MODIFY_TREE) {
|
2020-06-10 07:43:58 +03:00
|
|
|
ut_ad(mtr->memo_contains_flagged(&index->lock,
|
|
|
|
MTR_MEMO_X_LOCK
|
|
|
|
| MTR_MEMO_SX_LOCK));
|
2016-08-12 11:17:45 +03:00
|
|
|
tree_latched = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (page_rec_is_infimum(rec) || low_match != n_fields
|
|
|
|
|| (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))
|
|
|
|
&& (for_delete || for_undo_ins))) {
|
|
|
|
|
|
|
|
if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))
|
|
|
|
&& for_delete) {
|
|
|
|
btr_cursor->rtr_info->fd_del = true;
|
|
|
|
btr_cursor->low_match = 0;
|
|
|
|
}
|
|
|
|
/* Did not find matched row in first dive. Release
|
|
|
|
latched block if any before search more pages */
|
|
|
|
if (latch_mode & BTR_MODIFY_LEAF) {
|
|
|
|
ulint tree_idx = btr_cursor->tree_height - 1;
|
|
|
|
rtr_info_t* rtr_info = btr_cursor->rtr_info;
|
|
|
|
|
|
|
|
ut_ad(level == 0);
|
|
|
|
|
|
|
|
if (rtr_info->tree_blocks[tree_idx]) {
|
|
|
|
mtr_release_block_at_savepoint(
|
|
|
|
mtr,
|
|
|
|
rtr_info->tree_savepoints[tree_idx],
|
|
|
|
rtr_info->tree_blocks[tree_idx]);
|
|
|
|
rtr_info->tree_blocks[tree_idx] = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ret = rtr_pcur_getnext_from_path(
|
|
|
|
tuple, mode, btr_cursor, level, latch_mode,
|
|
|
|
tree_latched, mtr);
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
low_match = btr_pcur_get_low_match(cursor);
|
|
|
|
ut_ad(low_match == n_fields);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/* Get the rtree page father.
|
|
|
|
@param[in] index rtree index
|
|
|
|
@param[in] block child page in the index
|
|
|
|
@param[in] mtr mtr
|
|
|
|
@param[in] sea_cur search cursor, contains information
|
|
|
|
about parent nodes in search
|
|
|
|
@param[in] cursor cursor on node pointer record,
|
|
|
|
its page x-latched */
|
|
|
|
void
|
|
|
|
rtr_page_get_father(
|
|
|
|
dict_index_t* index,
|
|
|
|
buf_block_t* block,
|
|
|
|
mtr_t* mtr,
|
|
|
|
btr_cur_t* sea_cur,
|
|
|
|
btr_cur_t* cursor)
|
|
|
|
{
|
|
|
|
mem_heap_t* heap = mem_heap_create(100);
|
|
|
|
#ifdef UNIV_DEBUG
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs* offsets;
|
2016-09-06 09:43:16 +03:00
|
|
|
|
|
|
|
offsets = rtr_page_get_father_block(
|
|
|
|
NULL, heap, index, block, mtr, sea_cur, cursor);
|
|
|
|
|
|
|
|
ulint page_no = btr_node_ptr_get_child_page_no(cursor->page_cur.rec,
|
|
|
|
offsets);
|
|
|
|
|
MDEV-15053 Reduce buf_pool_t::mutex contention
User-visible changes: The INFORMATION_SCHEMA views INNODB_BUFFER_PAGE
and INNODB_BUFFER_PAGE_LRU will report a dummy value FLUSH_TYPE=0
and will no longer report the PAGE_STATE value READY_FOR_USE.
We will remove some fields from buf_page_t and move much code to
member functions of buf_pool_t and buf_page_t, so that the access
rules of data members can be enforced consistently.
Evicting or adding pages in buf_pool.LRU will remain covered by
buf_pool.mutex.
Evicting or adding pages in buf_pool.page_hash will remain
covered by both buf_pool.mutex and the buf_pool.page_hash X-latch.
After this fix, buf_pool.page_hash lookups can entirely
avoid acquiring buf_pool.mutex, only relying on
buf_pool.hash_lock_get() S-latch.
Similarly, buf_flush_check_neighbors() can will rely solely on
buf_pool.mutex, no buf_pool.page_hash latch at all.
The buf_pool.mutex is rather contended in I/O heavy benchmarks,
especially when the workload does not fit in the buffer pool.
The first attempt to alleviate the contention was the
buf_pool_t::mutex split in
commit 4ed7082eefe56b3e97e0edefb3df76dd7ef5e858
which introduced buf_block_t::mutex, which we are now removing.
Later, multiple instances of buf_pool_t were introduced
in commit c18084f71b02ea707c6461353e6cfc15d7553bc6
and recently removed by us in
commit 1a6f708ec594ac0ae2dd30db926ab07b100fa24b (MDEV-15058).
UNIV_BUF_DEBUG: Remove. This option to enable some buffer pool
related debugging in otherwise non-debug builds has not been used
for years. Instead, we have been using UNIV_DEBUG, which is enabled
in CMAKE_BUILD_TYPE=Debug.
buf_block_t::mutex, buf_pool_t::zip_mutex: Remove. We can mainly rely on
std::atomic and the buf_pool.page_hash latches, and in some cases
depend on buf_pool.mutex or buf_pool.flush_list_mutex just like before.
We must always release buf_block_t::lock before invoking
unfix() or io_unfix(), to prevent a glitch where a block that was
added to the buf_pool.free list would apper X-latched. See
commit c5883debd6ef440a037011c11873b396923e93c5 how this glitch
was finally caught in a debug environment.
We move some buf_pool_t::page_hash specific code from the
ha and hash modules to buf_pool, for improved readability.
buf_pool_t::close(): Assert that all blocks are clean, except
on aborted startup or crash-like shutdown.
buf_pool_t::validate(): No longer attempt to validate
n_flush[] against the number of BUF_IO_WRITE fixed blocks,
because buf_page_t::flush_type no longer exists.
buf_pool_t::watch_set(): Replaces buf_pool_watch_set().
Reduce mutex contention by separating the buf_pool.watch[]
allocation and the insert into buf_pool.page_hash.
buf_pool_t::page_hash_lock<bool exclusive>(): Acquire a
buf_pool.page_hash latch.
Replaces and extends buf_page_hash_lock_s_confirm()
and buf_page_hash_lock_x_confirm().
buf_pool_t::READ_AHEAD_PAGES: Renamed from BUF_READ_AHEAD_PAGES.
buf_pool_t::curr_size, old_size, read_ahead_area, n_pend_reads:
Use Atomic_counter.
buf_pool_t::running_out(): Replaces buf_LRU_buf_pool_running_out().
buf_pool_t::LRU_remove(): Remove a block from the LRU list
and return its predecessor. Incorporates buf_LRU_adjust_hp(),
which was removed.
buf_page_get_gen(): Remove a redundant call of fsp_is_system_temporary(),
for mode == BUF_GET_IF_IN_POOL_OR_WATCH, which is only used by
BTR_DELETE_OP (purge), which is never invoked on temporary tables.
buf_free_from_unzip_LRU_list_batch(): Avoid redundant assignments.
buf_LRU_free_from_unzip_LRU_list(): Simplify the loop condition.
buf_LRU_free_page(): Clarify the function comment.
buf_flush_check_neighbor(), buf_flush_check_neighbors():
Rewrite the construction of the page hash range. We will hold
the buf_pool.mutex for up to buf_pool.read_ahead_area (at most 64)
consecutive lookups of buf_pool.page_hash.
buf_flush_page_and_try_neighbors(): Remove.
Merge to its only callers, and remove redundant operations in
buf_flush_LRU_list_batch().
buf_read_ahead_random(), buf_read_ahead_linear(): Rewrite.
Do not acquire buf_pool.mutex, and iterate directly with page_id_t.
ut_2_power_up(): Remove. my_round_up_to_next_power() is inlined
and avoids any loops.
fil_page_get_prev(), fil_page_get_next(), fil_addr_is_null(): Remove.
buf_flush_page(): Add a fil_space_t* parameter. Minimize the
buf_pool.mutex hold time. buf_pool.n_flush[] is no longer updated
atomically with the io_fix, and we will protect most buf_block_t
fields with buf_block_t::lock. The function
buf_flush_write_block_low() is removed and merged here.
buf_page_init_for_read(): Use static linkage. Initialize the newly
allocated block and acquire the exclusive buf_block_t::lock while not
holding any mutex.
IORequest::IORequest(): Remove the body. We only need to invoke
set_punch_hole() in buf_flush_page() and nowhere else.
buf_page_t::flush_type: Remove. Replaced by IORequest::flush_type.
This field is only used during a fil_io() call.
That function already takes IORequest as a parameter, so we had
better introduce for the rarely changing field.
buf_block_t::init(): Replaces buf_page_init().
buf_page_t::init(): Replaces buf_page_init_low().
buf_block_t::initialise(): Initialise many fields, but
keep the buf_page_t::state(). Both buf_pool_t::validate() and
buf_page_optimistic_get() requires that buf_page_t::in_file()
be protected atomically with buf_page_t::in_page_hash
and buf_page_t::in_LRU_list.
buf_page_optimistic_get(): Now that buf_block_t::mutex
no longer exists, we must check buf_page_t::io_fix()
after acquiring the buf_pool.page_hash lock, to detect
whether buf_page_init_for_read() has been initiated.
We will also check the io_fix() before acquiring hash_lock
in order to avoid unnecessary computation.
The field buf_block_t::modify_clock (protected by buf_block_t::lock)
allows buf_page_optimistic_get() to validate the block.
buf_page_t::real_size: Remove. It was only used while flushing
pages of page_compressed tables.
buf_page_encrypt(): Add an output parameter that allows us ot eliminate
buf_page_t::real_size. Replace a condition with debug assertion.
buf_page_should_punch_hole(): Remove.
buf_dblwr_t::add_to_batch(): Replaces buf_dblwr_add_to_batch().
Add the parameter size (to replace buf_page_t::real_size).
buf_dblwr_t::write_single_page(): Replaces buf_dblwr_write_single_page().
Add the parameter size (to replace buf_page_t::real_size).
fil_system_t::detach(): Replaces fil_space_detach().
Ensure that fil_validate() will not be violated even if
fil_system.mutex is released and reacquired.
fil_node_t::complete_io(): Renamed from fil_node_complete_io().
fil_node_t::close_to_free(): Replaces fil_node_close_to_free().
Avoid invoking fil_node_t::close() because fil_system.n_open
has already been decremented in fil_space_t::detach().
BUF_BLOCK_READY_FOR_USE: Remove. Directly use BUF_BLOCK_MEMORY.
BUF_BLOCK_ZIP_DIRTY: Remove. Directly use BUF_BLOCK_ZIP_PAGE,
and distinguish dirty pages by buf_page_t::oldest_modification().
BUF_BLOCK_POOL_WATCH: Remove. Use BUF_BLOCK_NOT_USED instead.
This state was only being used for buf_page_t that are in
buf_pool.watch.
buf_pool_t::watch[]: Remove pointer indirection.
buf_page_t::in_flush_list: Remove. It was set if and only if
buf_page_t::oldest_modification() is nonzero.
buf_page_decrypt_after_read(), buf_corrupt_page_release(),
buf_page_check_corrupt(): Change the const fil_space_t* parameter
to const fil_node_t& so that we can report the correct file name.
buf_page_monitor(): Declare as an ATTRIBUTE_COLD global function.
buf_page_io_complete(): Split to buf_page_read_complete() and
buf_page_write_complete().
buf_dblwr_t::in_use: Remove.
buf_dblwr_t::buf_block_array: Add IORequest::flush_t.
buf_dblwr_sync_datafiles(): Remove. It was a useless wrapper of
os_aio_wait_until_no_pending_writes().
buf_flush_write_complete(): Declare static, not global.
Add the parameter IORequest::flush_t.
buf_flush_freed_page(): Simplify the code.
recv_sys_t::flush_lru: Renamed from flush_type and changed to bool.
fil_read(), fil_write(): Replaced with direct use of fil_io().
fil_buffering_disabled(): Remove. Check srv_file_flush_method directly.
fil_mutex_enter_and_prepare_for_io(): Return the resolved
fil_space_t* to avoid a duplicated lookup in the caller.
fil_report_invalid_page_access(): Clean up the parameters.
fil_io(): Return fil_io_t, which comprises fil_node_t and error code.
Always invoke fil_space_t::acquire_for_io() and let either the
sync=true caller or fil_aio_callback() invoke
fil_space_t::release_for_io().
fil_aio_callback(): Rewrite to replace buf_page_io_complete().
fil_check_pending_operations(): Remove a parameter, and remove some
redundant lookups.
fil_node_close_to_free(): Wait for n_pending==0. Because we no longer
do an extra lookup of the tablespace between fil_io() and the
completion of the operation, we must give fil_node_t::complete_io() a
chance to decrement the counter.
fil_close_tablespace(): Remove unused parameter trx, and document
that this is only invoked during the error handling of IMPORT TABLESPACE.
row_import_discard_changes(): Merged with the only caller,
row_import_cleanup(). Do not lock up the data dictionary while
invoking fil_close_tablespace().
logs_empty_and_mark_files_at_shutdown(): Do not invoke
fil_close_all_files(), to avoid a !needs_flush assertion failure
on fil_node_t::close().
innodb_shutdown(): Invoke os_aio_free() before fil_close_all_files().
fil_close_all_files(): Invoke fil_flush_file_spaces()
to ensure proper durability.
thread_pool::unbind(): Fix a crash that would occur on Windows
after srv_thread_pool->disable_aio() and os_file_close().
This fix was submitted by Vladislav Vaintroub.
Thanks to Matthias Leich and Axel Schwenke for extensive testing,
Vladislav Vaintroub for helpful comments, and Eugene Kosov for a review.
2020-06-05 12:35:46 +03:00
|
|
|
ut_ad(page_no == block->page.id().page_no());
|
2016-09-06 09:43:16 +03:00
|
|
|
#else
|
|
|
|
rtr_page_get_father_block(
|
|
|
|
NULL, heap, index, block, mtr, sea_cur, cursor);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
mem_heap_free(heap);
|
|
|
|
}
|
|
|
|
|
2020-04-28 09:47:40 +03:00
|
|
|
/********************************************************************//**
|
|
|
|
Returns the upper level node pointer to a R-Tree page. It is assumed
|
|
|
|
that mtr holds an x-latch on the tree. */
|
|
|
|
static void rtr_get_father_node(
|
|
|
|
dict_index_t* index, /*!< in: index */
|
|
|
|
ulint level, /*!< in: the tree level of search */
|
|
|
|
const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
|
|
|
|
tuple must be set so that it cannot get
|
|
|
|
compared to the node ptr page number field! */
|
|
|
|
btr_cur_t* sea_cur,/*!< in: search cursor */
|
|
|
|
btr_cur_t* btr_cur,/*!< in/out: tree cursor; the cursor page is
|
|
|
|
s- or x-latched, but see also above! */
|
|
|
|
ulint page_no,/*!< Current page no */
|
|
|
|
mtr_t* mtr) /*!< in: mtr */
|
|
|
|
{
|
|
|
|
mem_heap_t* heap = NULL;
|
|
|
|
bool ret = false;
|
|
|
|
const rec_t* rec;
|
|
|
|
ulint n_fields;
|
|
|
|
bool new_rtr = false;
|
|
|
|
|
|
|
|
/* Try to optimally locate the parent node. Level should always
|
|
|
|
less than sea_cur->tree_height unless the root is splitting */
|
|
|
|
if (sea_cur && sea_cur->tree_height > level) {
|
2020-06-10 07:43:58 +03:00
|
|
|
ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
|
|
|
|
| MTR_MEMO_SX_LOCK));
|
2020-04-28 09:47:40 +03:00
|
|
|
ret = rtr_cur_restore_position(
|
|
|
|
BTR_CONT_MODIFY_TREE, sea_cur, level, mtr);
|
|
|
|
|
|
|
|
/* Once we block shrink tree nodes while there are
|
|
|
|
active search on it, this optimal locating should always
|
|
|
|
succeeds */
|
|
|
|
ut_ad(ret);
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
btr_pcur_t* r_cursor = rtr_get_parent_cursor(
|
|
|
|
sea_cur, level, false);
|
|
|
|
|
|
|
|
rec = btr_pcur_get_rec(r_cursor);
|
|
|
|
|
|
|
|
ut_ad(r_cursor->rel_pos == BTR_PCUR_ON);
|
|
|
|
page_cur_position(rec,
|
|
|
|
btr_pcur_get_block(r_cursor),
|
|
|
|
btr_cur_get_page_cur(btr_cur));
|
|
|
|
btr_cur->rtr_info = sea_cur->rtr_info;
|
|
|
|
btr_cur->tree_height = sea_cur->tree_height;
|
|
|
|
ut_ad(rtr_compare_cursor_rec(
|
|
|
|
index, btr_cur, page_no, &heap));
|
|
|
|
goto func_exit;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We arrive here in one of two scenario
|
|
|
|
1) check table and btr_valide
|
|
|
|
2) index root page being raised */
|
|
|
|
ut_ad(!sea_cur || sea_cur->tree_height == level);
|
|
|
|
|
|
|
|
if (btr_cur->rtr_info) {
|
|
|
|
rtr_clean_rtr_info(btr_cur->rtr_info, true);
|
|
|
|
} else {
|
|
|
|
new_rtr = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
btr_cur->rtr_info = rtr_create_rtr_info(false, false, btr_cur, index);
|
|
|
|
|
|
|
|
if (sea_cur && sea_cur->tree_height == level) {
|
|
|
|
/* root split, and search the new root */
|
|
|
|
btr_cur_search_to_nth_level(
|
|
|
|
index, level, tuple, PAGE_CUR_RTREE_LOCATE,
|
|
|
|
BTR_CONT_MODIFY_TREE, btr_cur, 0,
|
|
|
|
__FILE__, __LINE__, mtr);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
/* btr_validate */
|
|
|
|
ut_ad(level >= 1);
|
|
|
|
ut_ad(!sea_cur);
|
|
|
|
|
|
|
|
btr_cur_search_to_nth_level(
|
|
|
|
index, level, tuple, PAGE_CUR_RTREE_LOCATE,
|
|
|
|
BTR_CONT_MODIFY_TREE, btr_cur, 0,
|
|
|
|
__FILE__, __LINE__, mtr);
|
|
|
|
|
|
|
|
rec = btr_cur_get_rec(btr_cur);
|
|
|
|
n_fields = dtuple_get_n_fields_cmp(tuple);
|
|
|
|
|
|
|
|
if (page_rec_is_infimum(rec)
|
|
|
|
|| (btr_cur->low_match != n_fields)) {
|
|
|
|
ret = rtr_pcur_getnext_from_path(
|
|
|
|
tuple, PAGE_CUR_RTREE_LOCATE, btr_cur,
|
|
|
|
level, BTR_CONT_MODIFY_TREE,
|
|
|
|
true, mtr);
|
|
|
|
|
|
|
|
ut_ad(ret && btr_cur->low_match == n_fields);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = rtr_compare_cursor_rec(
|
|
|
|
index, btr_cur, page_no, &heap);
|
|
|
|
|
|
|
|
ut_ad(ret);
|
|
|
|
|
|
|
|
func_exit:
|
|
|
|
if (heap) {
|
|
|
|
mem_heap_free(heap);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (new_rtr && btr_cur->rtr_info) {
|
|
|
|
rtr_clean_rtr_info(btr_cur->rtr_info, true);
|
|
|
|
btr_cur->rtr_info = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-26 10:01:12 +03:00
|
|
|
/** Returns the upper level node pointer to a R-Tree page. It is assumed
|
|
|
|
that mtr holds an SX-latch or X-latch on the tree.
|
2016-08-12 11:17:45 +03:00
|
|
|
@return rec_get_offsets() of the node pointer record */
|
2015-05-26 10:01:12 +03:00
|
|
|
static
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs*
|
2015-05-26 10:01:12 +03:00
|
|
|
rtr_page_get_father_node_ptr(
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs* offsets,/*!< in: work area for the return value */
|
2016-08-12 11:17:45 +03:00
|
|
|
mem_heap_t* heap, /*!< in: memory heap to use */
|
|
|
|
btr_cur_t* sea_cur,/*!< in: search cursor */
|
|
|
|
btr_cur_t* cursor, /*!< in: cursor pointing to user record,
|
|
|
|
out: cursor on node pointer record,
|
|
|
|
its page x-latched */
|
|
|
|
mtr_t* mtr) /*!< in: mtr */
|
|
|
|
{
|
|
|
|
dtuple_t* tuple;
|
|
|
|
rec_t* user_rec;
|
|
|
|
rec_t* node_ptr;
|
|
|
|
ulint level;
|
|
|
|
ulint page_no;
|
|
|
|
dict_index_t* index;
|
|
|
|
rtr_mbr_t mbr;
|
|
|
|
|
MDEV-15053 Reduce buf_pool_t::mutex contention
User-visible changes: The INFORMATION_SCHEMA views INNODB_BUFFER_PAGE
and INNODB_BUFFER_PAGE_LRU will report a dummy value FLUSH_TYPE=0
and will no longer report the PAGE_STATE value READY_FOR_USE.
We will remove some fields from buf_page_t and move much code to
member functions of buf_pool_t and buf_page_t, so that the access
rules of data members can be enforced consistently.
Evicting or adding pages in buf_pool.LRU will remain covered by
buf_pool.mutex.
Evicting or adding pages in buf_pool.page_hash will remain
covered by both buf_pool.mutex and the buf_pool.page_hash X-latch.
After this fix, buf_pool.page_hash lookups can entirely
avoid acquiring buf_pool.mutex, only relying on
buf_pool.hash_lock_get() S-latch.
Similarly, buf_flush_check_neighbors() can will rely solely on
buf_pool.mutex, no buf_pool.page_hash latch at all.
The buf_pool.mutex is rather contended in I/O heavy benchmarks,
especially when the workload does not fit in the buffer pool.
The first attempt to alleviate the contention was the
buf_pool_t::mutex split in
commit 4ed7082eefe56b3e97e0edefb3df76dd7ef5e858
which introduced buf_block_t::mutex, which we are now removing.
Later, multiple instances of buf_pool_t were introduced
in commit c18084f71b02ea707c6461353e6cfc15d7553bc6
and recently removed by us in
commit 1a6f708ec594ac0ae2dd30db926ab07b100fa24b (MDEV-15058).
UNIV_BUF_DEBUG: Remove. This option to enable some buffer pool
related debugging in otherwise non-debug builds has not been used
for years. Instead, we have been using UNIV_DEBUG, which is enabled
in CMAKE_BUILD_TYPE=Debug.
buf_block_t::mutex, buf_pool_t::zip_mutex: Remove. We can mainly rely on
std::atomic and the buf_pool.page_hash latches, and in some cases
depend on buf_pool.mutex or buf_pool.flush_list_mutex just like before.
We must always release buf_block_t::lock before invoking
unfix() or io_unfix(), to prevent a glitch where a block that was
added to the buf_pool.free list would apper X-latched. See
commit c5883debd6ef440a037011c11873b396923e93c5 how this glitch
was finally caught in a debug environment.
We move some buf_pool_t::page_hash specific code from the
ha and hash modules to buf_pool, for improved readability.
buf_pool_t::close(): Assert that all blocks are clean, except
on aborted startup or crash-like shutdown.
buf_pool_t::validate(): No longer attempt to validate
n_flush[] against the number of BUF_IO_WRITE fixed blocks,
because buf_page_t::flush_type no longer exists.
buf_pool_t::watch_set(): Replaces buf_pool_watch_set().
Reduce mutex contention by separating the buf_pool.watch[]
allocation and the insert into buf_pool.page_hash.
buf_pool_t::page_hash_lock<bool exclusive>(): Acquire a
buf_pool.page_hash latch.
Replaces and extends buf_page_hash_lock_s_confirm()
and buf_page_hash_lock_x_confirm().
buf_pool_t::READ_AHEAD_PAGES: Renamed from BUF_READ_AHEAD_PAGES.
buf_pool_t::curr_size, old_size, read_ahead_area, n_pend_reads:
Use Atomic_counter.
buf_pool_t::running_out(): Replaces buf_LRU_buf_pool_running_out().
buf_pool_t::LRU_remove(): Remove a block from the LRU list
and return its predecessor. Incorporates buf_LRU_adjust_hp(),
which was removed.
buf_page_get_gen(): Remove a redundant call of fsp_is_system_temporary(),
for mode == BUF_GET_IF_IN_POOL_OR_WATCH, which is only used by
BTR_DELETE_OP (purge), which is never invoked on temporary tables.
buf_free_from_unzip_LRU_list_batch(): Avoid redundant assignments.
buf_LRU_free_from_unzip_LRU_list(): Simplify the loop condition.
buf_LRU_free_page(): Clarify the function comment.
buf_flush_check_neighbor(), buf_flush_check_neighbors():
Rewrite the construction of the page hash range. We will hold
the buf_pool.mutex for up to buf_pool.read_ahead_area (at most 64)
consecutive lookups of buf_pool.page_hash.
buf_flush_page_and_try_neighbors(): Remove.
Merge to its only callers, and remove redundant operations in
buf_flush_LRU_list_batch().
buf_read_ahead_random(), buf_read_ahead_linear(): Rewrite.
Do not acquire buf_pool.mutex, and iterate directly with page_id_t.
ut_2_power_up(): Remove. my_round_up_to_next_power() is inlined
and avoids any loops.
fil_page_get_prev(), fil_page_get_next(), fil_addr_is_null(): Remove.
buf_flush_page(): Add a fil_space_t* parameter. Minimize the
buf_pool.mutex hold time. buf_pool.n_flush[] is no longer updated
atomically with the io_fix, and we will protect most buf_block_t
fields with buf_block_t::lock. The function
buf_flush_write_block_low() is removed and merged here.
buf_page_init_for_read(): Use static linkage. Initialize the newly
allocated block and acquire the exclusive buf_block_t::lock while not
holding any mutex.
IORequest::IORequest(): Remove the body. We only need to invoke
set_punch_hole() in buf_flush_page() and nowhere else.
buf_page_t::flush_type: Remove. Replaced by IORequest::flush_type.
This field is only used during a fil_io() call.
That function already takes IORequest as a parameter, so we had
better introduce for the rarely changing field.
buf_block_t::init(): Replaces buf_page_init().
buf_page_t::init(): Replaces buf_page_init_low().
buf_block_t::initialise(): Initialise many fields, but
keep the buf_page_t::state(). Both buf_pool_t::validate() and
buf_page_optimistic_get() requires that buf_page_t::in_file()
be protected atomically with buf_page_t::in_page_hash
and buf_page_t::in_LRU_list.
buf_page_optimistic_get(): Now that buf_block_t::mutex
no longer exists, we must check buf_page_t::io_fix()
after acquiring the buf_pool.page_hash lock, to detect
whether buf_page_init_for_read() has been initiated.
We will also check the io_fix() before acquiring hash_lock
in order to avoid unnecessary computation.
The field buf_block_t::modify_clock (protected by buf_block_t::lock)
allows buf_page_optimistic_get() to validate the block.
buf_page_t::real_size: Remove. It was only used while flushing
pages of page_compressed tables.
buf_page_encrypt(): Add an output parameter that allows us ot eliminate
buf_page_t::real_size. Replace a condition with debug assertion.
buf_page_should_punch_hole(): Remove.
buf_dblwr_t::add_to_batch(): Replaces buf_dblwr_add_to_batch().
Add the parameter size (to replace buf_page_t::real_size).
buf_dblwr_t::write_single_page(): Replaces buf_dblwr_write_single_page().
Add the parameter size (to replace buf_page_t::real_size).
fil_system_t::detach(): Replaces fil_space_detach().
Ensure that fil_validate() will not be violated even if
fil_system.mutex is released and reacquired.
fil_node_t::complete_io(): Renamed from fil_node_complete_io().
fil_node_t::close_to_free(): Replaces fil_node_close_to_free().
Avoid invoking fil_node_t::close() because fil_system.n_open
has already been decremented in fil_space_t::detach().
BUF_BLOCK_READY_FOR_USE: Remove. Directly use BUF_BLOCK_MEMORY.
BUF_BLOCK_ZIP_DIRTY: Remove. Directly use BUF_BLOCK_ZIP_PAGE,
and distinguish dirty pages by buf_page_t::oldest_modification().
BUF_BLOCK_POOL_WATCH: Remove. Use BUF_BLOCK_NOT_USED instead.
This state was only being used for buf_page_t that are in
buf_pool.watch.
buf_pool_t::watch[]: Remove pointer indirection.
buf_page_t::in_flush_list: Remove. It was set if and only if
buf_page_t::oldest_modification() is nonzero.
buf_page_decrypt_after_read(), buf_corrupt_page_release(),
buf_page_check_corrupt(): Change the const fil_space_t* parameter
to const fil_node_t& so that we can report the correct file name.
buf_page_monitor(): Declare as an ATTRIBUTE_COLD global function.
buf_page_io_complete(): Split to buf_page_read_complete() and
buf_page_write_complete().
buf_dblwr_t::in_use: Remove.
buf_dblwr_t::buf_block_array: Add IORequest::flush_t.
buf_dblwr_sync_datafiles(): Remove. It was a useless wrapper of
os_aio_wait_until_no_pending_writes().
buf_flush_write_complete(): Declare static, not global.
Add the parameter IORequest::flush_t.
buf_flush_freed_page(): Simplify the code.
recv_sys_t::flush_lru: Renamed from flush_type and changed to bool.
fil_read(), fil_write(): Replaced with direct use of fil_io().
fil_buffering_disabled(): Remove. Check srv_file_flush_method directly.
fil_mutex_enter_and_prepare_for_io(): Return the resolved
fil_space_t* to avoid a duplicated lookup in the caller.
fil_report_invalid_page_access(): Clean up the parameters.
fil_io(): Return fil_io_t, which comprises fil_node_t and error code.
Always invoke fil_space_t::acquire_for_io() and let either the
sync=true caller or fil_aio_callback() invoke
fil_space_t::release_for_io().
fil_aio_callback(): Rewrite to replace buf_page_io_complete().
fil_check_pending_operations(): Remove a parameter, and remove some
redundant lookups.
fil_node_close_to_free(): Wait for n_pending==0. Because we no longer
do an extra lookup of the tablespace between fil_io() and the
completion of the operation, we must give fil_node_t::complete_io() a
chance to decrement the counter.
fil_close_tablespace(): Remove unused parameter trx, and document
that this is only invoked during the error handling of IMPORT TABLESPACE.
row_import_discard_changes(): Merged with the only caller,
row_import_cleanup(). Do not lock up the data dictionary while
invoking fil_close_tablespace().
logs_empty_and_mark_files_at_shutdown(): Do not invoke
fil_close_all_files(), to avoid a !needs_flush assertion failure
on fil_node_t::close().
innodb_shutdown(): Invoke os_aio_free() before fil_close_all_files().
fil_close_all_files(): Invoke fil_flush_file_spaces()
to ensure proper durability.
thread_pool::unbind(): Fix a crash that would occur on Windows
after srv_thread_pool->disable_aio() and os_file_close().
This fix was submitted by Vladislav Vaintroub.
Thanks to Matthias Leich and Axel Schwenke for extensive testing,
Vladislav Vaintroub for helpful comments, and Eugene Kosov for a review.
2020-06-05 12:35:46 +03:00
|
|
|
page_no = btr_cur_get_block(cursor)->page.id().page_no();
|
2016-08-12 11:17:45 +03:00
|
|
|
index = btr_cur_get_index(cursor);
|
|
|
|
|
|
|
|
ut_ad(srv_read_only_mode
|
2020-06-10 07:43:58 +03:00
|
|
|
|| mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
|
|
|
|
| MTR_MEMO_SX_LOCK));
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
ut_ad(dict_index_get_page(index) != page_no);
|
|
|
|
|
2018-02-13 23:02:46 +03:00
|
|
|
level = btr_page_get_level(btr_cur_get_page(cursor));
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
user_rec = btr_cur_get_rec(cursor);
|
|
|
|
ut_a(page_rec_is_user_rec(user_rec));
|
|
|
|
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets = rec_get_offsets(user_rec, index, offsets,
|
|
|
|
level ? 0 : index->n_fields,
|
2016-08-12 11:17:45 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
|
|
|
rtr_get_mbr_from_rec(user_rec, offsets, &mbr);
|
|
|
|
|
|
|
|
tuple = rtr_index_build_node_ptr(
|
2018-05-01 01:10:37 +03:00
|
|
|
index, &mbr, user_rec, page_no, heap);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
if (sea_cur && !sea_cur->rtr_info) {
|
|
|
|
sea_cur = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
rtr_get_father_node(index, level + 1, tuple, sea_cur, cursor,
|
|
|
|
page_no, mtr);
|
|
|
|
|
|
|
|
node_ptr = btr_cur_get_rec(cursor);
|
|
|
|
ut_ad(!page_rec_is_comp(node_ptr)
|
|
|
|
|| rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets = rec_get_offsets(node_ptr, index, offsets, 0,
|
2016-08-12 11:17:45 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
|
|
|
|
|
|
|
ulint child_page = btr_node_ptr_get_child_page_no(node_ptr, offsets);
|
|
|
|
|
|
|
|
if (child_page != page_no) {
|
|
|
|
const rec_t* print_rec;
|
|
|
|
|
|
|
|
ib::fatal error;
|
|
|
|
|
|
|
|
error << "Corruption of index " << index->name
|
|
|
|
<< " of table " << index->table->name
|
|
|
|
<< " parent page " << page_no
|
|
|
|
<< " child page " << child_page;
|
|
|
|
|
|
|
|
print_rec = page_rec_get_next(
|
|
|
|
page_get_infimum_rec(page_align(user_rec)));
|
2017-09-19 19:20:11 +03:00
|
|
|
offsets = rec_get_offsets(print_rec, index, offsets,
|
2021-04-13 10:28:13 +03:00
|
|
|
page_rec_is_leaf(user_rec)
|
|
|
|
? index->n_fields : 0,
|
2017-09-19 19:20:11 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
2016-08-12 11:17:45 +03:00
|
|
|
error << "; child ";
|
|
|
|
rec_print(error.m_oss, print_rec,
|
|
|
|
rec_get_info_bits(print_rec, rec_offs_comp(offsets)),
|
|
|
|
offsets);
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets = rec_get_offsets(node_ptr, index, offsets, 0,
|
2016-08-12 11:17:45 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
|
|
|
error << "; parent ";
|
|
|
|
rec_print(error.m_oss, print_rec,
|
|
|
|
rec_get_info_bits(print_rec, rec_offs_comp(offsets)),
|
|
|
|
offsets);
|
|
|
|
|
|
|
|
error << ". You should dump + drop + reimport the table to"
|
|
|
|
" fix the corruption. If the crash happens at"
|
2018-01-10 13:53:44 +02:00
|
|
|
" database startup, see "
|
2018-12-03 13:54:32 +02:00
|
|
|
"https://mariadb.com/kb/en/library/innodb-recovery-modes/"
|
2018-01-10 13:53:44 +02:00
|
|
|
" about forcing"
|
2016-08-12 11:17:45 +03:00
|
|
|
" recovery. Then dump + drop + reimport.";
|
|
|
|
}
|
|
|
|
|
|
|
|
return(offsets);
|
|
|
|
}
|
|
|
|
|
2015-05-26 10:01:12 +03:00
|
|
|
/************************************************************//**
|
|
|
|
Returns the father block to a page. It is assumed that mtr holds
|
|
|
|
an X or SX latch on the tree.
|
|
|
|
@return rec_get_offsets() of the node pointer record */
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs*
|
2015-05-26 10:01:12 +03:00
|
|
|
rtr_page_get_father_block(
|
|
|
|
/*======================*/
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs* offsets,/*!< in: work area for the return value */
|
2015-05-26 10:01:12 +03:00
|
|
|
mem_heap_t* heap, /*!< in: memory heap to use */
|
|
|
|
dict_index_t* index, /*!< in: b-tree index */
|
|
|
|
buf_block_t* block, /*!< in: child page in the index */
|
|
|
|
mtr_t* mtr, /*!< in: mtr */
|
|
|
|
btr_cur_t* sea_cur,/*!< in: search cursor, contains information
|
|
|
|
about parent nodes in search */
|
|
|
|
btr_cur_t* cursor) /*!< out: cursor on node pointer record,
|
|
|
|
its page x-latched */
|
|
|
|
{
|
|
|
|
rec_t* rec = page_rec_get_next(
|
|
|
|
page_get_infimum_rec(buf_block_get_frame(block)));
|
|
|
|
btr_cur_position(index, rec, block, cursor);
|
|
|
|
|
|
|
|
return(rtr_page_get_father_node_ptr(offsets, heap, sea_cur,
|
|
|
|
cursor, mtr));
|
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/*******************************************************************//**
Create a RTree search info structure */
rtr_info_t*
rtr_create_rtr_info(
/******************/
	bool		need_prdt,	/*!< in: Whether predicate lock
					is needed */
	bool		init_matches,	/*!< in: Whether to initiate the
					"matches" structure for collecting
					matched leaf records */
	btr_cur_t*	cursor,		/*!< in: tree search cursor */
	dict_index_t*	index)		/*!< in: index struct */
{
	rtr_info_t*	rtr_info;

	index = index ? index : cursor->index;
	ut_ad(index);

	rtr_info = static_cast<rtr_info_t*>(ut_zalloc_nokey(sizeof(*rtr_info)));

	rtr_info->allocated = true;
	rtr_info->cursor = cursor;
	rtr_info->index = index;

	if (init_matches) {
		rtr_info->heap = mem_heap_create(sizeof(*(rtr_info->matches)));
		rtr_info->matches = static_cast<matched_rec_t*>(
			mem_heap_zalloc(
				rtr_info->heap,
				sizeof(*rtr_info->matches)));

		rtr_info->matches->matched_recs
			= UT_NEW_NOKEY(rtr_rec_vector());

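		/* page_align() rounds the pointer down to a page-size
		boundary, so bufp is a page-aligned address inside rec_buf;
		matched leaf records can then be copied there and handled
		as if they resided on a regular index page. */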
		rtr_info->matches->bufp = page_align(rtr_info->matches->rec_buf
						     + UNIV_PAGE_SIZE_MAX + 1);
		mutex_create(LATCH_ID_RTR_MATCH_MUTEX,
			     &rtr_info->matches->rtr_match_mutex);
		rw_lock_create(PFS_NOT_INSTRUMENTED,
			       &(rtr_info->matches->block.lock),
			       SYNC_LEVEL_VARYING);
	}

	rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t());
	rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t());
	rtr_info->need_prdt_lock = need_prdt;
	mutex_create(LATCH_ID_RTR_PATH_MUTEX,
		     &rtr_info->rtr_path_mutex);

	mutex_enter(&index->rtr_track->rtr_active_mutex);
	index->rtr_track->rtr_active.push_front(rtr_info);
	mutex_exit(&index->rtr_track->rtr_active_mutex);
	return(rtr_info);
}

/*******************************************************************//**
Update a btr_cur_t with rtr_info */
void
rtr_info_update_btr(
/******************/
	btr_cur_t*	cursor,		/*!< in/out: tree cursor */
	rtr_info_t*	rtr_info)	/*!< in: rtr_info to set to the
					cursor */
{
	ut_ad(rtr_info);

	cursor->rtr_info = rtr_info;
}

/*******************************************************************//**
Initialize a R-Tree Search structure */
void
rtr_init_rtr_info(
/****************/
	rtr_info_t*	rtr_info,	/*!< in: rtr_info to set to the
					cursor */
	bool		need_prdt,	/*!< in: Whether predicate lock is
					needed */
	btr_cur_t*	cursor,		/*!< in: tree search cursor */
	dict_index_t*	index,		/*!< in: index structure */
	bool		reinit)		/*!< in: Whether this is a reinit */
{
	ut_ad(rtr_info);

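	/* On a first-time initialization all members are reset and the
	path latch is created; on a reinit the existing rtr_path_mutex,
	heap and "matches" buffer are kept, and only the path vectors are
	allocated anew below. */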
	if (!reinit) {
		/* Reset all members. */
		rtr_info->path = NULL;
		rtr_info->parent_path = NULL;
		rtr_info->matches = NULL;

		mutex_create(LATCH_ID_RTR_PATH_MUTEX,
			     &rtr_info->rtr_path_mutex);

		memset(rtr_info->tree_blocks, 0x0,
		       sizeof(rtr_info->tree_blocks));
		memset(rtr_info->tree_savepoints, 0x0,
		       sizeof(rtr_info->tree_savepoints));
		rtr_info->mbr.xmin = 0.0;
		rtr_info->mbr.xmax = 0.0;
		rtr_info->mbr.ymin = 0.0;
		rtr_info->mbr.ymax = 0.0;
		rtr_info->thr = NULL;
		rtr_info->heap = NULL;
		rtr_info->cursor = NULL;
		rtr_info->index = NULL;
		rtr_info->need_prdt_lock = false;
		rtr_info->need_page_lock = false;
		rtr_info->allocated = false;
		rtr_info->mbr_adj = false;
		rtr_info->fd_del = false;
		rtr_info->search_tuple = NULL;
		rtr_info->search_mode = PAGE_CUR_UNSUPP;
	}

	ut_ad(!rtr_info->matches || rtr_info->matches->matched_recs->empty());

	rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t());
	rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t());
	rtr_info->need_prdt_lock = need_prdt;
	rtr_info->cursor = cursor;
	rtr_info->index = index;

	mutex_enter(&index->rtr_track->rtr_active_mutex);
	index->rtr_track->rtr_active.push_front(rtr_info);
	mutex_exit(&index->rtr_track->rtr_active_mutex);
}

/**************************************************************//**
Clean up R-Tree search structure */
void
rtr_clean_rtr_info(
/*===============*/
	rtr_info_t*	rtr_info,	/*!< in: RTree search info */
	bool		free_all)	/*!< in: need to free rtr_info itself */
{
	dict_index_t*	index;
	bool		initialized = false;

	if (!rtr_info) {
		return;
	}

	index = rtr_info->index;

	if (index) {
		mutex_enter(&index->rtr_track->rtr_active_mutex);
	}

	while (rtr_info->parent_path && !rtr_info->parent_path->empty()) {
		btr_pcur_t*	cur = rtr_info->parent_path->back().cursor;
		rtr_info->parent_path->pop_back();

		if (cur) {
			btr_pcur_close(cur);
			ut_free(cur);
		}
	}

	UT_DELETE(rtr_info->parent_path);
	rtr_info->parent_path = NULL;

	if (rtr_info->path != NULL) {
		UT_DELETE(rtr_info->path);
		rtr_info->path = NULL;
		initialized = true;
	}

	if (rtr_info->matches) {
		rtr_info->matches->used = false;
		rtr_info->matches->locked = false;
		rtr_info->matches->valid = false;
		rtr_info->matches->matched_recs->clear();
	}

	if (index) {
		index->rtr_track->rtr_active.remove(rtr_info);
		mutex_exit(&index->rtr_track->rtr_active_mutex);
	}

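	/* free_all is set when the whole search structure is being torn
	down; otherwise only the per-search state above is reset so that
	the structure can be reused for the next search. */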
	if (free_all) {
		if (rtr_info->matches) {
			if (rtr_info->matches->matched_recs != NULL) {
				UT_DELETE(rtr_info->matches->matched_recs);
			}

			rw_lock_free(&(rtr_info->matches->block.lock));

			mutex_destroy(&rtr_info->matches->rtr_match_mutex);
		}

		if (rtr_info->heap) {
			mem_heap_free(rtr_info->heap);
		}

		if (initialized) {
			mutex_destroy(&rtr_info->rtr_path_mutex);
		}

		if (rtr_info->allocated) {
			ut_free(rtr_info);
		}
	}
}

/**************************************************************//**
Rebuild the "path" to exclude the page that is being removed */
static
void
rtr_rebuild_path(
/*=============*/
	rtr_info_t*	rtr_info,	/*!< in: RTree search info */
	ulint		page_no)	/*!< in: number of the page to be
					excluded from the path */
{
	rtr_node_path_t*		new_path
		= UT_NEW_NOKEY(rtr_node_path_t());

	rtr_node_path_t::iterator	rit;
#ifdef UNIV_DEBUG
	ulint	before_size = rtr_info->path->size();
#endif /* UNIV_DEBUG */

	for (rit = rtr_info->path->begin();
	     rit != rtr_info->path->end(); ++rit) {
		node_visit_t	next_rec = *rit;

		if (next_rec.page_no == page_no) {
			continue;
		}

		new_path->push_back(next_rec);
#ifdef UNIV_DEBUG
		node_visit_t	rec = new_path->back();
		ut_ad(rec.level < rtr_info->cursor->tree_height
		      && rec.page_no > 0);
#endif /* UNIV_DEBUG */
	}

	UT_DELETE(rtr_info->path);

	ut_ad(new_path->size() == before_size - 1);

	rtr_info->path = new_path;

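	/* The parent path must be filtered the same way. A parent entry
	that refers to the removed page may own a persistent cursor, which
	is closed and freed before the entry is dropped. */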
	if (!rtr_info->parent_path->empty()) {
		rtr_node_path_t*	new_parent_path = UT_NEW_NOKEY(
			rtr_node_path_t());

		for (rit = rtr_info->parent_path->begin();
		     rit != rtr_info->parent_path->end(); ++rit) {
			node_visit_t	next_rec = *rit;

			if (next_rec.child_no == page_no) {
				btr_pcur_t*	cur = next_rec.cursor;

				if (cur) {
					btr_pcur_close(cur);
					ut_free(cur);
				}

				continue;
			}

			new_parent_path->push_back(next_rec);
		}
		UT_DELETE(rtr_info->parent_path);
		rtr_info->parent_path = new_parent_path;
	}
}

/**************************************************************//**
Check whether a page that is being discarded is in any active search
path and, if so, remove it from that path */
void
rtr_check_discard_page(
/*===================*/
	dict_index_t*	index,	/*!< in: index */
	btr_cur_t*	cursor, /*!< in: cursor on the page to discard: not on
				the root page */
	buf_block_t*	block)	/*!< in: block of page to be discarded */
{
	const ulint pageno = block->page.id().page_no();

	mutex_enter(&index->rtr_track->rtr_active_mutex);

	for (const auto& rtr_info : index->rtr_track->rtr_active) {
		if (cursor && rtr_info == cursor->rtr_info) {
			continue;
		}

		mutex_enter(&rtr_info->rtr_path_mutex);
		for (const node_visit_t& node : *rtr_info->path) {
			if (node.page_no == pageno) {
				rtr_rebuild_path(rtr_info, pageno);
				break;
			}
		}
		mutex_exit(&rtr_info->rtr_path_mutex);

		if (rtr_info->matches) {
			mutex_enter(&rtr_info->matches->rtr_match_mutex);

			if ((&rtr_info->matches->block)->page.id().page_no()
			     == pageno) {
				if (!rtr_info->matches->matched_recs->empty()) {
					rtr_info->matches->matched_recs->clear();
				}
				ut_ad(rtr_info->matches->matched_recs->empty());
				rtr_info->matches->valid = false;
			}

			mutex_exit(&rtr_info->matches->rtr_match_mutex);
		}
	}

	mutex_exit(&index->rtr_track->rtr_active_mutex);

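	/* Also release any predicate locks that are attached to the page
	being discarded. */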
	lock_mutex_enter();
	lock_prdt_page_free_from_discard(block, &lock_sys.prdt_hash);
	lock_prdt_page_free_from_discard(block, &lock_sys.prdt_page_hash);
	lock_mutex_exit();
}

/** Functor that attempts an optimistic access of a page.
It returns true if it successfully gets the page. */
struct optimistic_get
{
  btr_pcur_t *const r_cursor;
  mtr_t *const mtr;

  optimistic_get(btr_pcur_t *r_cursor, mtr_t *mtr)
  : r_cursor(r_cursor), mtr(mtr) {}

  bool operator()(buf_block_t *hint) const
  {
    return hint && buf_page_optimistic_get(
        RW_X_LATCH, hint, r_cursor->modify_clock, __FILE__,
        __LINE__, mtr);
  }
};

/** Restore the stored position of a persistent cursor, buffer-fixing
the page */
static
bool
rtr_cur_restore_position(
	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	btr_cur,	/*!< in: detached persistent cursor */
	ulint		level,		/*!< in: index level */
	mtr_t*		mtr)		/*!< in: mtr */
{
	dict_index_t*	index;
	mem_heap_t*	heap;
	btr_pcur_t*	r_cursor = rtr_get_parent_cursor(btr_cur, level, false);
	dtuple_t*	tuple;
	bool		ret = false;

	ut_ad(mtr);
	ut_ad(r_cursor);
	ut_ad(mtr->is_active());

	index = btr_cur_get_index(btr_cur);

	if (r_cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
	    || r_cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
		return(false);
	}

	DBUG_EXECUTE_IF(
		"rtr_pessimistic_position",
		r_cursor->modify_clock = 100;
	);

	ut_ad(latch_mode == BTR_CONT_MODIFY_TREE);

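	/* First try to restore the position optimistically: if the block
	remembered in block_when_stored is still valid (its modify_clock
	has not changed), the stored cursor position can be reused without
	a fresh search. */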
	if (r_cursor->block_when_stored.run_with_hint(
		    optimistic_get(r_cursor, mtr))) {
		ut_ad(r_cursor->pos_state == BTR_PCUR_IS_POSITIONED);

		ut_ad(r_cursor->rel_pos == BTR_PCUR_ON);
#ifdef UNIV_DEBUG
		do {
			const rec_t*	rec;
			const rec_offs*	offsets1;
			const rec_offs*	offsets2;
			ulint		comp;

			rec = btr_pcur_get_rec(r_cursor);

			heap = mem_heap_create(256);
			offsets1 = rec_get_offsets(
				r_cursor->old_rec, index, NULL,
				level ? 0 : r_cursor->old_n_fields,
				r_cursor->old_n_fields, &heap);
			offsets2 = rec_get_offsets(
				rec, index, NULL,
				level ? 0 : r_cursor->old_n_fields,
				r_cursor->old_n_fields, &heap);

			comp = rec_offs_comp(offsets1);

			if (rec_get_info_bits(r_cursor->old_rec, comp)
			    & REC_INFO_MIN_REC_FLAG) {
				ut_ad(rec_get_info_bits(rec, comp)
				      & REC_INFO_MIN_REC_FLAG);
			} else {

				ut_ad(!cmp_rec_rec(r_cursor->old_rec,
						   rec, offsets1, offsets2,
						   index));
			}

			mem_heap_free(heap);
		} while (0);
#endif /* UNIV_DEBUG */

		return(true);
	}

	/* Page has changed, for R-Tree, the page cannot be shrunk away,
	so we search the page and its right siblings */
	buf_block_t*	block;
	node_seq_t	page_ssn;
	const page_t*	page;
	page_cur_t*	page_cursor;
	node_visit_t*	node = rtr_get_parent_node(btr_cur, level, false);
	node_seq_t	path_ssn = node->seq_no;
	const unsigned	zip_size = index->table->space->zip_size();
	uint32_t	page_no = node->page_no;

	heap = mem_heap_create(256);

	tuple = dict_index_build_data_tuple(r_cursor->old_rec, index, !level,
					    r_cursor->old_n_fields, heap);

	page_cursor = btr_pcur_get_page_cur(r_cursor);
	ut_ad(r_cursor == node->cursor);

search_again:
	dberr_t err = DB_SUCCESS;

	block = buf_page_get_gen(
		page_id_t(index->table->space_id, page_no),
		zip_size, RW_X_LATCH, NULL,
		BUF_GET, __FILE__, __LINE__, mtr, &err);

	ut_ad(block);

	/* Get the page SSN */
	page = buf_block_get_frame(block);
	page_ssn = page_get_ssn_id(page);

	ulint low_match = page_cur_search(
		block, index, tuple, PAGE_CUR_LE, page_cursor);

	if (low_match == r_cursor->old_n_fields) {
		const rec_t*	rec;
		const rec_offs*	offsets1;
		const rec_offs*	offsets2;
		ulint		comp;

		rec = btr_pcur_get_rec(r_cursor);

		offsets1 = rec_get_offsets(r_cursor->old_rec, index, NULL,
					   level ? 0 : r_cursor->old_n_fields,
					   r_cursor->old_n_fields, &heap);
		offsets2 = rec_get_offsets(rec, index, NULL,
					   level ? 0 : r_cursor->old_n_fields,
					   r_cursor->old_n_fields, &heap);

		comp = rec_offs_comp(offsets1);

		if ((rec_get_info_bits(r_cursor->old_rec, comp)
		     & REC_INFO_MIN_REC_FLAG)
		    && (rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG)) {
			r_cursor->pos_state = BTR_PCUR_IS_POSITIONED;
			ret = true;
		} else if (!cmp_rec_rec(r_cursor->old_rec, rec, offsets1,
					offsets2, index)) {
			r_cursor->pos_state = BTR_PCUR_IS_POSITIONED;
			ret = true;
		}
	}

	/* Check the page SSN to see if the page has been split; if so,
	search the right sibling page */
	if (!ret && page_ssn > path_ssn) {
		page_no = btr_page_get_next(page);
		goto search_again;
	}

	mem_heap_free(heap);

	return(ret);
}

/****************************************************************//**
Copy the leaf level R-tree record, and push it to matched_rec in rtr_info */
static
void
rtr_leaf_push_match_rec(
/*====================*/
	const rec_t*	rec,		/*!< in: record to copy */
	rtr_info_t*	rtr_info,	/*!< in/out: search stack */
	rec_offs*	offsets,	/*!< in: offsets */
	bool		is_comp)	/*!< in: is compact format */
{
	byte*		buf;
	matched_rec_t*	match_rec = rtr_info->matches;
	rec_t*		copy;
	ulint		data_len;
	rtr_rec_t	rtr_rec;

	buf = match_rec->block.frame + match_rec->used;
	ut_ad(page_rec_is_leaf(rec));

	copy = rec_copy(buf, rec, offsets);

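	/* Make the copy look like the last user record on a page by
	pointing its next-record offset at the supremum record. */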
	if (is_comp) {
		rec_set_next_offs_new(copy, PAGE_NEW_SUPREMUM);
	} else {
		rec_set_next_offs_old(copy, PAGE_OLD_SUPREMUM);
	}

	rtr_rec.r_rec = copy;
	rtr_rec.locked = false;

	match_rec->matched_recs->push_back(rtr_rec);
	match_rec->valid = true;

	data_len = rec_offs_data_size(offsets) + rec_offs_extra_size(offsets);
	match_rec->used += data_len;

	ut_ad(match_rec->used < srv_page_size);
}

/**************************************************************//**
Store the parent path cursor
@return number of cursors stored */
ulint
rtr_store_parent_path(
/*==================*/
	const buf_block_t*	block,	/*!< in: block of the page */
	btr_cur_t*		btr_cur,/*!< in/out: persistent cursor */
	ulint			latch_mode,
					/*!< in: latch_mode */
	ulint			level,	/*!< in: index level */
	mtr_t*			mtr)	/*!< in: mtr */
{
	ulint	num = btr_cur->rtr_info->parent_path->size();
	ulint	num_stored = 0;

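	/* Walk the parent path from its deepest entry upwards and store
	the position of every cursor that is positioned on the given block;
	stop at the first entry that is above the requested level or that
	points to a different block. */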
	while (num >= 1) {
		node_visit_t*	node = &(*btr_cur->rtr_info->parent_path)[
					num - 1];
		btr_pcur_t*	r_cursor = node->cursor;
		buf_block_t*	cur_block;

		if (node->level > level) {
			break;
		}

		r_cursor->pos_state = BTR_PCUR_IS_POSITIONED;
		r_cursor->latch_mode = latch_mode;

		cur_block = btr_pcur_get_block(r_cursor);

		if (cur_block == block) {
			btr_pcur_store_position(r_cursor, mtr);
			num_stored++;
		} else {
			break;
		}

		num--;
	}

	return(num_stored);
}

/**************************************************************//**
Push a non-leaf index node to the search path for insertion */
static
void
rtr_non_leaf_insert_stack_push(
/*===========================*/
	dict_index_t*		index,	/*!< in: index descriptor */
	rtr_node_path_t*	path,	/*!< in/out: search path */
	ulint			level,	/*!< in: index page level */
	uint32_t		child_no,/*!< in: child page no */
	const buf_block_t*	block,	/*!< in: block of the page */
	const rec_t*		rec,	/*!< in: positioned record */
	double			mbr_inc)/*!< in: MBR needs to be enlarged */
{
	node_seq_t	new_seq;
	btr_pcur_t*	my_cursor;

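	/* Allocate a persistent cursor for the new path entry and position
	it on the given record. The pushed entry keeps a pointer to this
	cursor; cleanup code such as rtr_clean_rtr_info() and
	rtr_rebuild_path() closes and frees these cursors. */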
	my_cursor = static_cast<btr_pcur_t*>(
		ut_malloc_nokey(sizeof(*my_cursor)));

	btr_pcur_init(my_cursor);

	page_cur_position(rec, block, btr_pcur_get_page_cur(my_cursor));

	(btr_pcur_get_btr_cur(my_cursor))->index = index;

	new_seq = rtr_get_current_ssn_id(index);
	rtr_non_leaf_stack_push(path, block->page.id().page_no(),
				new_seq, level, child_no, my_cursor, mbr_inc);
}

/** Copy a buf_block_t, except "block->lock".
@param[in,out]	matches	copy to match->block
@param[in]	block	block to copy */
static
void
rtr_copy_buf(
	matched_rec_t*		matches,
	const buf_block_t*	block)
{
	/* Copy all members of "block" to "matches->block" except "lock".
	We skip "lock" because it is not used
	from the dummy buf_block_t we create here and because memcpy()ing
|
2020-06-05 12:35:46 +03:00
|
|
|
it generates (valid) compiler warnings that the vtable pointer
|
|
|
|
will be copied. */
|
2018-08-03 13:06:03 +03:00
|
|
|
new (&matches->block.page) buf_page_t(block->page);
|
2016-08-12 11:17:45 +03:00
|
|
|
matches->block.frame = block->frame;
|
|
|
|
matches->block.unzip_LRU = block->unzip_LRU;
|
|
|
|
|
|
|
|
ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list);
|
|
|
|
ut_d(matches->block.in_withdraw_list = block->in_withdraw_list);
|
|
|
|
|
|
|
|
/* Skip buf_block_t::lock */
|
|
|
|
matches->block.modify_clock = block->modify_clock;
|
2017-02-23 23:05:12 +02:00
|
|
|
#ifdef BTR_CUR_HASH_ADAPT
|
2016-08-12 11:17:45 +03:00
|
|
|
matches->block.n_hash_helps = block->n_hash_helps;
|
|
|
|
matches->block.n_fields = block->n_fields;
|
|
|
|
matches->block.left_side = block->left_side;
|
|
|
|
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
|
2018-12-28 17:53:50 +04:00
|
|
|
matches->block.n_pointers = 0;
|
2016-08-12 11:17:45 +03:00
|
|
|
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
|
|
|
|
matches->block.curr_n_fields = block->curr_n_fields;
|
|
|
|
matches->block.curr_left_side = block->curr_left_side;
|
|
|
|
matches->block.index = block->index;
|
2017-02-23 23:05:12 +02:00
|
|
|
#endif /* BTR_CUR_HASH_ADAPT */
|
2018-12-28 09:56:46 +02:00
|
|
|
ut_d(matches->block.debug_latch = NULL);
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************//**
|
|
|
|
Generate a shadow copy of the page block header to save the
|
|
|
|
matched records */
|
|
|
|
static
|
|
|
|
void
|
|
|
|
rtr_init_match(
|
|
|
|
/*===========*/
|
|
|
|
matched_rec_t* matches,/*!< in/out: match to initialize */
|
|
|
|
const buf_block_t* block, /*!< in: buffer block */
|
|
|
|
const page_t* page) /*!< in: buffer page */
|
|
|
|
{
|
|
|
|
ut_ad(matches->matched_recs->empty());
|
|
|
|
matches->locked = false;
|
|
|
|
rtr_copy_buf(matches, block);
|
|
|
|
matches->block.frame = matches->bufp;
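/* The dummy block is backed by the matches->bufp buffer; matched leaf
records will be appended into it behind the copied page header and
infimum/supremum (tracked by matches->used). */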
|
|
|
|
matches->valid = false;
|
|
|
|
/* We have to copy PAGE_W*_SUPREMUM_END bytes so that we can
|
|
|
|
use infimum/supremum of this page as normal btr page for search. */
|
|
|
|
memcpy(matches->block.frame, page, page_is_comp(page)
|
|
|
|
? PAGE_NEW_SUPREMUM_END
|
|
|
|
: PAGE_OLD_SUPREMUM_END);
|
|
|
|
matches->used = page_is_comp(page)
|
|
|
|
? PAGE_NEW_SUPREMUM_END
|
|
|
|
: PAGE_OLD_SUPREMUM_END;
|
|
|
|
#ifdef RTR_SEARCH_DIAGNOSTIC
|
|
|
|
ulint pageno = page_get_page_no(page);
|
|
|
|
fprintf(stderr, "INNODB_RTR: Searching leaf page %d\n",
|
|
|
|
static_cast<int>(pageno));
|
|
|
|
#endif /* RTR_SEARCH_DIAGNOSTIC */
|
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************//**
|
|
|
|
Get the bounding box content from an index record */
|
|
|
|
void
|
|
|
|
rtr_get_mbr_from_rec(
|
|
|
|
/*=================*/
|
|
|
|
const rec_t* rec, /*!< in: index record */
|
2020-04-28 10:46:51 +10:00
|
|
|
const rec_offs* offsets,/*!< in: offsets array */
|
2016-08-12 11:17:45 +03:00
|
|
|
rtr_mbr_t* mbr) /*!< out MBR */
|
|
|
|
{
|
|
|
|
ulint rec_f_len;
|
|
|
|
const byte* data;
|
|
|
|
|
|
|
|
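/* The MBR is the first field of any spatial index record; it
occupies DATA_MBR_LEN bytes (four machine-format doubles). */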
data = rec_get_nth_field(rec, offsets, 0, &rec_f_len);
|
|
|
|
|
|
|
|
rtr_read_mbr(data, mbr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/****************************************************************//**
|
|
|
|
Get the bounding box content from a MBR data record */
|
|
|
|
void
|
|
|
|
rtr_get_mbr_from_tuple(
|
|
|
|
/*===================*/
|
|
|
|
const dtuple_t* dtuple, /*!< in: data tuple */
|
|
|
|
rtr_mbr* mbr) /*!< out: mbr to fill */
|
|
|
|
{
|
|
|
|
const dfield_t* dtuple_field;
|
|
|
|
ulint dtuple_f_len;
|
|
|
|
|
|
|
|
dtuple_field = dtuple_get_nth_field(dtuple, 0);
|
|
|
|
dtuple_f_len = dfield_get_len(dtuple_field);
|
|
|
|
ut_a(dtuple_f_len >= 4 * sizeof(double));
|
|
|
|
|
2019-05-03 16:47:07 +03:00
|
|
|
rtr_read_mbr(static_cast<const byte*>(dfield_get_data(dtuple_field)),
|
|
|
|
mbr);
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
2020-03-12 13:58:45 +02:00
|
|
|
/** Compare minimum bounding rectangles.
|
|
|
|
@return 1, 0, or -1 if mode == PAGE_CUR_MBR_EQUAL; otherwise
|
|
|
|
1 or 0, depending on whether a and b satisfy the compared
|
|
|
|
relationship (CONTAINS, WITHIN etc.) */
|
|
|
|
static int cmp_gis_field(page_cur_mode_t mode, const void *a, const void *b)
|
|
|
|
{
|
|
|
|
return mode == PAGE_CUR_MBR_EQUAL
|
|
|
|
? cmp_geometry_field(a, b)
|
|
|
|
: rtree_key_cmp(mode, a, b);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Compare a GIS data tuple to a physical record in rtree non-leaf node.
|
|
|
|
We need to check the page number field, since we don't store pk field in
|
|
|
|
rtree non-leaf node.
|
|
|
|
@param[in] dtuple data tuple
|
|
|
|
@param[in] rec R-tree record
|
|
|
|
@return whether dtuple is less than rec */
|
|
|
|
static bool
|
|
|
|
cmp_dtuple_rec_with_gis_internal(const dtuple_t* dtuple, const rec_t* rec)
|
|
|
|
{
|
|
|
|
const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0);
|
|
|
|
ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN);
|
|
|
|
|
|
|
|
if (cmp_gis_field(PAGE_CUR_WITHIN, dfield_get_data(dtuple_field), rec))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
dtuple_field= dtuple_get_nth_field(dtuple, 1);
|
|
|
|
ut_ad(dfield_get_len(dtuple_field) == 4); /* child page number */
|
|
|
|
ut_ad(dtuple_field->type.mtype == DATA_SYS_CHILD);
|
|
|
|
ut_ad(!(dtuple_field->type.prtype & ~DATA_NOT_NULL));
|
|
|
|
|
|
|
|
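/* In a node pointer record the 4-byte child page number is stored
right after the MBR, so compare it to distinguish entries whose
MBRs are identical. */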
return memcmp(dtuple_field->data, rec + DATA_MBR_LEN, 4) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef UNIV_DEBUG
|
|
|
|
static
|
|
|
|
#endif
|
|
|
|
/** Compare a GIS data tuple to a physical record.
|
|
|
|
@param[in] dtuple data tuple
|
|
|
|
@param[in] rec R-tree record
|
|
|
|
@param[in] mode compare mode
|
|
|
|
@retval negative if dtuple is less than rec */
|
|
|
|
int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
|
|
|
|
page_cur_mode_t mode)
|
|
|
|
{
|
|
|
|
const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0);
|
|
|
|
/* FIXME: TABLE_SHARE::init_from_binary_frm_image() is adding
|
|
|
|
field->key_part_length_bytes() to the key length */
|
|
|
|
ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN ||
|
|
|
|
dfield_get_len(dtuple_field) == DATA_MBR_LEN + 2);
|
|
|
|
|
|
|
|
return cmp_gis_field(mode, dfield_get_data(dtuple_field), rec);
|
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/****************************************************************//**
|
|
|
|
Searches the right position in rtree for a page cursor. */
|
|
|
|
bool
|
|
|
|
rtr_cur_search_with_match(
|
|
|
|
/*======================*/
|
|
|
|
const buf_block_t* block, /*!< in: buffer block */
|
|
|
|
dict_index_t* index, /*!< in: index descriptor */
|
|
|
|
const dtuple_t* tuple, /*!< in: data tuple */
|
|
|
|
page_cur_mode_t mode, /*!< in: PAGE_CUR_RTREE_INSERT,
|
|
|
|
PAGE_CUR_RTREE_LOCATE etc. */
|
|
|
|
page_cur_t* cursor, /*!< in/out: page cursor */
|
|
|
|
rtr_info_t* rtr_info)/*!< in/out: search stack */
|
|
|
|
{
|
|
|
|
bool found = false;
|
|
|
|
const page_t* page;
|
|
|
|
const rec_t* rec;
|
|
|
|
const rec_t* last_rec;
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
|
|
|
|
rec_offs* offsets = offsets_;
|
2016-08-12 11:17:45 +03:00
|
|
|
mem_heap_t* heap = NULL;
|
|
|
|
int cmp = 1;
|
|
|
|
double least_inc = DBL_MAX;
|
|
|
|
const rec_t* best_rec;
|
|
|
|
const rec_t* last_match_rec = NULL;
|
|
|
|
bool match_init = false;
|
|
|
|
page_cur_mode_t orig_mode = mode;
|
|
|
|
const rec_t* first_rec = NULL;
|
|
|
|
|
|
|
|
rec_offs_init(offsets_);
|
|
|
|
|
|
|
|
ut_ad(RTREE_SEARCH_MODE(mode));
|
|
|
|
|
|
|
|
ut_ad(dict_index_is_spatial(index));
|
|
|
|
|
|
|
|
page = buf_block_get_frame(block);
|
|
|
|
|
2018-02-13 23:02:46 +03:00
|
|
|
const ulint level = btr_page_get_level(page);
|
2021-04-13 10:28:13 +03:00
|
|
|
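/* n_core is index->n_fields on leaf pages and 0 on internal pages;
it is passed to rec_get_offsets() and doubles as the leaf-page flag
below. */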
const ulint n_core = level ? 0 : index->n_fields;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
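/* On non-leaf levels, locating the node pointer of a record means
finding an entry whose MBR contains the search MBR, so LOCATE is
performed as a WITHIN comparison. */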
if (mode == PAGE_CUR_RTREE_LOCATE) {
|
|
|
|
ut_ad(level != 0);
|
|
|
|
mode = PAGE_CUR_WITHIN;
|
|
|
|
}
|
|
|
|
|
|
|
|
rec = page_dir_slot_get_rec(page_dir_get_nth_slot(page, 0));
|
|
|
|
|
|
|
|
last_rec = rec;
|
|
|
|
best_rec = rec;
|
|
|
|
|
|
|
|
if (page_rec_is_infimum(rec)) {
|
|
|
|
rec = page_rec_get_next_const(rec);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check whether the insert tuple is larger than the first rec; if
|
|
|
|
so, try to avoid that rec if possible */
|
|
|
|
if (mode == PAGE_CUR_RTREE_INSERT && !page_rec_is_supremum(rec)) {
|
|
|
|
|
|
|
|
ulint new_rec_size = rec_get_converted_size(index, tuple, 0);
|
|
|
|
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets = rec_get_offsets(rec, index, offsets, n_core,
|
2016-08-12 11:17:45 +03:00
|
|
|
dtuple_get_n_fields_cmp(tuple),
|
|
|
|
&heap);
|
|
|
|
|
|
|
|
if (rec_offs_size(offsets) < new_rec_size) {
|
|
|
|
first_rec = rec;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If this is the left-most page of this index level
|
|
|
|
and the table is a compressed table, try to avoid the
|
|
|
|
first rec as much as possible, as there will be problems
|
|
|
|
when updating the MIN_REC rec in a compressed table */
|
2019-08-08 22:53:33 +03:00
|
|
|
if (is_buf_block_get_page_zip(block)
|
2018-02-08 22:34:21 +02:00
|
|
|
&& !page_has_prev(page)
|
2016-08-12 11:17:45 +03:00
|
|
|
&& page_get_n_recs(page) >= 2) {
|
|
|
|
|
|
|
|
rec = page_rec_get_next_const(rec);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (!page_rec_is_supremum(rec)) {
|
2021-04-13 10:28:13 +03:00
|
|
|
if (!n_core) {
|
2016-08-12 11:17:45 +03:00
|
|
|
switch (mode) {
|
|
|
|
case PAGE_CUR_CONTAIN:
|
|
|
|
case PAGE_CUR_INTERSECT:
|
|
|
|
case PAGE_CUR_MBR_EQUAL:
|
|
|
|
/* At non-leaf level, we will need to check
|
|
|
|
both CONTAIN and INTERSECT for either of
|
|
|
|
the search modes */
|
|
|
|
cmp = cmp_dtuple_rec_with_gis(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec, PAGE_CUR_CONTAIN);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
if (cmp != 0) {
|
|
|
|
cmp = cmp_dtuple_rec_with_gis(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec,
|
2016-08-12 11:17:45 +03:00
|
|
|
PAGE_CUR_INTERSECT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case PAGE_CUR_DISJOINT:
|
|
|
|
cmp = cmp_dtuple_rec_with_gis(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec, mode);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
if (cmp != 0) {
|
|
|
|
cmp = cmp_dtuple_rec_with_gis(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec,
|
2016-08-12 11:17:45 +03:00
|
|
|
PAGE_CUR_INTERSECT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case PAGE_CUR_RTREE_INSERT:
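/* Choose the entry whose MBR needs the least area
enlargement to cover the new entry; an entry that
already contains it (WITHIN) is taken immediately
via the break below. */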
|
|
|
|
double increase;
|
|
|
|
double area;
|
|
|
|
|
|
|
|
cmp = cmp_dtuple_rec_with_gis(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec, PAGE_CUR_WITHIN);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
if (cmp != 0) {
|
|
|
|
increase = rtr_rec_cal_increase(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec, &area);
|
2016-08-12 11:17:45 +03:00
|
|
|
/* Once it goes beyond DBL_MAX,
|
|
|
|
it would not make sense to record
|
|
|
|
such a value; just make it
|
|
|
|
DBL_MAX / 2 */
|
|
|
|
if (increase >= DBL_MAX) {
|
|
|
|
increase = DBL_MAX / 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (increase < least_inc) {
|
|
|
|
least_inc = increase;
|
|
|
|
best_rec = rec;
|
|
|
|
} else if (best_rec
|
|
|
|
&& best_rec == first_rec) {
|
|
|
|
/* if first_rec is set,
|
|
|
|
we will try to avoid it */
|
|
|
|
least_inc = increase;
|
|
|
|
best_rec = rec;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2016-09-06 09:43:16 +03:00
|
|
|
case PAGE_CUR_RTREE_GET_FATHER:
|
|
|
|
cmp = cmp_dtuple_rec_with_gis_internal(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec);
|
2016-09-06 09:43:16 +03:00
|
|
|
break;
|
2016-08-12 11:17:45 +03:00
|
|
|
default:
|
|
|
|
/* WITHIN etc. */
|
|
|
|
cmp = cmp_dtuple_rec_with_gis(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec, mode);
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* At leaf level, INSERT should translate to LE */
|
|
|
|
ut_ad(mode != PAGE_CUR_RTREE_INSERT);
|
|
|
|
|
|
|
|
cmp = cmp_dtuple_rec_with_gis(
|
2020-03-12 13:58:45 +02:00
|
|
|
tuple, rec, mode);
|
2016-08-12 11:17:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (cmp == 0) {
|
|
|
|
found = true;
|
|
|
|
|
|
|
|
/* If located, the matching node/rec will be pushed
|
|
|
|
to rtr_info->path for non-leaf nodes, or
|
|
|
|
rtr_info->matches for leaf nodes */
|
|
|
|
if (rtr_info && mode != PAGE_CUR_RTREE_INSERT) {
|
2021-04-13 10:28:13 +03:00
|
|
|
if (!n_core) {
|
2020-10-15 16:28:19 +03:00
|
|
|
uint32_t page_no;
|
2016-08-12 11:17:45 +03:00
|
|
|
node_seq_t new_seq;
|
2016-09-06 09:43:16 +03:00
|
|
|
bool is_loc;
|
|
|
|
|
|
|
|
is_loc = (orig_mode
|
|
|
|
== PAGE_CUR_RTREE_LOCATE
|
|
|
|
|| orig_mode
|
|
|
|
== PAGE_CUR_RTREE_GET_FATHER);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
offsets = rec_get_offsets(
|
2021-04-13 10:28:13 +03:00
|
|
|
rec, index, offsets, 0,
|
2016-08-12 11:17:45 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
|
|
|
|
|
|
|
page_no = btr_node_ptr_get_child_page_no(
|
|
|
|
rec, offsets);
|
|
|
|
|
|
|
|
ut_ad(level >= 1);
|
|
|
|
|
|
|
|
/* Get current SSN, before we insert
|
|
|
|
it into the path stack */
|
|
|
|
new_seq = rtr_get_current_ssn_id(index);
|
|
|
|
|
|
|
|
rtr_non_leaf_stack_push(
|
|
|
|
rtr_info->path,
|
|
|
|
page_no,
|
|
|
|
new_seq, level - 1, 0,
|
|
|
|
NULL, 0);
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
if (is_loc) {
|
2016-08-12 11:17:45 +03:00
|
|
|
rtr_non_leaf_insert_stack_push(
|
|
|
|
index,
|
|
|
|
rtr_info->parent_path,
|
|
|
|
level, page_no, block,
|
|
|
|
rec, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!srv_read_only_mode
|
|
|
|
&& (rtr_info->need_page_lock
|
2016-09-06 09:43:16 +03:00
|
|
|
|| !is_loc)) {
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/* Lock the page, preventing it
|
|
|
|
from being shrunk */
|
|
|
|
lock_place_prdt_page_lock(
|
2020-09-11 15:55:30 +03:00
|
|
|
page_id_t(block->page
|
|
|
|
.id()
|
|
|
|
.space(),
|
|
|
|
page_no),
|
|
|
|
index,
|
2016-08-12 11:17:45 +03:00
|
|
|
rtr_info->thr);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ut_ad(orig_mode
|
|
|
|
!= PAGE_CUR_RTREE_LOCATE);
|
|
|
|
|
|
|
|
if (!match_init) {
|
|
|
|
rtr_init_match(
|
|
|
|
rtr_info->matches,
|
|
|
|
block, page);
|
|
|
|
match_init = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Collect matched records on page */
|
|
|
|
offsets = rec_get_offsets(
|
2021-04-13 10:28:13 +03:00
|
|
|
rec, index, offsets,
|
|
|
|
index->n_fields,
|
2016-08-12 11:17:45 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
|
|
|
rtr_leaf_push_match_rec(
|
|
|
|
rec, rtr_info, offsets,
|
|
|
|
page_is_comp(page));
|
|
|
|
}
|
|
|
|
|
|
|
|
last_match_rec = rec;
|
|
|
|
} else {
|
|
|
|
/* This is the insertion case; it will break
|
|
|
|
once it finds the first MBR that can accommodate
|
|
|
|
the rec being inserted */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
last_rec = rec;
|
|
|
|
|
|
|
|
rec = page_rec_get_next_const(rec);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* All records on page are searched */
|
|
|
|
if (page_rec_is_supremum(rec)) {
|
2021-04-13 10:28:13 +03:00
|
|
|
if (!n_core) {
|
2016-08-12 11:17:45 +03:00
|
|
|
if (!found) {
|
|
|
|
/* No match case; if it is for insertion,
|
|
|
|
then we select the record that results in the
|
|
|
|
least increased area */
|
|
|
|
if (mode == PAGE_CUR_RTREE_INSERT) {
|
|
|
|
ut_ad(least_inc < DBL_MAX);
|
|
|
|
offsets = rec_get_offsets(
|
2017-09-19 19:20:11 +03:00
|
|
|
best_rec, index, offsets,
|
2021-04-13 10:28:13 +03:00
|
|
|
0, ULINT_UNDEFINED, &heap);
|
2020-10-15 16:28:19 +03:00
|
|
|
uint32_t child_no =
|
2016-08-12 11:17:45 +03:00
|
|
|
btr_node_ptr_get_child_page_no(
|
|
|
|
best_rec, offsets);
|
|
|
|
|
|
|
|
rtr_non_leaf_insert_stack_push(
|
|
|
|
index, rtr_info->parent_path,
|
|
|
|
level, child_no, block,
|
|
|
|
best_rec, least_inc);
|
|
|
|
|
|
|
|
page_cur_position(best_rec, block,
|
|
|
|
cursor);
|
|
|
|
rtr_info->mbr_adj = true;
|
|
|
|
} else {
|
|
|
|
/* Position at the last rec of the
|
|
|
|
page, if it is not the leaf page */
|
|
|
|
page_cur_position(last_rec, block,
|
|
|
|
cursor);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* There are matching records; position
|
|
|
|
at the last matching record */
|
|
|
|
if (rtr_info) {
|
|
|
|
rec = last_match_rec;
|
|
|
|
page_cur_position(
|
|
|
|
rec, block, cursor);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (rtr_info) {
|
|
|
|
/* Leaf level, no match, position at the
|
|
|
|
last (supremum) rec */
|
|
|
|
if (!last_match_rec) {
|
|
|
|
page_cur_position(rec, block, cursor);
|
|
|
|
goto func_exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* There are matched records */
|
|
|
|
matched_rec_t* match_rec = rtr_info->matches;
|
|
|
|
|
|
|
|
rtr_rec_t test_rec;
|
|
|
|
|
|
|
|
test_rec = match_rec->matched_recs->back();
|
|
|
|
#ifdef UNIV_DEBUG
|
2020-04-28 10:46:51 +10:00
|
|
|
rec_offs offsets_2[REC_OFFS_NORMAL_SIZE];
|
|
|
|
rec_offs* offsets2 = offsets_2;
|
2016-08-12 11:17:45 +03:00
|
|
|
rec_offs_init(offsets_2);
|
|
|
|
|
|
|
|
ut_ad(found);
|
|
|
|
|
|
|
|
/* Verify the record to be positioned is the same
|
|
|
|
as the last record in matched_rec vector */
|
|
|
|
offsets2 = rec_get_offsets(test_rec.r_rec, index,
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets2, index->n_fields,
|
2017-09-19 19:20:11 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
offsets = rec_get_offsets(last_match_rec, index,
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets, index->n_fields,
|
2017-09-19 19:20:11 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
ut_ad(cmp_rec_rec(test_rec.r_rec, last_match_rec,
|
|
|
|
offsets2, offsets, index) == 0);
|
|
|
|
#endif /* UNIV_DEBUG */
|
|
|
|
/* Pop the last match record and position on it */
|
|
|
|
match_rec->matched_recs->pop_back();
|
|
|
|
page_cur_position(test_rec.r_rec, &match_rec->block,
|
|
|
|
cursor);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
|
|
|
|
if (mode == PAGE_CUR_RTREE_INSERT) {
|
2020-03-12 13:58:45 +02:00
|
|
|
ut_ad(!last_match_rec);
|
2016-08-12 11:17:45 +03:00
|
|
|
rtr_non_leaf_insert_stack_push(
|
2020-03-12 13:58:45 +02:00
|
|
|
index, rtr_info->parent_path, level,
|
|
|
|
mach_read_from_4(rec + DATA_MBR_LEN),
|
2016-08-12 11:17:45 +03:00
|
|
|
block, rec, 0);
|
|
|
|
|
2021-04-13 10:28:13 +03:00
|
|
|
} else if (rtr_info && found && !n_core) {
|
2016-08-12 11:17:45 +03:00
|
|
|
rec = last_match_rec;
|
|
|
|
}
|
|
|
|
|
|
|
|
page_cur_position(rec, block, cursor);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
/* Verify that we are positioned at the same child page as pushed in
|
|
|
|
the path stack */
|
2021-04-13 10:28:13 +03:00
|
|
|
if (!n_core && (!page_rec_is_supremum(rec) || found)
|
2016-08-12 11:17:45 +03:00
|
|
|
&& mode != PAGE_CUR_RTREE_INSERT) {
|
|
|
|
ulint page_no;
|
|
|
|
|
2021-04-13 10:28:13 +03:00
|
|
|
offsets = rec_get_offsets(rec, index, offsets, 0,
|
2016-08-12 11:17:45 +03:00
|
|
|
ULINT_UNDEFINED, &heap);
|
|
|
|
page_no = btr_node_ptr_get_child_page_no(rec, offsets);
|
|
|
|
|
|
|
|
if (rtr_info && found) {
|
|
|
|
rtr_node_path_t* path = rtr_info->path;
|
|
|
|
node_visit_t last_visit = path->back();
|
|
|
|
|
|
|
|
ut_ad(last_visit.page_no == page_no);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif /* UNIV_DEBUG */
|
|
|
|
|
|
|
|
func_exit:
|
|
|
|
if (UNIV_LIKELY_NULL(heap)) {
|
|
|
|
mem_heap_free(heap);
|
|
|
|
}
|
|
|
|
|
|
|
|
return(found);
|
|
|
|
}
|