2014-02-26 19:11:54 +01:00
|
|
|
/*****************************************************************************
|
|
|
|
|
2022-04-27 13:16:07 +03:00
|
|
|
Copyright (c) 1996, 2022, Oracle and/or its affiliates.
|
2022-03-08 09:04:03 +02:00
|
|
|
Copyright (c) 2017, 2022, MariaDB Corporation.
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
|
|
Foundation; version 2 of the License.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
2019-05-11 19:25:02 +03:00
|
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
/**************************************************//**
|
|
|
|
@file include/lock0lock.h
|
|
|
|
The transaction lock system
|
|
|
|
|
|
|
|
Created 5/7/1996 Heikki Tuuri
|
|
|
|
*******************************************************/
|
|
|
|
|
|
|
|
#ifndef lock0lock_h
|
|
|
|
#define lock0lock_h
|
|
|
|
|
|
|
|
#include "buf0types.h"
|
2021-10-22 12:38:45 +03:00
|
|
|
#include "trx0trx.h"
|
2014-02-26 19:11:54 +01:00
|
|
|
#include "mtr0types.h"
|
|
|
|
#include "rem0types.h"
|
|
|
|
#include "hash0hash.h"
|
|
|
|
#include "srv0srv.h"
|
|
|
|
#include "ut0vec.h"
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "gis0rtree.h"
|
|
|
|
#include "lock0prdt.h"
|
2021-10-22 12:38:45 +03:00
|
|
|
#include "transactional_lock_guard.h"
|
2014-02-26 19:11:54 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
// Forward declaration
|
|
|
|
class ReadView;
|
2014-02-26 19:11:54 +01:00
|
|
|
|
2016-07-28 13:08:52 +08:00
|
|
|
/** The value of innodb_deadlock_detect */
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
extern my_bool innodb_deadlock_detect;
|
|
|
|
/** The value of innodb_deadlock_report */
|
|
|
|
extern ulong innodb_deadlock_report;
|
|
|
|
|
|
|
|
namespace Deadlock
|
|
|
|
{
|
|
|
|
/** The allowed values of innodb_deadlock_report */
|
|
|
|
enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL };
|
|
|
|
}
|
2016-07-28 13:08:52 +08:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Gets the heap_no of the smallest user record on a page.
|
2016-08-12 11:17:45 +03:00
|
|
|
@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
|
2014-02-26 19:11:54 +01:00
|
|
|
UNIV_INLINE
|
|
|
|
ulint
|
|
|
|
lock_get_min_heap_no(
|
|
|
|
/*=================*/
|
|
|
|
const buf_block_t* block); /*!< in: buffer block */
|
2021-04-27 18:21:38 +03:00
|
|
|
|
2021-10-22 12:38:45 +03:00
|
|
|
/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
|
|
|
|
after an aborted CREATE INDEX operation.
|
|
|
|
@param index a stale index on which ADD INDEX operation was aborted */
|
|
|
|
ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index);
|
2021-04-27 18:21:38 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/*************************************************************//**
|
|
|
|
Updates the lock table when we have reorganized a page. NOTE: we copy
|
|
|
|
also the locks set on the infimum of the page; the infimum may carry
|
|
|
|
locks if an update of a record is occurring on the page, and its locks
|
|
|
|
were temporarily stored on the infimum. */
|
|
|
|
void
|
|
|
|
lock_move_reorganize_page(
|
|
|
|
/*======================*/
|
|
|
|
const buf_block_t* block, /*!< in: old index page, now
|
|
|
|
reorganized */
|
|
|
|
const buf_block_t* oblock);/*!< in: copy of the old, not
|
|
|
|
reorganized page */
|
|
|
|
/*************************************************************//**
|
|
|
|
Moves the explicit locks on user records to another page if a record
|
|
|
|
list end is moved to another page. */
|
|
|
|
void
|
|
|
|
lock_move_rec_list_end(
|
|
|
|
/*===================*/
|
|
|
|
const buf_block_t* new_block, /*!< in: index page to move to */
|
|
|
|
const buf_block_t* block, /*!< in: index page */
|
|
|
|
const rec_t* rec); /*!< in: record on page: this
|
|
|
|
is the first record moved */
|
|
|
|
/*************************************************************//**
|
|
|
|
Moves the explicit locks on user records to another page if a record
|
|
|
|
list start is moved to another page. */
|
|
|
|
void
|
|
|
|
lock_move_rec_list_start(
|
|
|
|
/*=====================*/
|
|
|
|
const buf_block_t* new_block, /*!< in: index page to move to */
|
|
|
|
const buf_block_t* block, /*!< in: index page */
|
|
|
|
const rec_t* rec, /*!< in: record on page:
|
|
|
|
this is the first
|
|
|
|
record NOT copied */
|
|
|
|
const rec_t* old_end); /*!< in: old
|
|
|
|
previous-to-last
|
|
|
|
record on new_page
|
|
|
|
before the records
|
|
|
|
were copied */
|
|
|
|
/*************************************************************//**
|
|
|
|
Updates the lock table when a page is split to the right. */
|
|
|
|
void
|
|
|
|
lock_update_split_right(
|
|
|
|
/*====================*/
|
|
|
|
const buf_block_t* right_block, /*!< in: right page */
|
|
|
|
const buf_block_t* left_block); /*!< in: left page */
|
|
|
|
/*************************************************************//**
|
|
|
|
Updates the lock table when a page is merged to the right. */
|
|
|
|
void
|
|
|
|
lock_update_merge_right(
|
|
|
|
/*====================*/
|
|
|
|
const buf_block_t* right_block, /*!< in: right page to
|
|
|
|
which merged */
|
|
|
|
const rec_t* orig_succ, /*!< in: original
|
|
|
|
successor of infimum
|
|
|
|
on the right page
|
|
|
|
before merge */
|
|
|
|
const buf_block_t* left_block); /*!< in: merged index
|
|
|
|
page which will be
|
|
|
|
discarded */
|
2021-02-05 18:29:30 +02:00
|
|
|
/** Update locks when the root page is copied to another in
|
|
|
|
btr_root_raise_and_insert(). Note that we leave lock structs on the
|
2014-02-26 19:11:54 +01:00
|
|
|
root page, even though they do not make sense on other than leaf
|
|
|
|
pages: the reason is that in a pessimistic update the infimum record
|
|
|
|
of the root page will act as a dummy carrier of the locks of the record
|
|
|
|
to be updated. */
|
2021-02-05 18:29:30 +02:00
|
|
|
void lock_update_root_raise(const buf_block_t &block, const page_id_t root);
|
|
|
|
/** Update the lock table when a page is copied to another.
|
|
|
|
@param new_block the target page
|
|
|
|
@param old old page (not index root page) */
|
|
|
|
void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old);
|
2022-05-10 11:53:59 +02:00
|
|
|
|
2022-04-27 13:16:07 +03:00
|
|
|
/** Update gap locks between the last record of the left_block and the
|
|
|
|
first record of the right_block when a record is about to be inserted
|
|
|
|
at the start of the right_block, even though it should "naturally" be
|
|
|
|
inserted as the last record of the left_block according to the
|
|
|
|
current node pointer in the parent page.
|
|
|
|
|
|
|
|
That is, we assume that the lowest common ancestor of the left_block
|
|
|
|
and right_block routes the key of the new record to the left_block,
|
|
|
|
but a heuristic which tries to avoid overflowing left_block has chosen
|
|
|
|
to insert the record into right_block instead. Said ancestor performs
|
|
|
|
this routing by comparing the key of the record to a "split point" -
|
|
|
|
all records greater or equal to than the split point (node pointer)
|
|
|
|
are in right_block, and smaller ones in left_block.
|
|
|
|
The split point may be smaller than the smallest key in right_block.
|
|
|
|
|
|
|
|
The gap between the last record on the left_block and the first record
|
|
|
|
on the right_block is represented as a gap lock attached to the supremum
|
|
|
|
pseudo-record of left_block, and a gap lock attached to the new first
|
|
|
|
record of right_block.
|
|
|
|
|
|
|
|
Thus, inserting the new record, and subsequently adjusting the node
|
|
|
|
pointers in parent pages to values smaller or equal to the new
|
|
|
|
records' key, will mean that gap will be sliced at a different place
|
|
|
|
("moved to the left"): fragment of the 1st gap will now become treated
|
|
|
|
as 2nd. Therefore, we must copy any GRANTED locks from 1st gap to the
|
|
|
|
2nd gap. Any WAITING locks must be of INSERT_INTENTION type (as no
|
|
|
|
other GAP locks ever wait for anything) and can stay at 1st gap, as
|
|
|
|
their only purpose is to notify the requester they can retry
|
|
|
|
insertion, and there's no correctness requirement to avoid waking them
|
|
|
|
up too soon.
|
|
|
|
@param left_block left page
|
|
|
|
@param right_block right page */
|
|
|
|
void lock_update_node_pointer(const buf_block_t *left_block,
|
|
|
|
const buf_block_t *right_block);
|
2014-02-26 19:11:54 +01:00
|
|
|
/*************************************************************//**
|
|
|
|
Updates the lock table when a page is split to the left. */
|
|
|
|
void
|
|
|
|
lock_update_split_left(
|
|
|
|
/*===================*/
|
|
|
|
const buf_block_t* right_block, /*!< in: right page */
|
|
|
|
const buf_block_t* left_block); /*!< in: left page */
|
2021-02-05 18:29:30 +02:00
|
|
|
/** Update the lock table when a page is merged to the left.
|
|
|
|
@param left left page
|
|
|
|
@param orig_pred original predecessor of supremum on the left page before merge
|
|
|
|
@param right merged, to-be-discarded right page */
|
|
|
|
void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
|
|
|
|
const page_id_t right);
|
MDEV-25791: Remove UNIV_INTERN
Back in 2006 or 2007, when MySQL AB and Innobase Oy existed as
separately controlled entities (Innobase had been acquired by
Oracle Corporation), MySQL 5.1 introduced a storage engine plugin
interface and Oracle made use of it by distributing a separate
InnoDB Plugin, which would contain some more bug fixes and
improvements, compared to the version of InnoDB that was statically
linked with the mysqld server that was distributed by MySQL AB.
The built-in InnoDB would export global symbols, which would clash
with the symbols of the dynamic InnoDB Plugin (which was supposed
to override the built-in one when present).
The solution to this problem was to declare all global symbols with
UNIV_INTERN, so that they would get the GCC function attribute that
specifies hidden visibility.
Later, in MariaDB Server, something based on Percona XtraDB (a fork of
MySQL InnoDB) became the statically linked implementation, and something
closer to MySQL InnoDB was available as a dynamic plugin. Starting with
version 10.2, MariaDB Server includes only one InnoDB implementation,
and hence any reason to have the UNIV_INTERN definition was lost.
btr_get_size_and_reserved(): Move to the same compilation unit with
the only caller.
innodb_set_buf_pool_size(): Remove. Modify innobase_buffer_pool_size
directly.
fil_crypt_calculate_checksum(): Merge to the only caller.
ha_innobase::innobase_reset_autoinc(): Merge to the only caller.
thd_query_start_micro(): Remove. Call thd_start_utime() directly.
2021-05-27 10:13:14 +03:00
|
|
|
|
|
|
|
/** Update the locks when a page is split and merged to two pages,
|
|
|
|
in defragmentation. */
|
|
|
|
void lock_update_split_and_merge(
|
2014-08-06 15:28:58 +03:00
|
|
|
const buf_block_t* left_block, /*!< in: left page to which merged */
|
|
|
|
const rec_t* orig_pred, /*!< in: original predecessor of
|
|
|
|
supremum on the left page before merge*/
|
|
|
|
const buf_block_t* right_block);/*!< in: right page from which merged */
|
|
|
|
/*************************************************************//**
|
2014-02-26 19:11:54 +01:00
|
|
|
Resets the original locks on heir and replaces them with gap type locks
|
|
|
|
inherited from rec. */
|
|
|
|
void
|
|
|
|
lock_rec_reset_and_inherit_gap_locks(
|
|
|
|
/*=================================*/
|
2021-02-05 18:29:30 +02:00
|
|
|
const buf_block_t& heir_block, /*!< in: block containing the
|
2014-02-26 19:11:54 +01:00
|
|
|
record which inherits */
|
2021-02-05 18:29:30 +02:00
|
|
|
const page_id_t donor, /*!< in: page containing the
|
2014-02-26 19:11:54 +01:00
|
|
|
record from which inherited;
|
|
|
|
does NOT reset the locks on
|
|
|
|
this record */
|
|
|
|
ulint heir_heap_no, /*!< in: heap_no of the
|
|
|
|
inheriting record */
|
|
|
|
ulint heap_no); /*!< in: heap_no of the
|
|
|
|
donating record */
|
|
|
|
/*************************************************************//**
|
|
|
|
Updates the lock table when a page is discarded. */
|
|
|
|
void
|
|
|
|
lock_update_discard(
|
|
|
|
/*================*/
|
|
|
|
const buf_block_t* heir_block, /*!< in: index page
|
|
|
|
which will inherit the locks */
|
|
|
|
ulint heir_heap_no, /*!< in: heap_no of the record
|
|
|
|
which will inherit the locks */
|
|
|
|
const buf_block_t* block); /*!< in: index page
|
|
|
|
which will be discarded */
|
|
|
|
/*************************************************************//**
|
|
|
|
Updates the lock table when a new user record is inserted. */
|
|
|
|
void
|
|
|
|
lock_update_insert(
|
|
|
|
/*===============*/
|
|
|
|
const buf_block_t* block, /*!< in: buffer block containing rec */
|
|
|
|
const rec_t* rec); /*!< in: the inserted record */
|
|
|
|
/*************************************************************//**
|
|
|
|
Updates the lock table when a record is removed. */
|
|
|
|
void
|
|
|
|
lock_update_delete(
|
|
|
|
/*===============*/
|
|
|
|
const buf_block_t* block, /*!< in: buffer block containing rec */
|
|
|
|
const rec_t* rec); /*!< in: the record to be removed */
|
|
|
|
/*********************************************************************//**
|
|
|
|
Stores on the page infimum record the explicit locks of another record.
|
|
|
|
This function is used to store the lock state of a record when it is
|
|
|
|
updated and the size of the record changes in the update. The record
|
|
|
|
is in such an update moved, perhaps to another page. The infimum record
|
|
|
|
acts as a dummy carrier record, taking care of lock releases while the
|
|
|
|
actual record is being moved. */
|
|
|
|
void
|
|
|
|
lock_rec_store_on_page_infimum(
|
|
|
|
/*===========================*/
|
|
|
|
const buf_block_t* block, /*!< in: buffer block containing rec */
|
|
|
|
const rec_t* rec); /*!< in: record whose lock state
|
|
|
|
is stored on the infimum
|
|
|
|
record of the same page; lock
|
|
|
|
bits are reset on the
|
|
|
|
record */
|
2021-02-05 18:29:30 +02:00
|
|
|
/** Restore the explicit lock requests on a single record, where the
|
|
|
|
state was stored on the infimum of a page.
|
|
|
|
@param block buffer block containing rec
|
|
|
|
@param rec record whose lock state is restored
|
|
|
|
@param donator page (rec is not necessarily on this page)
|
|
|
|
whose infimum stored the lock state; lock bits are reset on the infimum */
|
|
|
|
void lock_rec_restore_from_page_infimum(const buf_block_t &block,
|
|
|
|
const rec_t *rec, page_id_t donator);
|
MDEV-25919: Lock tables before acquiring dict_sys.latch
In commit 1bd681c8b3c5213ce1f7976940a7dc38b48a0d39 (MDEV-25506 part 3)
we introduced a "fake instant timeout" when a transaction would wait
for a table or record lock while holding dict_sys.latch. This prevented
a deadlock of the server but could cause bogus errors for operations
on the InnoDB persistent statistics tables.
A better fix is to ensure that whenever a transaction is being
executed in the InnoDB internal SQL parser (which will for now
require dict_sys.latch to be held), it will already have acquired
all locks that could be required for the execution. So, we will
acquire the following locks upfront, before acquiring dict_sys.latch:
(1) MDL on the affected user table (acquired by the SQL layer)
(2) If applicable (not for RENAME TABLE): InnoDB table lock
(3) If persistent statistics are going to be modified:
(3.a) MDL_SHARED on mysql.innodb_table_stats, mysql.innodb_index_stats
(3.b) exclusive table locks on the statistics tables
(4) Exclusive table locks on the InnoDB data dictionary tables
(not needed in ANALYZE TABLE and the like)
Note: Acquiring exclusive locks on the statistics tables may cause
more locking conflicts between concurrent DDL operations.
Notably, RENAME TABLE will lock the statistics tables
even if no persistent statistics are enabled for the table.
DROP DATABASE will only acquire locks on statistics tables if
persistent statistics are enabled for the tables on which the
SQL layer is invoking ha_innobase::delete_table().
For any "garbage collection" in innodb_drop_database(), a timeout
while acquiring locks on the statistics tables will result in any
statistics not being deleted for any tables that the SQL layer
did not know about.
If innodb_defragment=ON, information may be written to the statistics
tables even for tables for which InnoDB persistent statistics are
disabled. But, DROP TABLE will no longer attempt to delete that
information if persistent statistics are not enabled for the table.
This change should also fix the hangs related to InnoDB persistent
statistics and STATS_AUTO_RECALC (MDEV-15020) as well as
a bug that running ALTER TABLE on the statistics tables
concurrently with running ALTER TABLE on InnoDB tables could
cause trouble.
lock_rec_enqueue_waiting(), lock_table_enqueue_waiting():
Do not issue a fake instant timeout error when the transaction
is holding dict_sys.latch. Instead, assert that the dict_sys.latch
is never being held here.
lock_sys_tables(): A new function to acquire exclusive locks on all
dictionary tables, in case DROP TABLE or similar operation is
being executed. Locking non-hard-coded tables is optional to avoid
a crash in row_merge_drop_temp_indexes(). The SYS_VIRTUAL table was
introduced in MySQL 5.7 and MariaDB Server 10.2. Normally, we require
all these dictionary tables to exist before executing any DDL, but
the function row_merge_drop_temp_indexes() is an exception.
When upgrading from MariaDB Server 10.1 or MySQL 5.6 or earlier,
the table SYS_VIRTUAL would not exist at this point.
ha_innobase::commit_inplace_alter_table(): Invoke
log_write_up_to() while not holding dict_sys.latch.
dict_sys_t::remove(), dict_table_close(): No longer try to
drop index stubs that were left behind by aborted online ADD INDEX.
Such indexes should be dropped from the InnoDB data dictionary by
row_merge_drop_indexes() as part of the failed DDL operation.
Stubs for aborted indexes may only be left behind in the
data dictionary cache.
dict_stats_fetch_from_ps(): Use a normal read-only transaction.
ha_innobase::delete_table(), ha_innobase::truncate(), fts_lock_table():
While waiting for purge to stop using the table,
do not hold dict_sys.latch.
ha_innobase::delete_table(): Implement a work-around for the rollback
of ALTER TABLE...ADD PARTITION. MDL_EXCLUSIVE would not be held if
ALTER TABLE hits lock_wait_timeout while trying to upgrade the MDL
due to a conflicting LOCK TABLES, such as in the first ALTER TABLE
in the test case of Bug#53676 in parts.partition_special_innodb.
Therefore, we must explicitly stop purge, because it would not be
stopped by MDL.
dict_stats_func(), btr_defragment_chunk(): Allocate a THD so that
we can acquire MDL on the InnoDB persistent statistics tables.
mysqltest_embedded: Invoke ha_pre_shutdown() before free_used_memory()
in order to avoid ASAN heap-use-after-free related to acquire_thd().
trx_t::dict_operation_lock_mode: Changed the type to bool.
row_mysql_lock_data_dictionary(), row_mysql_unlock_data_dictionary():
Implemented as macros.
rollback_inplace_alter_table(): Apply an infinite timeout to lock waits.
innodb_thd_increment_pending_ops(): Wrapper for
thd_increment_pending_ops(). Never attempt async operation for
InnoDB background threads, such as the trx_t::commit() in
dict_stats_process_entry_from_recalc_pool().
lock_sys_t::cancel(trx_t*): Make dictionary transactions immune to KILL.
lock_wait(): Make dictionary transactions immune to KILL, and to
lock wait timeout when waiting for locks on dictionary tables.
parts.partition_special_innodb: Use lock_wait_timeout=0 to instantly
get ER_LOCK_WAIT_TIMEOUT.
main.mdl: Filter out MDL on InnoDB persistent statistics tables
Reviewed by: Thirunarayanan Balathandayuthapani
2021-08-31 13:54:44 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
Create a table lock, without checking for deadlocks or lock compatibility.
|
|
|
|
@param table table on which the lock is created
|
|
|
|
@param type_mode lock type and mode
|
|
|
|
@param trx transaction
|
|
|
|
@param c_lock conflicting lock
|
|
|
|
@return the created lock object */
|
|
|
|
lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
|
|
|
|
lock_t *c_lock= nullptr);
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Checks if locks of other transactions prevent an immediate insert of
|
|
|
|
a record. If they do, first tests if the query thread should anyway
|
|
|
|
be suspended for some reason; if not, then puts the transaction and
|
|
|
|
the query thread to the lock wait state and inserts a waiting request
|
|
|
|
for a gap x-lock to the lock queue.
|
2018-03-13 11:07:34 +02:00
|
|
|
@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
|
2014-02-26 19:11:54 +01:00
|
|
|
dberr_t
|
|
|
|
lock_rec_insert_check_and_lock(
|
|
|
|
/*===========================*/
|
|
|
|
const rec_t* rec, /*!< in: record after which to insert */
|
|
|
|
buf_block_t* block, /*!< in/out: buffer block of rec */
|
|
|
|
dict_index_t* index, /*!< in: index */
|
|
|
|
que_thr_t* thr, /*!< in: query thread */
|
|
|
|
mtr_t* mtr, /*!< in/out: mini-transaction */
|
2018-02-13 23:29:51 +03:00
|
|
|
bool* inherit)/*!< out: set to true if the new
|
2014-02-26 19:11:54 +01:00
|
|
|
inserted record maybe should inherit
|
|
|
|
LOCK_GAP type locks from the successor
|
|
|
|
record */
|
2016-09-06 09:43:16 +03:00
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Checks if locks of other transactions prevent an immediate modify (update,
|
|
|
|
delete mark, or delete unmark) of a clustered index record. If they do,
|
|
|
|
first tests if the query thread should anyway be suspended for some
|
|
|
|
reason; if not, then puts the transaction and the query thread to the
|
|
|
|
lock wait state and inserts a waiting request for a record x-lock to the
|
|
|
|
lock queue.
|
2018-03-13 11:07:34 +02:00
|
|
|
@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
|
2014-02-26 19:11:54 +01:00
|
|
|
dberr_t
|
|
|
|
lock_clust_rec_modify_check_and_lock(
|
|
|
|
/*=================================*/
|
|
|
|
const buf_block_t* block, /*!< in: buffer block of rec */
|
|
|
|
const rec_t* rec, /*!< in: record which should be
|
|
|
|
modified */
|
|
|
|
dict_index_t* index, /*!< in: clustered index */
|
2020-04-28 10:46:51 +10:00
|
|
|
const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
|
2014-02-26 19:11:54 +01:00
|
|
|
que_thr_t* thr) /*!< in: query thread */
|
2016-09-06 09:43:16 +03:00
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Checks if locks of other transactions prevent an immediate modify
|
|
|
|
(delete mark or delete unmark) of a secondary index record.
|
2018-03-13 11:07:34 +02:00
|
|
|
@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
|
2014-02-26 19:11:54 +01:00
|
|
|
dberr_t
|
|
|
|
lock_sec_rec_modify_check_and_lock(
|
|
|
|
/*===============================*/
|
|
|
|
ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
|
|
|
|
bit is set, does nothing */
|
|
|
|
buf_block_t* block, /*!< in/out: buffer block of rec */
|
|
|
|
const rec_t* rec, /*!< in: record which should be
|
|
|
|
modified; NOTE: as this is a secondary
|
|
|
|
index, we always have to modify the
|
|
|
|
clustered index record first: see the
|
|
|
|
comment below */
|
|
|
|
dict_index_t* index, /*!< in: secondary index */
|
|
|
|
que_thr_t* thr, /*!< in: query thread
|
|
|
|
(can be NULL if BTR_NO_LOCKING_FLAG) */
|
|
|
|
mtr_t* mtr) /*!< in/out: mini-transaction */
|
2016-09-06 09:43:16 +03:00
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Like lock_clust_rec_read_check_and_lock(), but reads a
|
|
|
|
secondary index record.
|
2018-03-13 11:07:34 +02:00
|
|
|
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
|
2014-02-26 19:11:54 +01:00
|
|
|
dberr_t
|
|
|
|
lock_sec_rec_read_check_and_lock(
|
|
|
|
/*=============================*/
|
|
|
|
ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
|
|
|
|
bit is set, does nothing */
|
|
|
|
const buf_block_t* block, /*!< in: buffer block of rec */
|
|
|
|
const rec_t* rec, /*!< in: user record or page
|
|
|
|
supremum record which should
|
|
|
|
be read or passed over by a
|
|
|
|
read cursor */
|
|
|
|
dict_index_t* index, /*!< in: secondary index */
|
2020-04-28 10:46:51 +10:00
|
|
|
const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
|
2016-08-12 11:17:45 +03:00
|
|
|
lock_mode mode, /*!< in: mode of the lock which
|
2014-02-26 19:11:54 +01:00
|
|
|
the read cursor should set on
|
|
|
|
records: LOCK_S or LOCK_X; the
|
|
|
|
latter is possible in
|
|
|
|
SELECT FOR UPDATE */
|
2020-03-10 20:05:17 +02:00
|
|
|
unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
|
2014-02-26 19:11:54 +01:00
|
|
|
LOCK_REC_NOT_GAP */
|
|
|
|
que_thr_t* thr); /*!< in: query thread */
|
|
|
|
/*********************************************************************//**
|
|
|
|
Checks if locks of other transactions prevent an immediate read, or passing
|
|
|
|
over by a read cursor, of a clustered index record. If they do, first tests
|
|
|
|
if the query thread should anyway be suspended for some reason; if not, then
|
|
|
|
puts the transaction and the query thread to the lock wait state and inserts a
|
|
|
|
waiting request for a record lock to the lock queue. Sets the requested mode
|
|
|
|
lock on the record.
|
2018-03-13 11:07:34 +02:00
|
|
|
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
|
2014-02-26 19:11:54 +01:00
|
|
|
dberr_t
|
|
|
|
lock_clust_rec_read_check_and_lock(
|
|
|
|
/*===============================*/
|
|
|
|
ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
|
|
|
|
bit is set, does nothing */
|
|
|
|
const buf_block_t* block, /*!< in: buffer block of rec */
|
|
|
|
const rec_t* rec, /*!< in: user record or page
|
|
|
|
supremum record which should
|
|
|
|
be read or passed over by a
|
|
|
|
read cursor */
|
|
|
|
dict_index_t* index, /*!< in: clustered index */
|
2020-04-28 10:46:51 +10:00
|
|
|
const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
|
2016-08-12 11:17:45 +03:00
|
|
|
lock_mode mode, /*!< in: mode of the lock which
|
2014-02-26 19:11:54 +01:00
|
|
|
the read cursor should set on
|
|
|
|
records: LOCK_S or LOCK_X; the
|
|
|
|
latter is possible in
|
|
|
|
SELECT FOR UPDATE */
|
2020-03-10 20:05:17 +02:00
|
|
|
unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
|
2014-02-26 19:11:54 +01:00
|
|
|
LOCK_REC_NOT_GAP */
|
|
|
|
que_thr_t* thr); /*!< in: query thread */
|
|
|
|
/*********************************************************************//**
|
|
|
|
Checks if locks of other transactions prevent an immediate read, or passing
|
|
|
|
over by a read cursor, of a clustered index record. If they do, first tests
|
|
|
|
if the query thread should anyway be suspended for some reason; if not, then
|
|
|
|
puts the transaction and the query thread to the lock wait state and inserts a
|
|
|
|
waiting request for a record lock to the lock queue. Sets the requested mode
|
|
|
|
lock on the record. This is an alternative version of
|
|
|
|
lock_clust_rec_read_check_and_lock() that does not require the parameter
|
|
|
|
"offsets".
|
2018-03-13 11:07:34 +02:00
|
|
|
@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
|
2014-02-26 19:11:54 +01:00
|
|
|
dberr_t
|
|
|
|
lock_clust_rec_read_check_and_lock_alt(
|
|
|
|
/*===================================*/
|
|
|
|
ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
|
|
|
|
bit is set, does nothing */
|
|
|
|
const buf_block_t* block, /*!< in: buffer block of rec */
|
|
|
|
const rec_t* rec, /*!< in: user record or page
|
|
|
|
supremum record which should
|
|
|
|
be read or passed over by a
|
|
|
|
read cursor */
|
|
|
|
dict_index_t* index, /*!< in: clustered index */
|
2016-08-12 11:17:45 +03:00
|
|
|
lock_mode mode, /*!< in: mode of the lock which
|
2014-02-26 19:11:54 +01:00
|
|
|
the read cursor should set on
|
|
|
|
records: LOCK_S or LOCK_X; the
|
|
|
|
latter is possible in
|
|
|
|
SELECT FOR UPDATE */
|
2020-03-10 20:05:17 +02:00
|
|
|
unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
|
2014-02-26 19:11:54 +01:00
|
|
|
LOCK_REC_NOT_GAP */
|
|
|
|
que_thr_t* thr) /*!< in: query thread */
|
2016-09-06 09:43:16 +03:00
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2022-04-26 18:09:03 +03:00
|
|
|
|
|
|
|
/** Acquire a table lock.
|
|
|
|
@param table table to be locked
|
|
|
|
@param fktable pointer to table, in case of a FOREIGN key check
|
|
|
|
@param mode lock mode
|
|
|
|
@param thr SQL execution thread
|
|
|
|
@retval DB_SUCCESS if the lock was acquired
|
|
|
|
@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table
|
|
|
|
@retval DB_LOCK_WAIT if lock_wait() must be invoked */
|
|
|
|
dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
|
|
|
|
lock_mode mode, que_thr_t *thr)
|
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2016-09-06 09:43:16 +03:00
|
|
|
|
2021-02-04 17:26:08 +02:00
|
|
|
/** Create a table lock object for a resurrected transaction.
|
MDEV-515 Reduce InnoDB undo logging for insert into empty table
We implement an idea that was suggested by Michael 'Monty' Widenius
in October 2017: When InnoDB is inserting into an empty table or partition,
we can write a single undo log record TRX_UNDO_EMPTY, which will cause
ROLLBACK to clear the table.
For this to work, the insert into an empty table or partition must be
covered by an exclusive table lock that will be held until the transaction
has been committed or rolled back, or the INSERT operation has been
rolled back (and the table is empty again), in lock_table_x_unlock().
Clustered index records that are covered by the TRX_UNDO_EMPTY record
will carry DB_TRX_ID=0 and DB_ROLL_PTR=1<<55, and thus they cannot
be distinguished from what MDEV-12288 leaves behind after purging the
history of row-logged operations.
Concurrent non-locking reads must be adjusted: If the read view was
created before the INSERT into an empty table, then we must continue
to imagine that the table is empty, and not try to read any records.
If the read view was created after the INSERT was committed, then
all records must be visible normally. To implement this, we introduce
the field dict_table_t::bulk_trx_id.
This special handling only applies to the very first INSERT statement
of a transaction for the empty table or partition. If a subsequent
statement in the transaction is modifying the initially empty table again,
we must enable row-level undo logging, so that we will be able to
roll back to the start of the statement in case of an error (such as
duplicate key).
INSERT IGNORE will continue to use row-level logging and locking, because
implementing it would require the ability to roll back the latest row.
Since the undo log that we write only allows us to roll back the entire
statement, we cannot support INSERT IGNORE. We will introduce a
handler::extra() parameter HA_EXTRA_IGNORE_INSERT to indicate to storage
engines that INSERT IGNORE is being executed.
In many test cases, we add an extra record to the table, so that during
the 'interesting' part of the test, row-level locking and logging will
be used.
Replicas will continue to use row-level logging and locking until
MDEV-24622 has been addressed. Likewise, this optimization will be
disabled in Galera cluster until MDEV-24623 enables it.
dict_table_t::bulk_trx_id: The latest active or committed transaction
that initiated an insert into an empty table or partition.
Protected by exclusive table lock and a clustered index leaf page latch.
ins_node_t::bulk_insert: Whether bulk insert was initiated.
trx_t::mod_tables: Use C++11 style accessors (emplace instead of insert).
Unlike earlier, this collection will cover also temporary tables.
trx_mod_table_time_t: Add start_bulk_insert(), end_bulk_insert(),
is_bulk_insert(), was_bulk_insert().
trx_undo_report_row_operation(): Before accessing any undo log pages,
invoke trx->mod_tables.emplace() in order to determine whether undo
logging was disabled, or whether this is the first INSERT and we are
supposed to write a TRX_UNDO_EMPTY record.
row_ins_clust_index_entry_low(): If we are inserting into an empty
clustered index leaf page, set the ins_node_t::bulk_insert flag for
the subsequent trx_undo_report_row_operation() call.
lock_rec_insert_check_and_lock(), lock_prdt_insert_check_and_lock():
Remove the redundant parameter 'flags' that can be checked in the caller.
btr_cur_ins_lock_and_undo(): Simplify the logic. Correctly write
DB_TRX_ID,DB_ROLL_PTR after invoking trx_undo_report_row_operation().
trx_mark_sql_stat_end(), ha_innobase::extra(HA_EXTRA_IGNORE_INSERT),
ha_innobase::external_lock(): Invoke trx_t::end_bulk_insert() so that
the next statement will not be covered by table-level undo logging.
ReadView::changes_visible(trx_id_t) const: New accessor for the case
where the trx_id_t is not read from a potentially corrupted index page
but directly from the memory. In this case, we can skip a sanity check.
row_sel(), row_sel_try_search_shortcut(), row_search_mvcc():
row_sel_try_search_shortcut_for_mysql(),
row_merge_read_clustered_index(): Check dict_table_t::bulk_trx_id.
row_sel_clust_sees(): Replaces lock_clust_rec_cons_read_sees().
lock_sec_rec_cons_read_sees(): Replaced with lower-level code.
btr_root_page_init(): Refactored from btr_create().
dict_index_t::clear(), dict_table_t::clear(): Empty an index or table,
for the ROLLBACK of an INSERT operation.
ROW_T_EMPTY, ROW_OP_EMPTY: Note a concurrent ROLLBACK of an INSERT
into an empty table.
This is joint work with Thirunarayanan Balathandayuthapani,
who created a working prototype.
Thanks to Matthias Leich for extensive testing.
2021-01-25 18:41:27 +02:00
|
|
|
@param table table to be X-locked
|
2021-02-04 17:26:08 +02:00
|
|
|
@param trx transaction
|
|
|
|
@param mode LOCK_X or LOCK_IX */
|
|
|
|
void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode);
|
MDEV-515 Reduce InnoDB undo logging for insert into empty table
We implement an idea that was suggested by Michael 'Monty' Widenius
in October 2017: When InnoDB is inserting into an empty table or partition,
we can write a single undo log record TRX_UNDO_EMPTY, which will cause
ROLLBACK to clear the table.
For this to work, the insert into an empty table or partition must be
covered by an exclusive table lock that will be held until the transaction
has been committed or rolled back, or the INSERT operation has been
rolled back (and the table is empty again), in lock_table_x_unlock().
Clustered index records that are covered by the TRX_UNDO_EMPTY record
will carry DB_TRX_ID=0 and DB_ROLL_PTR=1<<55, and thus they cannot
be distinguished from what MDEV-12288 leaves behind after purging the
history of row-logged operations.
Concurrent non-locking reads must be adjusted: If the read view was
created before the INSERT into an empty table, then we must continue
to imagine that the table is empty, and not try to read any records.
If the read view was created after the INSERT was committed, then
all records must be visible normally. To implement this, we introduce
the field dict_table_t::bulk_trx_id.
This special handling only applies to the very first INSERT statement
of a transaction for the empty table or partition. If a subsequent
statement in the transaction is modifying the initially empty table again,
we must enable row-level undo logging, so that we will be able to
roll back to the start of the statement in case of an error (such as
duplicate key).
INSERT IGNORE will continue to use row-level logging and locking, because
implementing it would require the ability to roll back the latest row.
Since the undo log that we write only allows us to roll back the entire
statement, we cannot support INSERT IGNORE. We will introduce a
handler::extra() parameter HA_EXTRA_IGNORE_INSERT to indicate to storage
engines that INSERT IGNORE is being executed.
In many test cases, we add an extra record to the table, so that during
the 'interesting' part of the test, row-level locking and logging will
be used.
Replicas will continue to use row-level logging and locking until
MDEV-24622 has been addressed. Likewise, this optimization will be
disabled in Galera cluster until MDEV-24623 enables it.
dict_table_t::bulk_trx_id: The latest active or committed transaction
that initiated an insert into an empty table or partition.
Protected by exclusive table lock and a clustered index leaf page latch.
ins_node_t::bulk_insert: Whether bulk insert was initiated.
trx_t::mod_tables: Use C++11 style accessors (emplace instead of insert).
Unlike earlier, this collection will cover also temporary tables.
trx_mod_table_time_t: Add start_bulk_insert(), end_bulk_insert(),
is_bulk_insert(), was_bulk_insert().
trx_undo_report_row_operation(): Before accessing any undo log pages,
invoke trx->mod_tables.emplace() in order to determine whether undo
logging was disabled, or whether this is the first INSERT and we are
supposed to write a TRX_UNDO_EMPTY record.
row_ins_clust_index_entry_low(): If we are inserting into an empty
clustered index leaf page, set the ins_node_t::bulk_insert flag for
the subsequent trx_undo_report_row_operation() call.
lock_rec_insert_check_and_lock(), lock_prdt_insert_check_and_lock():
Remove the redundant parameter 'flags' that can be checked in the caller.
btr_cur_ins_lock_and_undo(): Simplify the logic. Correctly write
DB_TRX_ID,DB_ROLL_PTR after invoking trx_undo_report_row_operation().
trx_mark_sql_stat_end(), ha_innobase::extra(HA_EXTRA_IGNORE_INSERT),
ha_innobase::external_lock(): Invoke trx_t::end_bulk_insert() so that
the next statement will not be covered by table-level undo logging.
ReadView::changes_visible(trx_id_t) const: New accessor for the case
where the trx_id_t is not read from a potentially corrupted index page
but directly from the memory. In this case, we can skip a sanity check.
row_sel(), row_sel_try_search_shortcut(), row_search_mvcc():
row_sel_try_search_shortcut_for_mysql(),
row_merge_read_clustered_index(): Check dict_table_t::bulk_trx_id.
row_sel_clust_sees(): Replaces lock_clust_rec_cons_read_sees().
lock_sec_rec_cons_read_sees(): Replaced with lower-level code.
btr_root_page_init(): Refactored from btr_create().
dict_index_t::clear(), dict_table_t::clear(): Empty an index or table,
for the ROLLBACK of an INSERT operation.
ROW_T_EMPTY, ROW_OP_EMPTY: Note a concurrent ROLLBACK of an INSERT
into an empty table.
This is joint work with Thirunarayanan Balathandayuthapani,
who created a working prototype.
Thanks to Matthias Leich for extensive testing.
2021-01-25 18:41:27 +02:00
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** Sets a lock on a table based on the given mode.
|
2022-03-18 10:52:08 +02:00
|
|
|
@param table table to lock
|
|
|
|
@param trx transaction
|
|
|
|
@param mode LOCK_X or LOCK_S
|
|
|
|
@param no_wait whether to skip handling DB_LOCK_WAIT
|
|
|
|
@return error code */
|
|
|
|
dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
|
|
|
|
bool no_wait= false)
|
2016-09-06 09:43:16 +03:00
|
|
|
MY_ATTRIBUTE((nonnull, warn_unused_result));
|
|
|
|
|
MDEV-25919: Lock tables before acquiring dict_sys.latch
In commit 1bd681c8b3c5213ce1f7976940a7dc38b48a0d39 (MDEV-25506 part 3)
we introduced a "fake instant timeout" when a transaction would wait
for a table or record lock while holding dict_sys.latch. This prevented
a deadlock of the server but could cause bogus errors for operations
on the InnoDB persistent statistics tables.
A better fix is to ensure that whenever a transaction is being
executed in the InnoDB internal SQL parser (which will for now
require dict_sys.latch to be held), it will already have acquired
all locks that could be required for the execution. So, we will
acquire the following locks upfront, before acquiring dict_sys.latch:
(1) MDL on the affected user table (acquired by the SQL layer)
(2) If applicable (not for RENAME TABLE): InnoDB table lock
(3) If persistent statistics are going to be modified:
(3.a) MDL_SHARED on mysql.innodb_table_stats, mysql.innodb_index_stats
(3.b) exclusive table locks on the statistics tables
(4) Exclusive table locks on the InnoDB data dictionary tables
(not needed in ANALYZE TABLE and the like)
Note: Acquiring exclusive locks on the statistics tables may cause
more locking conflicts between concurrent DDL operations.
Notably, RENAME TABLE will lock the statistics tables
even if no persistent statistics are enabled for the table.
DROP DATABASE will only acquire locks on statistics tables if
persistent statistics are enabled for the tables on which the
SQL layer is invoking ha_innobase::delete_table().
For any "garbage collection" in innodb_drop_database(), a timeout
while acquiring locks on the statistics tables will result in any
statistics not being deleted for any tables that the SQL layer
did not know about.
If innodb_defragment=ON, information may be written to the statistics
tables even for tables for which InnoDB persistent statistics are
disabled. But, DROP TABLE will no longer attempt to delete that
information if persistent statistics are not enabled for the table.
This change should also fix the hangs related to InnoDB persistent
statistics and STATS_AUTO_RECALC (MDEV-15020) as well as
a bug that running ALTER TABLE on the statistics tables
concurrently with running ALTER TABLE on InnoDB tables could
cause trouble.
lock_rec_enqueue_waiting(), lock_table_enqueue_waiting():
Do not issue a fake instant timeout error when the transaction
is holding dict_sys.latch. Instead, assert that the dict_sys.latch
is never being held here.
lock_sys_tables(): A new function to acquire exclusive locks on all
dictionary tables, in case DROP TABLE or similar operation is
being executed. Locking non-hard-coded tables is optional to avoid
a crash in row_merge_drop_temp_indexes(). The SYS_VIRTUAL table was
introduced in MySQL 5.7 and MariaDB Server 10.2. Normally, we require
all these dictionary tables to exist before executing any DDL, but
the function row_merge_drop_temp_indexes() is an exception.
When upgrading from MariaDB Server 10.1 or MySQL 5.6 or earlier,
the table SYS_VIRTUAL would not exist at this point.
ha_innobase::commit_inplace_alter_table(): Invoke
log_write_up_to() while not holding dict_sys.latch.
dict_sys_t::remove(), dict_table_close(): No longer try to
drop index stubs that were left behind by aborted online ADD INDEX.
Such indexes should be dropped from the InnoDB data dictionary by
row_merge_drop_indexes() as part of the failed DDL operation.
Stubs for aborted indexes may only be left behind in the
data dictionary cache.
dict_stats_fetch_from_ps(): Use a normal read-only transaction.
ha_innobase::delete_table(), ha_innobase::truncate(), fts_lock_table():
While waiting for purge to stop using the table,
do not hold dict_sys.latch.
ha_innobase::delete_table(): Implement a work-around for the rollback
of ALTER TABLE...ADD PARTITION. MDL_EXCLUSIVE would not be held if
ALTER TABLE hits lock_wait_timeout while trying to upgrade the MDL
due to a conflicting LOCK TABLES, such as in the first ALTER TABLE
in the test case of Bug#53676 in parts.partition_special_innodb.
Therefore, we must explicitly stop purge, because it would not be
stopped by MDL.
dict_stats_func(), btr_defragment_chunk(): Allocate a THD so that
we can acquire MDL on the InnoDB persistent statistics tables.
mysqltest_embedded: Invoke ha_pre_shutdown() before free_used_memory()
in order to avoid ASAN heap-use-after-free related to acquire_thd().
trx_t::dict_operation_lock_mode: Changed the type to bool.
row_mysql_lock_data_dictionary(), row_mysql_unlock_data_dictionary():
Implemented as macros.
rollback_inplace_alter_table(): Apply an infinite timeout to lock waits.
innodb_thd_increment_pending_ops(): Wrapper for
thd_increment_pending_ops(). Never attempt async operation for
InnoDB background threads, such as the trx_t::commit() in
dict_stats_process_entry_from_recalc_pool().
lock_sys_t::cancel(trx_t*): Make dictionary transactions immune to KILL.
lock_wait(): Make dictionary transactions immune to KILL, and to
lock wait timeout when waiting for locks on dictionary tables.
parts.partition_special_innodb: Use lock_wait_timeout=0 to instantly
get ER_LOCK_WAIT_TIMEOUT.
main.mdl: Filter out MDL on InnoDB persistent statistics tables
Reviewed by: Thirunarayanan Balathandayuthapani
2021-08-31 13:54:44 +03:00
|
|
|
/** Exclusively lock the data dictionary tables.
|
|
|
|
@param trx dictionary transaction
|
|
|
|
@return error code
|
|
|
|
@retval DB_SUCCESS on success */
|
|
|
|
dberr_t lock_sys_tables(trx_t *trx);
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/*************************************************************//**
|
|
|
|
Removes a granted record lock of a transaction from the queue and grants
|
|
|
|
locks to other transactions waiting in the queue if they now are entitled
|
|
|
|
to a lock. */
|
|
|
|
void
|
|
|
|
lock_rec_unlock(
|
|
|
|
/*============*/
|
|
|
|
trx_t* trx, /*!< in/out: transaction that has
|
|
|
|
set a record lock */
|
2021-02-05 18:29:30 +02:00
|
|
|
const page_id_t id, /*!< in: page containing rec */
|
2014-02-26 19:11:54 +01:00
|
|
|
const rec_t* rec, /*!< in: record */
|
2016-08-12 11:17:45 +03:00
|
|
|
lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
|
MDEV-15326: InnoDB: Failing assertion: !other_lock
MySQL 5.7.9 (and MariaDB 10.2.2) introduced a race condition
between InnoDB transaction commit and the conversion of implicit
locks into explicit ones.
The assertion failure can be triggered with a test that runs
3 concurrent single-statement transactions in a loop on a simple
table:
CREATE TABLE t (a INT PRIMARY KEY) ENGINE=InnoDB;
thread1: INSERT INTO t SET a=1;
thread2: DELETE FROM t;
thread3: SELECT * FROM t FOR UPDATE; -- or DELETE FROM t;
The failure scenarios are like the following:
(1) The INSERT statement is being committed, waiting for lock_sys->mutex.
(2) At the time of the failure, both the DELETE and SELECT transactions
are active but have not logged any changes yet.
(3) The transaction where the !other_lock assertion fails started
lock_rec_convert_impl_to_expl().
(4) After this point, the commit of the INSERT removed the transaction from
trx_sys->rw_trx_set, in trx_erase_lists().
(5) The other transaction consulted trx_sys->rw_trx_set and determined
that there is no implicit lock. Hence, it grabbed the lock.
(6) The !other_lock assertion fails in lock_rec_add_to_queue()
for the lock_rec_convert_impl_to_expl(), because the lock was 'stolen'.
This assertion failure looks genuine, because the INSERT transaction
is still active (trx->state=TRX_STATE_ACTIVE).
The problematic step (4) was introduced in
mysql/mysql-server@e27e0e0bb75b4d35e87059816f1cc370c09890ad
which fixed something related to MVCC (covered by the test
innodb.innodb-read-view). Basically, it reintroduced an error
that had been mentioned in an earlier commit
mysql/mysql-server@a17be6963fc0d9210fa0642d3985b7219cdaf0c5:
"The active transaction was removed from trx_sys->rw_trx_set prematurely."
Our fix goes along the following lines:
(a) Implicit locks will released by assigning
trx->state=TRX_STATE_COMMITTED_IN_MEMORY as the first step.
This transition will no longer be protected by lock_sys_t::mutex,
only by trx->mutex. This idea is by Sergey Vojtovich.
(b) We detach the transaction from trx_sys before starting to release
explicit locks.
(c) All callers of trx_rw_is_active() and trx_rw_is_active_low() must
recheck trx->state after acquiring trx->mutex.
(d) Before releasing any explicit locks, we will ensure that any activity
by other threads to convert implicit locks into explicit will have ceased,
by checking !trx_is_referenced(trx). There was a glitch
in this check when it was part of lock_trx_release_locks(); at the end
we would release trx->mutex and acquire lock_sys->mutex and trx->mutex,
and fail to recheck (trx_is_referenced() is protected by trx_t::mutex).
(e) Explicit locks can be released in batches (LOCK_RELEASE_INTERVAL=1000)
just like we did before.
trx_t::state: Document that the transition to COMMITTED is only
protected by trx_t::mutex, no longer by lock_sys_t::mutex.
trx_rw_is_active_low(), trx_rw_is_active(): Document that the transaction
state should be rechecked after acquiring trx_t::mutex.
trx_t::commit_state(): New function to change a transaction to committed
state, to release implicit locks.
trx_t::release_locks(): New function to release the explicit locks
after commit_state().
lock_trx_release_locks(): Move much of the logic to the caller
(which must invoke trx_t::commit_state() and trx_t::release_locks()
as needed), and assert that the transaction will have locks.
trx_get_trx_by_xid(): Make the parameter a pointer to const.
lock_rec_other_trx_holds_expl(): Recheck trx->state after acquiring
trx->mutex, and avoid a redundant lookup of the transaction.
lock_rec_queue_validate(): Recheck impl_trx->state while holding
impl_trx->mutex.
row_vers_impl_x_locked(), row_vers_impl_x_locked_low():
Document that the transaction state must be rechecked after
trx_mutex_enter().
trx_free_prepared(): Adjust for the changes to lock_trx_release_locks().
2019-09-03 12:31:37 +03:00
|
|
|
|
|
|
|
/** Release the explicit locks of a committing transaction,
|
|
|
|
and release possible other transactions waiting because of these locks. */
|
2019-09-05 15:57:39 +03:00
|
|
|
void lock_release(trx_t* trx);
|
2014-02-26 19:11:54 +01:00
|
|
|
|
2022-04-26 18:09:03 +03:00
|
|
|
/** Release the explicit locks of a committing transaction while
|
|
|
|
dict_sys.latch is exclusively locked,
|
|
|
|
and release possible other transactions waiting because of these locks. */
|
|
|
|
void lock_release_on_drop(trx_t *trx);
|
|
|
|
|
2021-10-18 12:49:10 +03:00
|
|
|
/** Release non-exclusive locks on XA PREPARE,
|
|
|
|
and release possible other transactions waiting because of these locks. */
|
|
|
|
void lock_release_on_prepare(trx_t *trx);
|
|
|
|
|
MDEV-25691: Simplify handlerton::drop_database for InnoDB
The implementation of handlerton::drop_database in InnoDB is
unnecessarily complex. The minimal implementation should check
that no conflicting locks or references exist on the tables,
delete all table metadata in a single transaction, and finally
delete the tablespaces.
Note: DROP DATABASE will delete each individual table that the
SQL layer knows about, one table per transaction.
The handlerton::drop_database is basically a final cleanup step
for removing any garbage that could have been left behind
in InnoDB due to some bug, or not having atomic DDL in the past.
hash_node_t: Remove. Use the proper data type name in pointers.
dict_drop_index_tree(): Do not take the table as a parameter.
Instead, return the tablespace ID if the tablespace should be dropped
(we are dropping a clustered index tree).
fil_delete_tablespace(), fil_system_t::detach(): Return a single
detached file handle. Multi-file tablespaces cannot be deleted
via this interface.
ha_innobase::delete_table(): Remove a work-around for non-atomic DDL
and do not try to drop tables with similar-looking name.
innodb_drop_database(): Complete rewrite.
innobase_drop_database(), dict_get_first_table_name_in_db(),
row_drop_database_for_mysql(), drop_all_foreign_keys_in_db(): Remove.
row_purge_remove_clust_if_poss_low(), row_undo_ins_remove_clust_rec():
If the tablespace is to be deleted, try to evict the table definition
from the cache. Failing that, set dict_table_t::space to nullptr.
lock_release_on_rollback(): On the rollback of CREATE TABLE, release all
locks that the transaction had on the table, to avoid heap-use-after-free.
2021-05-18 12:53:40 +03:00
|
|
|
/** Release locks on a table whose creation is being rolled back */
|
|
|
|
ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table);
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/**********************************************************************//**
|
|
|
|
Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
|
|
|
|
if none found.
|
|
|
|
@return bit index == heap number of the record, or ULINT_UNDEFINED if
|
|
|
|
none found */
|
|
|
|
ulint
|
|
|
|
lock_rec_find_set_bit(
|
|
|
|
/*==================*/
|
|
|
|
const lock_t* lock); /*!< in: record lock with at least one
|
|
|
|
bit set */
|
|
|
|
|
|
|
|
/*********************************************************************//**
|
|
|
|
Checks if a lock request lock1 has to wait for request lock2.
|
2018-02-16 22:15:51 +03:00
|
|
|
@return whether lock1 has to wait for lock2 to be removed */
|
2018-02-13 22:03:26 +03:00
|
|
|
bool
|
2014-02-26 19:11:54 +01:00
|
|
|
lock_has_to_wait(
|
|
|
|
/*=============*/
|
|
|
|
const lock_t* lock1, /*!< in: waiting lock */
|
|
|
|
const lock_t* lock2); /*!< in: another lock; NOTE that it is
|
|
|
|
assumed that this has a lock bit set
|
|
|
|
on the same record as in lock1 if the
|
|
|
|
locks are record locks */
|
|
|
|
/*********************************************************************//**
|
|
|
|
Reports that a transaction id is insensible, i.e., in the future. */
|
2020-06-04 10:24:10 +03:00
|
|
|
ATTRIBUTE_COLD
|
2014-02-26 19:11:54 +01:00
|
|
|
void
|
|
|
|
lock_report_trx_id_insanity(
|
|
|
|
/*========================*/
|
|
|
|
trx_id_t trx_id, /*!< in: trx id */
|
|
|
|
const rec_t* rec, /*!< in: user record */
|
|
|
|
dict_index_t* index, /*!< in: index */
|
2020-04-28 10:46:51 +10:00
|
|
|
const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
|
2017-12-22 16:15:41 +02:00
|
|
|
trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Prints info of locks for all transactions.
|
2021-02-11 14:52:10 +02:00
|
|
|
@return FALSE if not able to acquire lock_sys.latch (and display info) */
|
2014-02-26 19:11:54 +01:00
|
|
|
ibool
|
|
|
|
lock_print_info_summary(
|
|
|
|
/*====================*/
|
|
|
|
FILE* file, /*!< in: file where to print */
|
2021-02-11 14:52:10 +02:00
|
|
|
ibool nowait) /*!< in: whether to wait for lock_sys.latch */
|
2016-09-06 09:43:16 +03:00
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** Prints transaction lock wait and MVCC state.
|
|
|
|
@param[in,out] file file where to print
|
2019-07-25 12:08:50 +03:00
|
|
|
@param[in] trx transaction
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
@param[in] now current my_hrtime_coarse() */
|
|
|
|
void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx,
|
|
|
|
my_hrtime_t now);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
2021-02-11 14:52:10 +02:00
|
|
|
Prints info of locks for each transaction. This function will release
|
|
|
|
lock_sys.latch, which the caller must be holding in exclusive mode. */
|
2014-02-26 19:11:54 +01:00
|
|
|
void
|
|
|
|
lock_print_info_all_transactions(
|
|
|
|
/*=============================*/
|
|
|
|
FILE* file); /*!< in: file where to print */
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/*********************************************************************//**
|
|
|
|
Return the number of table locks for a transaction.
|
2021-02-11 14:52:10 +02:00
|
|
|
The caller must be holding lock_sys.latch. */
|
2016-08-12 11:17:45 +03:00
|
|
|
ulint
|
|
|
|
lock_number_of_tables_locked(
|
|
|
|
/*=========================*/
|
|
|
|
const trx_lock_t* trx_lock) /*!< in: transaction locks */
|
2016-09-06 09:43:16 +03:00
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2021-02-28 13:46:16 +02:00
|
|
|
/** Check if there are any locks on a table.
|
|
|
|
@return true if table has either table or record locks. */
|
|
|
|
bool lock_table_has_locks(dict_table_t *table);
|
2014-02-26 19:11:54 +01:00
|
|
|
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
/** Wait for a lock to be released.
|
|
|
|
@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
|
|
|
|
@retval DB_INTERRUPTED if the execution was interrupted by the user
|
|
|
|
@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
|
|
|
|
@retval DB_SUCCESS if the lock was granted */
|
|
|
|
dberr_t lock_wait(que_thr_t *thr);
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
|
|
|
|
function should be called at the the end of an SQL statement, by the
|
|
|
|
connection thread that owns the transaction (trx->mysql_thd). */
|
|
|
|
void
|
|
|
|
lock_unlock_table_autoinc(
|
|
|
|
/*======================*/
|
|
|
|
trx_t* trx); /*!< in/out: transaction */
|
2021-02-28 13:46:16 +02:00
|
|
|
|
|
|
|
/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
|
|
|
|
while holding a clustered index leaf page latch.
|
|
|
|
@param trx transaction that is or was waiting for a lock
|
|
|
|
@retval DB_SUCCESS if the lock was granted
|
|
|
|
@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
|
|
|
|
@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
|
|
|
|
lock request was released */
|
|
|
|
dberr_t lock_trx_handle_wait(trx_t *trx);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/*********************************************************************//**
|
|
|
|
Checks that a transaction id is sensible, i.e., not in the future.
|
2016-08-12 11:17:45 +03:00
|
|
|
@return true if ok */
|
2014-02-26 19:11:54 +01:00
|
|
|
bool
|
|
|
|
lock_check_trx_id_sanity(
|
|
|
|
/*=====================*/
|
|
|
|
trx_id_t trx_id, /*!< in: trx id */
|
|
|
|
const rec_t* rec, /*!< in: user record */
|
|
|
|
dict_index_t* index, /*!< in: index */
|
2020-05-04 16:47:11 +02:00
|
|
|
const rec_offs* offsets); /*!< in: rec_get_offsets(rec, index) */
|
2017-12-27 20:07:20 +04:00
|
|
|
#ifdef UNIV_DEBUG
|
2014-02-26 19:11:54 +01:00
|
|
|
/*******************************************************************//**
|
|
|
|
Check if the transaction holds any locks on the sys tables
|
|
|
|
or its records.
|
2016-08-12 11:17:45 +03:00
|
|
|
@return the strongest lock found on any sys table or 0 for none */
|
2014-02-26 19:11:54 +01:00
|
|
|
const lock_t*
|
|
|
|
lock_trx_has_sys_table_locks(
|
|
|
|
/*=========================*/
|
|
|
|
const trx_t* trx) /*!< in: transaction to check */
|
2018-07-03 15:10:06 +03:00
|
|
|
MY_ATTRIBUTE((nonnull, warn_unused_result));
|
2014-02-26 19:11:54 +01:00
|
|
|
|
2018-07-03 15:10:06 +03:00
|
|
|
/** Check if the transaction holds an explicit exclusive lock on a record.
|
|
|
|
@param[in] trx transaction
|
|
|
|
@param[in] table table
|
2021-02-05 18:29:30 +02:00
|
|
|
@param[in] id leaf page identifier
|
2018-07-03 15:10:06 +03:00
|
|
|
@param[in] heap_no heap number identifying the record
|
|
|
|
@return whether an explicit X-lock is held */
|
2021-02-05 18:29:30 +02:00
|
|
|
bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
|
|
|
|
page_id_t id, ulint heap_no);
|
2014-02-26 19:11:54 +01:00
|
|
|
#endif /* UNIV_DEBUG */
|
|
|
|
|
|
|
|
/** Lock operation struct */
|
|
|
|
struct lock_op_t{
|
|
|
|
dict_table_t* table; /*!< table to be locked */
|
2016-08-12 11:17:45 +03:00
|
|
|
lock_mode mode; /*!< lock mode */
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/** The lock system struct */
|
2018-02-22 20:46:42 +04:00
|
|
|
class lock_sys_t
|
|
|
|
{
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
friend struct LockGuard;
|
|
|
|
friend struct LockMultiGuard;
|
2021-10-22 12:38:45 +03:00
|
|
|
friend struct TMLockGuard;
|
|
|
|
friend struct TMLockMutexGuard;
|
|
|
|
friend struct TMLockTrxGuard;
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
|
|
|
|
/** Hash table latch */
|
|
|
|
struct hash_latch
|
MDEV-25404: ssux_lock_low: Introduce a separate writer mutex
Having both readers and writers use a single lock word in
futex system calls caused performance regression compared to
SRW_LOCK_DUMMY (mutex and 2 condition variables).
A contributing factor is that we did not accurately keep
track of the number of waiting threads and thus had to invoke
system calls to wake up any waiting threads.
SUX_LOCK_GENERIC: Renamed from SRW_LOCK_DUMMY. This is the
original implementation, with rw_lock (std::atomic<uint32_t>),
a mutex and two condition variables. Using a separate writer
mutex (as described below) is not possible, because the mutex ownership
in a buf_block_t::lock must be able to transfer from a write submitter
thread to an I/O completion thread, and pthread_mutex_lock() may assume
that the submitter thread is recursively acquiring the mutex that it
already holds, while in reality the I/O completion thread is the real
owner. POSIX does not define an interface for requesting a mutex to
be non-recursive.
On Microsoft Windows, srw_lock_low will remain a simple wrapper of
SRWLOCK. On 32-bit Microsoft Windows, sizeof(SRWLOCK)=4 while
sizeof(srw_lock_low)=8.
On other platforms, srw_lock_low is an alias of ssux_lock_low,
the Simple (non-recursive) Shared/Update/eXclusive lock.
In the futex-based implementation of ssux_lock_low (Linux, OpenBSD,
Microsoft Windows), we shall use a dedicated mutex for exclusive
requests (writer), and have a WRITER flag in the 'readers' lock word
to inform that a writer is holding the lock or waiting for the lock to
be granted. When the WRITER flag is set, all lock requests must acquire
the writer mutex. Normally, shared (S) lock requests simply perform a
compare-and-swap on the 'readers' word.
Update locks are implemented as a combination of writer mutex
and a normal counter in the 'readers' lock word. The conflict between
U and X locks is guaranteed by the writer mutex.
Unlike SUX_LOCK_GENERIC, wr_u_downgrade() will not wake up any pending
rd_lock() waits. They will wait until u_unlock() releases the writer mutex.
The ssux_lock_low is always wrapped by sux_lock (with a recursion count
of U and X locks), used for dict_index_t::lock and buf_block_t::lock.
Their memory footprint for the futex-based implementation will increase
by sizeof(srw_mutex), or 4 bytes.
This change addresses a performance regression in read-only benchmarks,
such as sysbench oltp_read_only. Also write performance was improved.
On 32-bit Linux and OpenBSD, lock_sys_t::hash_table will allocate
two hash table elements for each srw_lock (14 instead of 15 hash
table cells per 64-byte cache line on IA-32). On Microsoft Windows,
sizeof(SRWLOCK)==sizeof(void*) and there is no change.
Reviewed by: Vladislav Vaintroub
Tested by: Axel Schwenke and Vladislav Vaintroub
2021-04-19 18:15:49 +03:00
|
|
|
#ifdef SUX_LOCK_GENERIC
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
: private rw_lock
|
|
|
|
{
|
|
|
|
/** Wait for an exclusive lock */
|
|
|
|
void wait();
|
2021-02-12 17:42:18 +02:00
|
|
|
/** Try to acquire a lock */
|
|
|
|
bool try_acquire() { return write_trylock(); }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Acquire a lock */
|
2021-02-12 17:42:18 +02:00
|
|
|
void acquire() { if (!try_acquire()) wait(); }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Release a lock */
|
|
|
|
void release();
|
2021-10-22 12:38:45 +03:00
|
|
|
/** @return whether any lock is being held or waited for by any thread */
|
|
|
|
bool is_locked_or_waiting() const
|
|
|
|
{ return rw_lock::is_locked_or_waiting(); }
|
|
|
|
/** @return whether this latch is possibly held by any thread */
|
|
|
|
bool is_locked() const { return rw_lock::is_locked(); }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
#else
|
|
|
|
{
|
|
|
|
private:
|
2021-09-06 12:32:24 +03:00
|
|
|
srw_spin_lock_low lock;
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
public:
|
2021-02-12 17:42:18 +02:00
|
|
|
/** Try to acquire a lock */
|
|
|
|
bool try_acquire() { return lock.wr_lock_try(); }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Acquire a lock */
|
|
|
|
void acquire() { lock.wr_lock(); }
|
|
|
|
/** Release a lock */
|
|
|
|
void release() { lock.wr_unlock(); }
|
2021-10-22 12:38:45 +03:00
|
|
|
/** @return whether any lock may be held by any thread */
|
|
|
|
bool is_locked_or_waiting() const noexcept
|
|
|
|
{ return lock.is_locked_or_waiting(); }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** @return whether this latch is possibly held by any thread */
|
2021-10-22 12:38:45 +03:00
|
|
|
bool is_locked() const noexcept { return lock.is_locked(); }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
public:
|
|
|
|
struct hash_table
|
|
|
|
{
|
MDEV-25404: ssux_lock_low: Introduce a separate writer mutex
Having both readers and writers use a single lock word in
futex system calls caused performance regression compared to
SRW_LOCK_DUMMY (mutex and 2 condition variables).
A contributing factor is that we did not accurately keep
track of the number of waiting threads and thus had to invoke
system calls to wake up any waiting threads.
SUX_LOCK_GENERIC: Renamed from SRW_LOCK_DUMMY. This is the
original implementation, with rw_lock (std::atomic<uint32_t>),
a mutex and two condition variables. Using a separate writer
mutex (as described below) is not possible, because the mutex ownership
in a buf_block_t::lock must be able to transfer from a write submitter
thread to an I/O completion thread, and pthread_mutex_lock() may assume
that the submitter thread is recursively acquiring the mutex that it
already holds, while in reality the I/O completion thread is the real
owner. POSIX does not define an interface for requesting a mutex to
be non-recursive.
On Microsoft Windows, srw_lock_low will remain a simple wrapper of
SRWLOCK. On 32-bit Microsoft Windows, sizeof(SRWLOCK)=4 while
sizeof(srw_lock_low)=8.
On other platforms, srw_lock_low is an alias of ssux_lock_low,
the Simple (non-recursive) Shared/Update/eXclusive lock.
In the futex-based implementation of ssux_lock_low (Linux, OpenBSD,
Microsoft Windows), we shall use a dedicated mutex for exclusive
requests (writer), and have a WRITER flag in the 'readers' lock word
to inform that a writer is holding the lock or waiting for the lock to
be granted. When the WRITER flag is set, all lock requests must acquire
the writer mutex. Normally, shared (S) lock requests simply perform a
compare-and-swap on the 'readers' word.
Update locks are implemented as a combination of writer mutex
and a normal counter in the 'readers' lock word. The conflict between
U and X locks is guaranteed by the writer mutex.
Unlike SUX_LOCK_GENERIC, wr_u_downgrade() will not wake up any pending
rd_lock() waits. They will wait until u_unlock() releases the writer mutex.
The ssux_lock_low is always wrapped by sux_lock (with a recursion count
of U and X locks), used for dict_index_t::lock and buf_block_t::lock.
Their memory footprint for the futex-based implementation will increase
by sizeof(srw_mutex), or 4 bytes.
This change addresses a performance regression in read-only benchmarks,
such as sysbench oltp_read_only. Also write performance was improved.
On 32-bit Linux and OpenBSD, lock_sys_t::hash_table will allocate
two hash table elements for each srw_lock (14 instead of 15 hash
table cells per 64-byte cache line on IA-32). On Microsoft Windows,
sizeof(SRWLOCK)==sizeof(void*) and there is no change.
Reviewed by: Vladislav Vaintroub
Tested by: Axel Schwenke and Vladislav Vaintroub
2021-04-19 18:15:49 +03:00
|
|
|
/** Number of consecutive array[] elements occupied by a hash_latch */
|
|
|
|
static constexpr size_t LATCH= sizeof(void*) >= sizeof(hash_latch) ? 1 : 2;
|
|
|
|
static_assert(sizeof(hash_latch) <= LATCH * sizeof(void*), "allocation");
|
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Number of array[] elements per hash_latch.
|
MDEV-25404: ssux_lock_low: Introduce a separate writer mutex
Having both readers and writers use a single lock word in
futex system calls caused performance regression compared to
SRW_LOCK_DUMMY (mutex and 2 condition variables).
A contributing factor is that we did not accurately keep
track of the number of waiting threads and thus had to invoke
system calls to wake up any waiting threads.
SUX_LOCK_GENERIC: Renamed from SRW_LOCK_DUMMY. This is the
original implementation, with rw_lock (std::atomic<uint32_t>),
a mutex and two condition variables. Using a separate writer
mutex (as described below) is not possible, because the mutex ownership
in a buf_block_t::lock must be able to transfer from a write submitter
thread to an I/O completion thread, and pthread_mutex_lock() may assume
that the submitter thread is recursively acquiring the mutex that it
already holds, while in reality the I/O completion thread is the real
owner. POSIX does not define an interface for requesting a mutex to
be non-recursive.
On Microsoft Windows, srw_lock_low will remain a simple wrapper of
SRWLOCK. On 32-bit Microsoft Windows, sizeof(SRWLOCK)=4 while
sizeof(srw_lock_low)=8.
On other platforms, srw_lock_low is an alias of ssux_lock_low,
the Simple (non-recursive) Shared/Update/eXclusive lock.
In the futex-based implementation of ssux_lock_low (Linux, OpenBSD,
Microsoft Windows), we shall use a dedicated mutex for exclusive
requests (writer), and have a WRITER flag in the 'readers' lock word
to inform that a writer is holding the lock or waiting for the lock to
be granted. When the WRITER flag is set, all lock requests must acquire
the writer mutex. Normally, shared (S) lock requests simply perform a
compare-and-swap on the 'readers' word.
Update locks are implemented as a combination of writer mutex
and a normal counter in the 'readers' lock word. The conflict between
U and X locks is guaranteed by the writer mutex.
Unlike SUX_LOCK_GENERIC, wr_u_downgrade() will not wake up any pending
rd_lock() waits. They will wait until u_unlock() releases the writer mutex.
The ssux_lock_low is always wrapped by sux_lock (with a recursion count
of U and X locks), used for dict_index_t::lock and buf_block_t::lock.
Their memory footprint for the futex-based implementation will increase
by sizeof(srw_mutex), or 4 bytes.
This change addresses a performance regression in read-only benchmarks,
such as sysbench oltp_read_only. Also write performance was improved.
On 32-bit Linux and OpenBSD, lock_sys_t::hash_table will allocate
two hash table elements for each srw_lock (14 instead of 15 hash
table cells per 64-byte cache line on IA-32). On Microsoft Windows,
sizeof(SRWLOCK)==sizeof(void*) and there is no change.
Reviewed by: Vladislav Vaintroub
Tested by: Axel Schwenke and Vladislav Vaintroub
2021-04-19 18:15:49 +03:00
|
|
|
Must be LATCH less than a power of 2. */
|
2021-09-15 16:18:39 +08:00
|
|
|
static constexpr size_t ELEMENTS_PER_LATCH= (64 / sizeof(void*)) - LATCH;
|
|
|
|
static constexpr size_t EMPTY_SLOTS_PER_LATCH=
|
|
|
|
((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
|
|
|
|
/** number of payload elements in array[]. Protected by lock_sys.latch. */
|
|
|
|
ulint n_cells;
|
|
|
|
/** the hash table, with pad(n_cells) elements, aligned to L1 cache size;
|
|
|
|
in any hash chain, lock_t::is_waiting() entries must not precede
|
|
|
|
granted locks */
|
|
|
|
hash_cell_t *array;
|
|
|
|
|
|
|
|
/** Create the hash table.
|
|
|
|
@param n the lower bound of n_cells */
|
|
|
|
void create(ulint n);
|
|
|
|
|
|
|
|
/** Resize the hash table.
|
|
|
|
@param n the lower bound of n_cells */
|
|
|
|
void resize(ulint n);
|
|
|
|
|
|
|
|
/** Free the hash table. */
|
|
|
|
void free() { aligned_free(array); array= nullptr; }
|
|
|
|
|
|
|
|
/** @return the index of an array element */
|
|
|
|
inline ulint calc_hash(ulint fold) const;
|
2021-09-15 16:18:39 +08:00
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** @return raw array index converted to padded index */
|
MDEV-25404: ssux_lock_low: Introduce a separate writer mutex
Having both readers and writers use a single lock word in
futex system calls caused performance regression compared to
SRW_LOCK_DUMMY (mutex and 2 condition variables).
A contributing factor is that we did not accurately keep
track of the number of waiting threads and thus had to invoke
system calls to wake up any waiting threads.
SUX_LOCK_GENERIC: Renamed from SRW_LOCK_DUMMY. This is the
original implementation, with rw_lock (std::atomic<uint32_t>),
a mutex and two condition variables. Using a separate writer
mutex (as described below) is not possible, because the mutex ownership
in a buf_block_t::lock must be able to transfer from a write submitter
thread to an I/O completion thread, and pthread_mutex_lock() may assume
that the submitter thread is recursively acquiring the mutex that it
already holds, while in reality the I/O completion thread is the real
owner. POSIX does not define an interface for requesting a mutex to
be non-recursive.
On Microsoft Windows, srw_lock_low will remain a simple wrapper of
SRWLOCK. On 32-bit Microsoft Windows, sizeof(SRWLOCK)=4 while
sizeof(srw_lock_low)=8.
On other platforms, srw_lock_low is an alias of ssux_lock_low,
the Simple (non-recursive) Shared/Update/eXclusive lock.
In the futex-based implementation of ssux_lock_low (Linux, OpenBSD,
Microsoft Windows), we shall use a dedicated mutex for exclusive
requests (writer), and have a WRITER flag in the 'readers' lock word
to inform that a writer is holding the lock or waiting for the lock to
be granted. When the WRITER flag is set, all lock requests must acquire
the writer mutex. Normally, shared (S) lock requests simply perform a
compare-and-swap on the 'readers' word.
Update locks are implemented as a combination of writer mutex
and a normal counter in the 'readers' lock word. The conflict between
U and X locks is guaranteed by the writer mutex.
Unlike SUX_LOCK_GENERIC, wr_u_downgrade() will not wake up any pending
rd_lock() waits. They will wait until u_unlock() releases the writer mutex.
The ssux_lock_low is always wrapped by sux_lock (with a recursion count
of U and X locks), used for dict_index_t::lock and buf_block_t::lock.
Their memory footprint for the futex-based implementation will increase
by sizeof(srw_mutex), or 4 bytes.
This change addresses a performance regression in read-only benchmarks,
such as sysbench oltp_read_only. Also write performance was improved.
On 32-bit Linux and OpenBSD, lock_sys_t::hash_table will allocate
two hash table elements for each srw_lock (14 instead of 15 hash
table cells per 64-byte cache line on IA-32). On Microsoft Windows,
sizeof(SRWLOCK)==sizeof(void*) and there is no change.
Reviewed by: Vladislav Vaintroub
Tested by: Axel Schwenke and Vladislav Vaintroub
2021-04-19 18:15:49 +03:00
|
|
|
static ulint pad(ulint h)
|
2021-09-15 16:18:39 +08:00
|
|
|
{
|
|
|
|
ulint latches= LATCH * (h / ELEMENTS_PER_LATCH);
|
|
|
|
ulint empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
|
|
|
|
return LATCH + latches + empty_slots + h;
|
|
|
|
}
|
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Get a latch. */
|
2021-02-22 18:32:51 +02:00
|
|
|
static hash_latch *latch(hash_cell_t *cell)
|
|
|
|
{
|
MDEV-25404: ssux_lock_low: Introduce a separate writer mutex
Having both readers and writers use a single lock word in
futex system calls caused performance regression compared to
SRW_LOCK_DUMMY (mutex and 2 condition variables).
A contributing factor is that we did not accurately keep
track of the number of waiting threads and thus had to invoke
system calls to wake up any waiting threads.
SUX_LOCK_GENERIC: Renamed from SRW_LOCK_DUMMY. This is the
original implementation, with rw_lock (std::atomic<uint32_t>),
a mutex and two condition variables. Using a separate writer
mutex (as described below) is not possible, because the mutex ownership
in a buf_block_t::lock must be able to transfer from a write submitter
thread to an I/O completion thread, and pthread_mutex_lock() may assume
that the submitter thread is recursively acquiring the mutex that it
already holds, while in reality the I/O completion thread is the real
owner. POSIX does not define an interface for requesting a mutex to
be non-recursive.
On Microsoft Windows, srw_lock_low will remain a simple wrapper of
SRWLOCK. On 32-bit Microsoft Windows, sizeof(SRWLOCK)=4 while
sizeof(srw_lock_low)=8.
On other platforms, srw_lock_low is an alias of ssux_lock_low,
the Simple (non-recursive) Shared/Update/eXclusive lock.
In the futex-based implementation of ssux_lock_low (Linux, OpenBSD,
Microsoft Windows), we shall use a dedicated mutex for exclusive
requests (writer), and have a WRITER flag in the 'readers' lock word
to inform that a writer is holding the lock or waiting for the lock to
be granted. When the WRITER flag is set, all lock requests must acquire
the writer mutex. Normally, shared (S) lock requests simply perform a
compare-and-swap on the 'readers' word.
Update locks are implemented as a combination of writer mutex
and a normal counter in the 'readers' lock word. The conflict between
U and X locks is guaranteed by the writer mutex.
Unlike SUX_LOCK_GENERIC, wr_u_downgrade() will not wake up any pending
rd_lock() waits. They will wait until u_unlock() releases the writer mutex.
The ssux_lock_low is always wrapped by sux_lock (with a recursion count
of U and X locks), used for dict_index_t::lock and buf_block_t::lock.
Their memory footprint for the futex-based implementation will increase
by sizeof(srw_mutex), or 4 bytes.
This change addresses a performance regression in read-only benchmarks,
such as sysbench oltp_read_only. Also write performance was improved.
On 32-bit Linux and OpenBSD, lock_sys_t::hash_table will allocate
two hash table elements for each srw_lock (14 instead of 15 hash
table cells per 64-byte cache line on IA-32). On Microsoft Windows,
sizeof(SRWLOCK)==sizeof(void*) and there is no change.
Reviewed by: Vladislav Vaintroub
Tested by: Axel Schwenke and Vladislav Vaintroub
2021-04-19 18:15:49 +03:00
|
|
|
void *l= ut_align_down(cell, sizeof *cell *
|
|
|
|
(ELEMENTS_PER_LATCH + LATCH));
|
2021-02-22 18:32:51 +02:00
|
|
|
return static_cast<hash_latch*>(l);
|
|
|
|
}
|
|
|
|
/** Get a hash table cell. */
|
|
|
|
inline hash_cell_t *cell_get(ulint fold) const;
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
|
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
void assert_locked(const page_id_t id) const;
|
|
|
|
#else
|
|
|
|
void assert_locked(const page_id_t) const {}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
private:
|
|
|
|
/** @return the hash value before any ELEMENTS_PER_LATCH padding */
|
|
|
|
static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
|
|
|
|
|
|
|
|
/** @return the index of an array element */
|
|
|
|
static ulint calc_hash(ulint fold, ulint n_cells)
|
|
|
|
{
|
|
|
|
return pad(hash(fold, n_cells));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
private:
|
2018-02-22 20:46:42 +04:00
|
|
|
bool m_initialised;
|
|
|
|
|
2020-12-04 16:18:04 +02:00
|
|
|
/** mutex proteting the locks */
|
2022-04-14 10:40:26 +03:00
|
|
|
alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
|
2021-02-11 14:52:10 +02:00
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
/** The owner of exclusive latch (0 if none); protected by latch */
|
2022-04-19 13:49:52 +03:00
|
|
|
std::atomic<pthread_t> writer{0};
|
2021-02-11 14:52:10 +02:00
|
|
|
/** Number of shared latches */
|
|
|
|
std::atomic<ulint> readers{0};
|
|
|
|
#endif
|
MDEV-25404: ssux_lock_low: Introduce a separate writer mutex
Having both readers and writers use a single lock word in
futex system calls caused performance regression compared to
SRW_LOCK_DUMMY (mutex and 2 condition variables).
A contributing factor is that we did not accurately keep
track of the number of waiting threads and thus had to invoke
system calls to wake up any waiting threads.
SUX_LOCK_GENERIC: Renamed from SRW_LOCK_DUMMY. This is the
original implementation, with rw_lock (std::atomic<uint32_t>),
a mutex and two condition variables. Using a separate writer
mutex (as described below) is not possible, because the mutex ownership
in a buf_block_t::lock must be able to transfer from a write submitter
thread to an I/O completion thread, and pthread_mutex_lock() may assume
that the submitter thread is recursively acquiring the mutex that it
already holds, while in reality the I/O completion thread is the real
owner. POSIX does not define an interface for requesting a mutex to
be non-recursive.
On Microsoft Windows, srw_lock_low will remain a simple wrapper of
SRWLOCK. On 32-bit Microsoft Windows, sizeof(SRWLOCK)=4 while
sizeof(srw_lock_low)=8.
On other platforms, srw_lock_low is an alias of ssux_lock_low,
the Simple (non-recursive) Shared/Update/eXclusive lock.
In the futex-based implementation of ssux_lock_low (Linux, OpenBSD,
Microsoft Windows), we shall use a dedicated mutex for exclusive
requests (writer), and have a WRITER flag in the 'readers' lock word
to inform that a writer is holding the lock or waiting for the lock to
be granted. When the WRITER flag is set, all lock requests must acquire
the writer mutex. Normally, shared (S) lock requests simply perform a
compare-and-swap on the 'readers' word.
Update locks are implemented as a combination of writer mutex
and a normal counter in the 'readers' lock word. The conflict between
U and X locks is guaranteed by the writer mutex.
Unlike SUX_LOCK_GENERIC, wr_u_downgrade() will not wake up any pending
rd_lock() waits. They will wait until u_unlock() releases the writer mutex.
The ssux_lock_low is always wrapped by sux_lock (with a recursion count
of U and X locks), used for dict_index_t::lock and buf_block_t::lock.
Their memory footprint for the futex-based implementation will increase
by sizeof(srw_mutex), or 4 bytes.
This change addresses a performance regression in read-only benchmarks,
such as sysbench oltp_read_only. Also write performance was improved.
On 32-bit Linux and OpenBSD, lock_sys_t::hash_table will allocate
two hash table elements for each srw_lock (14 instead of 15 hash
table cells per 64-byte cache line on IA-32). On Microsoft Windows,
sizeof(SRWLOCK)==sizeof(void*) and there is no change.
Reviewed by: Vladislav Vaintroub
Tested by: Axel Schwenke and Vladislav Vaintroub
2021-04-19 18:15:49 +03:00
|
|
|
#ifdef SUX_LOCK_GENERIC
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
protected:
|
|
|
|
/** mutex for hash_latch::wait() */
|
|
|
|
pthread_mutex_t hash_mutex;
|
|
|
|
/** condition variable for hash_latch::wait() */
|
|
|
|
pthread_cond_t hash_cond;
|
|
|
|
#endif
|
2020-12-04 19:02:58 +02:00
|
|
|
public:
|
2020-06-18 12:26:28 +03:00
|
|
|
/** record locks */
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
hash_table rec_hash;
|
2020-06-18 12:26:28 +03:00
|
|
|
/** predicate locks for SPATIAL INDEX */
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
hash_table prdt_hash;
|
2020-06-18 12:26:28 +03:00
|
|
|
/** page locks for SPATIAL INDEX */
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
hash_table prdt_page_hash;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
/** mutex covering lock waits; @see trx_lock_t::wait_lock */
|
2022-04-14 10:40:26 +03:00
|
|
|
alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
private:
|
2021-03-09 08:58:28 +02:00
|
|
|
/** The increment of wait_count for a wait. Anything smaller is a
|
|
|
|
pending wait count. */
|
|
|
|
static constexpr uint64_t WAIT_COUNT_STEP= 1U << 19;
|
|
|
|
/** waits and total number of lock waits; protected by wait_mutex */
|
|
|
|
uint64_t wait_count;
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
/** Cumulative wait time; protected by wait_mutex */
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
uint32_t wait_time;
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
/** Longest wait time; protected by wait_mutex */
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
uint32_t wait_time_max;
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
public:
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
/** number of deadlocks detected; protected by wait_mutex */
|
|
|
|
ulint deadlocks;
|
2021-02-26 14:52:51 +02:00
|
|
|
/** number of lock wait timeouts; protected by wait_mutex */
|
|
|
|
ulint timeouts;
|
2018-02-22 20:46:42 +04:00
|
|
|
/**
|
|
|
|
Constructor.
|
|
|
|
|
|
|
|
Some members may require late initialisation, thus we just mark object as
|
|
|
|
uninitialised. Real initialisation happens in create().
|
|
|
|
*/
|
|
|
|
lock_sys_t(): m_initialised(false) {}
|
|
|
|
|
|
|
|
|
2021-02-11 14:52:10 +02:00
|
|
|
bool is_initialised() const { return m_initialised; }
|
2018-02-22 20:46:42 +04:00
|
|
|
|
2021-02-11 14:52:10 +02:00
|
|
|
#ifdef UNIV_PFS_RWLOCK
|
|
|
|
/** Acquire exclusive lock_sys.latch */
|
|
|
|
ATTRIBUTE_NOINLINE
|
|
|
|
void wr_lock(const char *file, unsigned line);
|
|
|
|
/** Release exclusive lock_sys.latch */
|
|
|
|
ATTRIBUTE_NOINLINE void wr_unlock();
|
|
|
|
/** Acquire shared lock_sys.latch */
|
|
|
|
ATTRIBUTE_NOINLINE void rd_lock(const char *file, unsigned line);
|
|
|
|
/** Release shared lock_sys.latch */
|
|
|
|
ATTRIBUTE_NOINLINE void rd_unlock();
|
2020-12-04 19:02:58 +02:00
|
|
|
#else
|
2021-02-11 14:52:10 +02:00
|
|
|
/** Acquire exclusive lock_sys.latch */
|
|
|
|
void wr_lock()
|
|
|
|
{
|
2021-04-08 13:32:16 +03:00
|
|
|
mysql_mutex_assert_not_owner(&wait_mutex);
|
2021-02-11 14:52:10 +02:00
|
|
|
ut_ad(!is_writer());
|
|
|
|
latch.wr_lock();
|
2022-04-19 13:49:52 +03:00
|
|
|
ut_ad(!writer.exchange(pthread_self(),
|
2021-02-11 14:52:10 +02:00
|
|
|
std::memory_order_relaxed));
|
|
|
|
}
|
|
|
|
/** Release exclusive lock_sys.latch */
|
|
|
|
void wr_unlock()
|
|
|
|
{
|
|
|
|
ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
|
2022-04-19 13:49:52 +03:00
|
|
|
pthread_self());
|
2021-02-11 14:52:10 +02:00
|
|
|
latch.wr_unlock();
|
|
|
|
}
|
|
|
|
/** Acquire shared lock_sys.latch */
|
|
|
|
void rd_lock()
|
|
|
|
{
|
2021-04-08 13:32:16 +03:00
|
|
|
mysql_mutex_assert_not_owner(&wait_mutex);
|
2021-02-11 14:52:10 +02:00
|
|
|
ut_ad(!is_writer());
|
|
|
|
latch.rd_lock();
|
|
|
|
ut_ad(!writer.load(std::memory_order_relaxed));
|
|
|
|
ut_d(readers.fetch_add(1, std::memory_order_relaxed));
|
|
|
|
}
|
|
|
|
/** Release shared lock_sys.latch */
|
|
|
|
void rd_unlock()
|
|
|
|
{
|
|
|
|
ut_ad(!is_writer());
|
|
|
|
ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
|
|
|
|
latch.rd_unlock();
|
|
|
|
}
|
2020-12-04 19:02:58 +02:00
|
|
|
#endif
|
2021-02-11 14:52:10 +02:00
|
|
|
/** Try to acquire exclusive lock_sys.latch
|
|
|
|
@return whether the latch was acquired */
|
|
|
|
bool wr_lock_try()
|
|
|
|
{
|
|
|
|
ut_ad(!is_writer());
|
|
|
|
if (!latch.wr_lock_try()) return false;
|
2022-04-19 13:49:52 +03:00
|
|
|
ut_ad(!writer.exchange(pthread_self(),
|
2021-02-11 14:52:10 +02:00
|
|
|
std::memory_order_relaxed));
|
|
|
|
return true;
|
|
|
|
}
|
2021-02-28 13:46:16 +02:00
|
|
|
/** Try to acquire shared lock_sys.latch
|
|
|
|
@return whether the latch was acquired */
|
|
|
|
bool rd_lock_try()
|
|
|
|
{
|
|
|
|
ut_ad(!is_writer());
|
|
|
|
if (!latch.rd_lock_try()) return false;
|
|
|
|
ut_ad(!writer.load(std::memory_order_relaxed));
|
|
|
|
ut_d(readers.fetch_add(1, std::memory_order_relaxed));
|
|
|
|
return true;
|
|
|
|
}
|
2020-12-04 19:02:58 +02:00
|
|
|
|
2021-02-11 14:52:10 +02:00
|
|
|
/** Assert that wr_lock() has been invoked by this thread */
|
|
|
|
void assert_locked() const { ut_ad(is_writer()); }
|
|
|
|
/** Assert that wr_lock() has not been invoked by this thread */
|
|
|
|
void assert_unlocked() const { ut_ad(!is_writer()); }
|
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
/** @return whether the current thread is the lock_sys.latch writer */
|
|
|
|
bool is_writer() const
|
2021-10-22 12:38:45 +03:00
|
|
|
{
|
|
|
|
# ifdef SUX_LOCK_GENERIC
|
2022-04-19 13:49:52 +03:00
|
|
|
return writer.load(std::memory_order_relaxed) == pthread_self();
|
2021-10-22 12:38:45 +03:00
|
|
|
# else
|
2022-04-19 13:49:52 +03:00
|
|
|
return writer.load(std::memory_order_relaxed) == pthread_self() ||
|
2021-10-22 12:38:45 +03:00
|
|
|
(xtest() && !latch.is_locked_or_waiting());
|
|
|
|
# endif
|
|
|
|
}
|
2021-02-22 18:32:51 +02:00
|
|
|
/** Assert that a lock shard is exclusively latched (by some thread) */
|
2021-02-11 14:52:10 +02:00
|
|
|
void assert_locked(const lock_t &lock) const;
|
|
|
|
/** Assert that a table lock shard is exclusively latched by this thread */
|
|
|
|
void assert_locked(const dict_table_t &table) const;
|
2021-02-22 18:32:51 +02:00
|
|
|
/** Assert that a hash table cell is exclusively latched (by some thread) */
|
|
|
|
void assert_locked(const hash_cell_t &cell) const;
|
2021-02-11 14:52:10 +02:00
|
|
|
#else
|
|
|
|
void assert_locked(const lock_t &) const {}
|
|
|
|
void assert_locked(const dict_table_t &) const {}
|
2021-02-22 18:32:51 +02:00
|
|
|
void assert_locked(const hash_cell_t &) const {}
|
2021-02-11 14:52:10 +02:00
|
|
|
#endif
|
2018-02-22 20:46:42 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
Creates the lock system at database start.
|
|
|
|
|
|
|
|
@param[in] n_cells number of slots in lock hash table
|
|
|
|
*/
|
|
|
|
void create(ulint n_cells);
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
Resize the lock hash table.
|
|
|
|
|
|
|
|
@param[in] n_cells number of slots in lock hash table
|
|
|
|
*/
|
|
|
|
void resize(ulint n_cells);
|
|
|
|
|
|
|
|
|
|
|
|
/** Closes the lock system at database shutdown. */
|
|
|
|
void close();
|
2020-09-11 15:55:30 +03:00
|
|
|
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
|
2021-02-28 13:46:16 +02:00
|
|
|
/** Check for deadlocks while holding only lock_sys.wait_mutex. */
|
|
|
|
void deadlock_check();
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
|
2021-02-28 13:46:16 +02:00
|
|
|
/** Cancel a waiting lock request.
|
2021-10-22 12:38:45 +03:00
|
|
|
@tparam check_victim whether to check for DB_DEADLOCK
|
|
|
|
@param lock waiting lock request
|
|
|
|
@param trx active transaction
|
2021-03-03 13:49:49 +02:00
|
|
|
@retval DB_SUCCESS if no lock existed
|
|
|
|
@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
|
|
|
|
@retval DB_LOCK_WAIT if the lock was canceled */
|
2021-10-22 12:38:45 +03:00
|
|
|
template<bool check_victim>
|
|
|
|
static dberr_t cancel(trx_t *trx, lock_t *lock);
|
2021-02-28 13:46:16 +02:00
|
|
|
/** Cancel a waiting lock request (if any) when killing a transaction */
|
|
|
|
static void cancel(trx_t *trx);
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
/** Note that a record lock wait started */
|
|
|
|
inline void wait_start();
|
|
|
|
|
|
|
|
/** Note that a record lock wait resumed */
|
|
|
|
inline void wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now);
|
|
|
|
|
|
|
|
/** @return pending number of lock waits */
|
2021-03-09 08:58:28 +02:00
|
|
|
ulint get_wait_pending() const
|
|
|
|
{
|
|
|
|
return static_cast<ulint>(wait_count & (WAIT_COUNT_STEP - 1));
|
|
|
|
}
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
/** @return cumulative number of lock waits */
|
2021-03-09 08:58:28 +02:00
|
|
|
ulint get_wait_cumulative() const
|
|
|
|
{ return static_cast<ulint>(wait_count / WAIT_COUNT_STEP); }
|
MDEV-24671: Replace lock_wait_timeout_task with mysql_cond_timedwait()
lock_wait(): Replaces lock_wait_suspend_thread(). Wait for the lock to
be granted or the transaction to be killed using mysql_cond_timedwait()
or mysql_cond_wait().
lock_wait_end(): Replaces que_thr_end_lock_wait() and
lock_wait_release_thread_if_suspended().
lock_wait_timeout_task: Remove. The operating system kernel will
resume the mysql_cond_timedwait() in lock_wait(). An added benefit
is that innodb_lock_wait_timeout no longer has a 'jitter' of 1 second,
which was caused by this wake-up task waking up only once per second,
and then waking up any threads for which the timeout (which was only
measured in seconds) was exceeded.
innobase_kill_query(): Set trx->error_state=DB_INTERRUPTED,
so that a call trx_is_interrupted(trx) in lock_wait() can be avoided.
We will protect things more consistently with lock_sys.wait_mutex,
which will be moved below lock_sys.mutex in the latching order.
trx_lock_t::cond: Condition variable for !wait_lock, used with
lock_sys.wait_mutex.
srv_slot_t: Remove. Replaced by trx_lock_t::cond,
lock_grant_after_reset(): Merged to to lock_grant().
lock_rec_get_index_name(): Remove.
lock_sys_t: Introduce wait_pending, wait_count, wait_time, wait_time_max
that are protected by wait_mutex.
trx_lock_t::que_state: Remove.
que_thr_state_t: Remove QUE_THR_COMMAND_WAIT, QUE_THR_LOCK_WAIT.
que_thr_t: Remove is_active, start_running(), stop_no_error().
que_fork_t::n_active_thrs, trx_lock_t::n_active_thrs: Remove.
2021-01-26 16:39:56 +02:00
|
|
|
/** Cumulative wait time; protected by wait_mutex */
|
|
|
|
ulint get_wait_time_cumulative() const { return wait_time; }
|
|
|
|
/** Longest wait time; protected by wait_mutex */
|
|
|
|
ulint get_wait_time_max() const { return wait_time_max; }
|
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Get the lock hash table for a mode */
|
|
|
|
hash_table &hash_get(ulint mode)
|
|
|
|
{
|
|
|
|
if (UNIV_LIKELY(!(mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))))
|
|
|
|
return rec_hash;
|
|
|
|
return (mode & LOCK_PREDICATE) ? prdt_hash : prdt_page_hash;
|
|
|
|
}
|
2020-09-11 15:55:30 +03:00
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Get the lock hash table for predicate a mode */
|
|
|
|
hash_table &prdt_hash_get(bool page)
|
|
|
|
{ return page ? prdt_page_hash : prdt_hash; }
|
|
|
|
|
2021-02-22 18:32:51 +02:00
|
|
|
/** Get the first lock on a page.
|
|
|
|
@param cell hash table cell
|
|
|
|
@param id page number
|
|
|
|
@return first lock
|
|
|
|
@retval nullptr if none exists */
|
|
|
|
static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id);
|
2020-09-11 15:55:30 +03:00
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Get the first explicit lock request on a record.
|
2021-02-22 18:32:51 +02:00
|
|
|
@param cell first lock hash table cell
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
@param id page identifier
|
|
|
|
@param heap_no record identifier in page
|
2020-09-11 15:55:30 +03:00
|
|
|
@return first lock
|
|
|
|
@retval nullptr if none exists */
|
2021-02-22 18:32:51 +02:00
|
|
|
static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id,
|
|
|
|
ulint heap_no);
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
|
|
|
|
/** Remove locks on a discarded SPATIAL INDEX page.
|
|
|
|
@param id page to be discarded
|
|
|
|
@param page whether to discard also from lock_sys.prdt_hash */
|
|
|
|
void prdt_page_free_from_discard(const page_id_t id, bool all= false);
|
MDEV-24966 Galera multi-master regression
After the merging of MDEV-24915, 10.6 branch has regressions with handling of
concurrent write load against two or more cluster nodes. These regressions may
surface as cluster hanging, node crashes or data inconsistency. With some test
scenarios, the only visible symptom could be that the BF victim aborting happens
only by innodb lock wait timeout expiration. This would result only to poor
performance (by default 50 sec hang for each BF conflict), and could be somewhat
difficult to diagnose.
This pull request has following fixes to handle concurrent write load from
multiple nodes:
In lock_wait_wsrep_kill(), the victim trx was expected to be only in
TRX_STATE_ACTIVE state. With the delayed BF conflict handling, it can happen
that victim has advanced into pre commit state. This was fixed by choosing
victim both in TRX_STATE_ACTIVE and TRX_STATE_PREPARED states.
Victim transaction may be in several different states at the time of detected
lock conflict, and due to delayed BF aborting practice in MDEV-24915, the victim
may advance further before the actual BF aborting takes place. The BF aborting
in MDEV-24915 did not wake the victim, if it was in the state of waiting for
some other lock (than the one that was blocking the high priority thread).
This anomaly caused the innodb lock wait timeout expiration delays and poor
performance symptom. To fix this, lock_wait_wsrep_kill() now looks if
victim is in lock waiting state, and uses lock_cancel_waiting_and_release()
to cancel this lock wait.
wsrep_bf_abort() checks if the victim has active transaction (in wsrep-lib),
and starts a new transaction if there was no active transaction before.
Due to late BF aborting, the victim may have e.g. failed in certification
and is already aborting or has aborted at this stage. This has caused
problems in testing where BF aborter tries to BF abort himself.
The fix in wsrep_bf_abort() now skips the BF abort, if victim is aborting
or has aborted. Victim may not have started transaction yet in wsrep context,
but it may have acquired MDL locks (due to DDL execution), and this has
caused BF conflict. Such case does not require aborting in wsrep or
replication provider state.
BF aborting could cause BF-BF conflict scenario, if victim was already aborted
and changed to replayer having high priority as well. This BF-BF conflict
scenario is now avoided in lock_wait_wsrep() where we now check if blocking
lock holder is also high priority and is ordered before, caller should wait
for the lock in this situation.
The natural innodb deadlock resolving algorithm could pick BF thread as
deadlock victim. This is fixed by giving max weigh to BF threads in
Deadlock::report().
MDEV-24341 has changed excution paths in do_command() and this affects BF
aborted victim execution. This PR fixes one assert in do_command():
DBUG_ASSERT(!thd->async_state.pending_ops())
Which fired if the thd was BF aborted earlier. This assert is now changed
to allow pending_ops() if thd was BF aborted before.
With these fixes, long term highly conflicting write load could be run against
to node cluster. If binlogging is configured, log_slave_updates should be
also set.
2021-02-28 19:33:30 +02:00
|
|
|
|
|
|
|
/** Cancel possible lock waiting for a transaction */
|
|
|
|
static void cancel_lock_wait_for_trx(trx_t *trx);
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
2021-02-04 16:38:07 +02:00
|
|
|
/** The lock system */
|
|
|
|
extern lock_sys_t lock_sys;
|
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** @return the index of an array element */
|
|
|
|
inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
|
|
|
|
{
|
|
|
|
ut_ad(lock_sys.is_writer() || lock_sys.readers);
|
|
|
|
return calc_hash(fold, n_cells);
|
|
|
|
}
|
|
|
|
|
2021-02-22 18:32:51 +02:00
|
|
|
/** Get a hash table cell. */
|
|
|
|
inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
{
|
|
|
|
ut_ad(lock_sys.is_writer() || lock_sys.readers);
|
2021-02-22 18:32:51 +02:00
|
|
|
return &array[calc_hash(fold)];
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Get the first lock on a page.
|
|
|
|
@param cell hash table cell
|
|
|
|
@param id page number
|
|
|
|
@return first lock
|
|
|
|
@retval nullptr if none exists */
|
|
|
|
inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id)
|
|
|
|
{
|
|
|
|
lock_sys.assert_locked(cell);
|
|
|
|
for (auto lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
|
|
|
|
{
|
|
|
|
ut_ad(!lock->is_table());
|
|
|
|
if (lock->un_member.rec_lock.page_id == id)
|
|
|
|
return lock;
|
|
|
|
}
|
|
|
|
return nullptr;
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
}
|
|
|
|
|
2021-02-22 18:32:51 +02:00
|
|
|
/** lock_sys.latch exclusive guard */
|
2021-02-04 16:38:07 +02:00
|
|
|
struct LockMutexGuard
|
|
|
|
{
|
2021-02-11 14:52:10 +02:00
|
|
|
LockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
|
|
|
|
{ lock_sys.wr_lock(SRW_LOCK_ARGS(file, line)); }
|
|
|
|
~LockMutexGuard() { lock_sys.wr_unlock(); }
|
2021-02-04 16:38:07 +02:00
|
|
|
};
|
|
|
|
|
2021-02-22 18:32:51 +02:00
|
|
|
/** lock_sys latch guard for 1 page_id_t */
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
struct LockGuard
|
|
|
|
{
|
|
|
|
LockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
|
|
|
|
~LockGuard()
|
|
|
|
{
|
2021-02-22 18:32:51 +02:00
|
|
|
lock_sys_t::hash_table::latch(cell_)->release();
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
|
|
|
|
lock_sys.rd_unlock();
|
|
|
|
}
|
2021-02-22 18:32:51 +02:00
|
|
|
/** @return the hash array cell */
|
|
|
|
hash_cell_t &cell() const { return *cell_; }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
private:
|
2021-02-22 18:32:51 +02:00
|
|
|
/** The hash array cell */
|
|
|
|
hash_cell_t *cell_;
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
};
|
|
|
|
|
2021-02-22 18:32:51 +02:00
|
|
|
/** lock_sys latch guard for 2 page_id_t */
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
struct LockMultiGuard
|
|
|
|
{
|
|
|
|
LockMultiGuard(lock_sys_t::hash_table &hash,
|
|
|
|
const page_id_t id1, const page_id_t id2);
|
|
|
|
~LockMultiGuard();
|
2021-02-22 18:32:51 +02:00
|
|
|
|
|
|
|
/** @return the first hash array cell */
|
|
|
|
hash_cell_t &cell1() const { return *cell1_; }
|
|
|
|
/** @return the second hash array cell */
|
|
|
|
hash_cell_t &cell2() const { return *cell2_; }
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
private:
|
2021-02-22 18:32:51 +02:00
|
|
|
/** The first hash array cell */
|
|
|
|
hash_cell_t *cell1_;
|
|
|
|
/** The second hash array cell */
|
|
|
|
hash_cell_t *cell2_;
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
};
|
|
|
|
|
2021-10-22 12:38:45 +03:00
|
|
|
/** lock_sys.latch exclusive guard using transactional memory */
|
|
|
|
struct TMLockMutexGuard
|
|
|
|
{
|
|
|
|
TRANSACTIONAL_INLINE
|
|
|
|
TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
|
|
|
|
{
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
if (xbegin())
|
|
|
|
{
|
|
|
|
if (was_elided())
|
|
|
|
return;
|
|
|
|
xabort();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
|
|
|
|
}
|
|
|
|
TRANSACTIONAL_INLINE
|
|
|
|
~TMLockMutexGuard()
|
|
|
|
{
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
if (was_elided()) xend(); else
|
|
|
|
#endif
|
|
|
|
lock_sys.wr_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
bool was_elided() const noexcept
|
|
|
|
{ return !lock_sys.latch.is_locked_or_waiting(); }
|
|
|
|
#else
|
|
|
|
bool was_elided() const noexcept { return false; }
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
/** lock_sys latch guard for 1 page_id_t, using transactional memory */
|
|
|
|
struct TMLockGuard
|
|
|
|
{
|
|
|
|
TRANSACTIONAL_TARGET
|
|
|
|
TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
|
|
|
|
TRANSACTIONAL_INLINE ~TMLockGuard()
|
|
|
|
{
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
if (elided)
|
|
|
|
{
|
|
|
|
xend();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
lock_sys_t::hash_table::latch(cell_)->release();
|
|
|
|
/* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
|
|
|
|
lock_sys.rd_unlock();
|
|
|
|
}
|
|
|
|
/** @return the hash array cell */
|
|
|
|
hash_cell_t &cell() const { return *cell_; }
|
|
|
|
private:
|
|
|
|
/** The hash array cell */
|
|
|
|
hash_cell_t *cell_;
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
/** whether the latches were elided */
|
|
|
|
bool elided;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
/** guard for shared lock_sys.latch and trx_t::mutex using
|
|
|
|
transactional memory */
|
|
|
|
struct TMLockTrxGuard
|
|
|
|
{
|
|
|
|
trx_t &trx;
|
|
|
|
|
|
|
|
TRANSACTIONAL_INLINE
|
|
|
|
#ifndef UNIV_PFS_RWLOCK
|
|
|
|
TMLockTrxGuard(trx_t &trx) : trx(trx)
|
|
|
|
# define TMLockTrxArgs(trx) trx
|
|
|
|
#else
|
|
|
|
TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
|
|
|
|
# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
if (xbegin())
|
|
|
|
{
|
|
|
|
if (!lock_sys.latch.is_write_locked() && was_elided())
|
|
|
|
return;
|
|
|
|
xabort();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
|
|
|
|
trx.mutex_lock();
|
|
|
|
}
|
|
|
|
TRANSACTIONAL_INLINE
|
|
|
|
~TMLockTrxGuard()
|
|
|
|
{
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
if (was_elided())
|
|
|
|
{
|
|
|
|
xend();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
lock_sys.rd_unlock();
|
|
|
|
trx.mutex_unlock();
|
|
|
|
}
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
|
|
|
|
#else
|
|
|
|
bool was_elided() const noexcept { return false; }
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
/** guard for trx_t::mutex using transactional memory */
|
|
|
|
struct TMTrxGuard
|
|
|
|
{
|
|
|
|
trx_t &trx;
|
|
|
|
|
|
|
|
TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
|
|
|
|
{
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
if (xbegin())
|
|
|
|
{
|
|
|
|
if (was_elided())
|
|
|
|
return;
|
|
|
|
xabort();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
trx.mutex_lock();
|
|
|
|
}
|
|
|
|
TRANSACTIONAL_INLINE ~TMTrxGuard()
|
|
|
|
{
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
if (was_elided())
|
|
|
|
{
|
|
|
|
xend();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
trx.mutex_unlock();
|
|
|
|
}
|
|
|
|
#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
|
|
|
|
bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
|
|
|
|
#else
|
|
|
|
bool was_elided() const noexcept { return false; }
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2018-03-11 23:34:23 +02:00
|
|
|
/*********************************************************************//**
|
|
|
|
Creates a new record lock and inserts it to the lock queue. Does NOT check
|
|
|
|
for deadlocks or lock compatibility!
|
|
|
|
@return created lock */
|
|
|
|
UNIV_INLINE
|
|
|
|
lock_t*
|
|
|
|
lock_rec_create(
|
|
|
|
/*============*/
|
|
|
|
lock_t* c_lock, /*!< conflicting lock */
|
2021-01-26 16:28:02 +02:00
|
|
|
unsigned type_mode,/*!< in: lock mode and wait flag */
|
2018-03-11 23:34:23 +02:00
|
|
|
const buf_block_t* block, /*!< in: buffer block containing
|
|
|
|
the record */
|
|
|
|
ulint heap_no,/*!< in: heap number of the record */
|
|
|
|
dict_index_t* index, /*!< in: index of record */
|
|
|
|
trx_t* trx, /*!< in,out: transaction */
|
|
|
|
bool caller_owns_trx_mutex);
|
|
|
|
/*!< in: true if caller owns
|
|
|
|
trx mutex */
|
|
|
|
|
MDEV-20612: Partition lock_sys.latch
We replace the old lock_sys.mutex (which was renamed to lock_sys.latch)
with a combination of a global lock_sys.latch and table or page hash lock
mutexes.
The global lock_sys.latch can be acquired in exclusive mode, or
it can be acquired in shared mode and another mutex will be acquired
to protect the locks for a particular page or a table.
This is inspired by
mysql/mysql-server@1d259b87a63defa814e19a7534380cb43ee23c48
but the optimization of lock_release() will be done in the next commit.
Also, we will interleave mutexes with the hash table elements, similar
to how buf_pool.page_hash was optimized
in commit 5155a300fab85e97217c75e3ba3c3ce78082dd8a (MDEV-22871).
dict_table_t::autoinc_trx: Use Atomic_relaxed.
dict_table_t::autoinc_mutex: Use srw_mutex in order to reduce the
memory footprint. On 64-bit Linux or OpenBSD, both this and the new
dict_table_t::lock_mutex should be 32 bits and be stored in the same
64-bit word. On Microsoft Windows, the underlying SRWLOCK is 32 or 64
bits, and on other systems, sizeof(pthread_mutex_t) can be much larger.
ib_lock_t::trx_locks, trx_lock_t::trx_locks: Document the new rules.
Writers must assert lock_sys.is_writer() || trx->mutex_is_owner().
LockGuard: A RAII wrapper for acquiring a page hash table lock.
LockGGuard: Like LockGuard, but when Galera Write-Set Replication
is enabled, we must acquire all shards, for updating arbitrary trx_locks.
LockMultiGuard: A RAII wrapper for acquiring two page hash table locks.
lock_rec_create_wsrep(), lock_table_create_wsrep(): Special
Galera conflict resolution in non-inlined functions in order
to keep the common code paths shorter.
lock_sys_t::prdt_page_free_from_discard(): Refactored from
lock_prdt_page_free_from_discard() and
lock_rec_free_all_from_discard_page().
trx_t::commit_tables(): Replaces trx_update_mod_tables_timestamp().
lock_release(): Let trx_t::commit_tables() invalidate the query cache
for those tables that were actually modified by the transaction.
Merge lock_check_dict_lock() to lock_release().
We must never release lock_sys.latch while holding any
lock_sys_t::hash_latch. Failure to do that could lead to
memory corruption if the buffer pool is resized between
the time lock_sys.latch is released and the hash_latch is released.
2021-02-12 17:35:42 +02:00
|
|
|
/** Remove a record lock request, waiting or granted, on a discarded page
|
|
|
|
@param hash hash table
|
|
|
|
@param in_lock lock object */
|
|
|
|
void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2018-03-11 23:34:23 +02:00
|
|
|
/** Create a new record lock and inserts it to the lock queue,
|
|
|
|
without checking for deadlocks or conflicts.
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
@param[in] c_lock conflicting lock, or NULL
|
2021-01-26 16:28:02 +02:00
|
|
|
@param[in] type_mode lock mode and wait flag
|
2020-09-11 15:55:30 +03:00
|
|
|
@param[in] page_id index page number
|
2018-03-11 23:34:23 +02:00
|
|
|
@param[in] page R-tree index page, or NULL
|
|
|
|
@param[in] heap_no record heap number in the index page
|
|
|
|
@param[in] index the index tree
|
|
|
|
@param[in,out] trx transaction
|
|
|
|
@param[in] holds_trx_mutex whether the caller holds trx->mutex
|
|
|
|
@return created lock */
|
|
|
|
lock_t*
|
|
|
|
lock_rec_create_low(
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
lock_t* c_lock,
|
2020-03-10 20:05:17 +02:00
|
|
|
unsigned type_mode,
|
2020-09-11 15:55:30 +03:00
|
|
|
const page_id_t page_id,
|
2018-03-11 23:34:23 +02:00
|
|
|
const page_t* page,
|
|
|
|
ulint heap_no,
|
|
|
|
dict_index_t* index,
|
|
|
|
trx_t* trx,
|
2022-03-07 13:03:53 +03:00
|
|
|
bool holds_trx_mutex);
|
2021-02-05 18:29:30 +02:00
|
|
|
|
2018-03-11 23:34:23 +02:00
|
|
|
/** Enqueue a waiting request for a lock which cannot be granted immediately.
|
|
|
|
Check for deadlocks.
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
@param[in] c_lock conflicting lock
|
2018-03-11 23:34:23 +02:00
|
|
|
@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
|
|
|
|
possibly ORed with LOCK_GAP or
|
|
|
|
LOCK_REC_NOT_GAP, ORed with
|
|
|
|
LOCK_INSERT_INTENTION if this
|
|
|
|
waiting lock request is set
|
|
|
|
when performing an insert of
|
|
|
|
an index record
|
2021-02-05 18:29:30 +02:00
|
|
|
@param[in] id page identifier
|
|
|
|
@param[in] page leaf page in the index
|
2018-03-11 23:34:23 +02:00
|
|
|
@param[in] heap_no record heap number in the block
|
|
|
|
@param[in] index index tree
|
|
|
|
@param[in,out] thr query thread
|
|
|
|
@param[in] prdt minimum bounding box (spatial index)
|
|
|
|
@retval DB_LOCK_WAIT if the waiting lock was enqueued
|
2021-02-04 16:38:07 +02:00
|
|
|
@retval DB_DEADLOCK if this transaction was chosen as the victim */
|
2018-03-11 23:34:23 +02:00
|
|
|
dberr_t
|
|
|
|
lock_rec_enqueue_waiting(
|
MDEV-24738 Improve the InnoDB deadlock checker
A new configuration parameter innodb_deadlock_report is introduced:
* innodb_deadlock_report=off: Do not report any details of deadlocks.
* innodb_deadlock_report=basic: Report transactions and waiting locks.
* innodb_deadlock_report=full (default): Report also the blocking locks.
The improved deadlock checker will consider all involved transactions
in one loop, even if the deadlock loop includes several transactions.
The theoretical maximum number of transactions that can be involved in
a deadlock is `innodb_page_size` * 8, limited by the persistent data
structures.
Note: Similar to
mysql/mysql-server@3859219875b62154b921e8c6078c751198071b9c
our deadlock checker will consider at most one blocking transaction
for each waiting transaction. The new field trx->lock.wait_trx be
nullptr if and only if trx->lock.wait_lock is nullptr. Note that
trx->lock.wait_lock->trx == trx (the waiting transaction), while
trx->lock.wait_trx points to one of the transactions whose lock is
conflicting with trx->lock.wait_lock.
Considering only one blocking transaction will greatly simplify
our deadlock checker, but it may also make the deadlock checker
blind to some deadlocks where the deadlock cycle is 'hidden' by
the fact that the registered trx->lock.wait_trx is not actually
waiting for any InnoDB lock, but something else. So, instead of
deadlocks, sometimes lock wait timeout may be reported.
To improve on this, whenever trx->lock.wait_trx is changed, we
will register further 'candidate' transactions in Deadlock::to_check(),
and check for 'revealed' deadlocks as soon as possible, in lock_release()
and innobase_kill_query().
The old DeadlockChecker was holding lock_sys.latch, even though using
lock_sys.wait_mutex should be less contended (and thus preferred)
in the likely case that no deadlock is present.
lock_wait(): Defer the deadlock check to this function, instead of
executing it in lock_rec_enqueue_waiting(), lock_table_enqueue_waiting().
DeadlockChecker: Complete rewrite:
(1) Explicitly keep track of transactions that are being waited for,
in trx->lock.wait_trx, protected by lock_sys.wait_mutex. Previously,
we were painstakingly traversing the lock heaps while blocking
concurrent registration or removal of any locks (even uncontended ones).
(2) Use Brent's cycle-detection algorithm for deadlock detection,
traversing each trx->lock.wait_trx edge at most 2 times.
(3) If a deadlock is detected, release lock_sys.wait_mutex,
acquire LockMutexGuard, re-acquire lock_sys.wait_mutex and re-invoke
find_cycle() to find out whether the deadlock is still present.
(4) Display information on all transactions that are involved in the
deadlock, and choose a victim to be rolled back.
lock_sys.deadlocks: Replaces lock_deadlock_found. Protected by wait_mutex.
Deadlock::find_cycle(): Quickly find a cycle of trx->lock.wait_trx...
using Brent's cycle detection algorithm.
Deadlock::report(): Report a deadlock cycle that was found by
Deadlock::find_cycle(), and choose a victim with the least weight.
Altogether, we may traverse each trx->lock.wait_trx edge up to 5
times (2*find_cycle()+1 time for reporting and choosing the victim).
Deadlock::check_and_resolve(): Find and resolve a deadlock.
lock_wait_rpl_report(): Report the waits-for information to
replication. This used to be executed as part of DeadlockChecker.
Replication must know the waits-for relations even if no deadlocks
are present in InnoDB.
Reviewed by: Vladislav Vaintroub
2021-02-17 12:43:33 +02:00
|
|
|
lock_t* c_lock,
|
2020-03-10 20:05:17 +02:00
|
|
|
unsigned type_mode,
|
2021-02-05 18:29:30 +02:00
|
|
|
const page_id_t id,
|
|
|
|
const page_t* page,
|
2018-03-11 23:34:23 +02:00
|
|
|
ulint heap_no,
|
|
|
|
dict_index_t* index,
|
|
|
|
que_thr_t* thr,
|
|
|
|
lock_prdt_t* prdt);
|
2016-08-12 11:17:45 +03:00
|
|
|
/*************************************************************//**
|
|
|
|
Moves the explicit locks on user records to another page if a record
|
|
|
|
list start is moved to another page. */
|
|
|
|
void
|
|
|
|
lock_rtr_move_rec_list(
|
|
|
|
/*===================*/
|
|
|
|
const buf_block_t* new_block, /*!< in: index page to
|
|
|
|
move to */
|
|
|
|
const buf_block_t* block, /*!< in: index page */
|
|
|
|
rtr_rec_move_t* rec_move, /*!< in: recording records
|
|
|
|
moved */
|
|
|
|
ulint num_move); /*!< in: num of rec to move */
|
|
|
|
|
2022-01-13 17:27:28 +01:00
|
|
|
#include "lock0lock.inl"
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
#endif
|