From be8113861c2d12bb0f7048523f886cab53326d8e Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Tue, 11 Jan 2022 18:19:39 +0300 Subject: [PATCH] MDEV-27025 insert-intention lock conflicts with waiting ORDINARY lock The code was backported from 10.6 bd03c0e51629e1c3969a171137712a6bb854c232 commit. See that commit message for details. Apart from the above commit trx_lock_t::wait_trx was also backported from MDEV-24738. trx_lock_t::wait_trx is protected with lock_sys.wait_mutex in 10.6, but that mutex was implemented only in MDEV-24789. As there is no need to backport MDEV-24789 for MDEV-27025, trx_lock_t::wait_trx is protected with the same mutexes as trx_lock_t::wait_lock. This fix should not break innodb-lock-schedule-algorithm=VATS. This algorithm uses an Eldest-Transaction-First (ETF) heuristic, which prefers older transactions over new ones. In this fix we just insert granted lock just before the last granted lock of the same transaction, what does not change transactions execution order. The changes in lock_rec_create_low() should not break Galera Cluster, there is a big "if" branch for WSREP. This branch is necessary to provide the correct transactions execution order, and should not be changed for the current bug fix. --- .../suite/innodb/r/lock_wait_conflict.result | 27 ++ .../suite/innodb/t/lock_wait_conflict.test | 60 ++++ mysql-test/suite/versioning/r/update.result | 1 - mysql-test/suite/versioning/t/update.test | 4 +- storage/innobase/include/hash0hash.h | 27 +- storage/innobase/include/lock0lock.h | 50 +-- storage/innobase/include/lock0lock.ic | 41 +-- storage/innobase/include/lock0priv.h | 29 +- storage/innobase/include/lock0priv.ic | 15 +- storage/innobase/include/trx0trx.h | 4 +- storage/innobase/lock/lock0lock.cc | 320 +++++++++++------- storage/innobase/lock/lock0prdt.cc | 29 +- 12 files changed, 408 insertions(+), 199 deletions(-) create mode 100644 mysql-test/suite/innodb/r/lock_wait_conflict.result create mode 100644 mysql-test/suite/innodb/t/lock_wait_conflict.test diff --git a/mysql-test/suite/innodb/r/lock_wait_conflict.result b/mysql-test/suite/innodb/r/lock_wait_conflict.result new file mode 100644 index 00000000000..25d18c03ea1 --- /dev/null +++ b/mysql-test/suite/innodb/r/lock_wait_conflict.result @@ -0,0 +1,27 @@ +# +# MDEV-27025 insert-intention lock conflicts with waiting ORDINARY lock +# +CREATE TABLE t (a INT PRIMARY KEY, b INT NOT NULL UNIQUE) ENGINE=InnoDB; +connect prevent_purge,localhost,root,,; +start transaction with consistent snapshot; +connection default; +INSERT INTO t VALUES (20,20); +DELETE FROM t WHERE b = 20; +connect con_ins,localhost,root,,; +SET DEBUG_SYNC = 'row_ins_sec_index_entry_dup_locks_created SIGNAL ins_set_locks WAIT_FOR ins_cont'; +INSERT INTO t VALUES(10, 20); +connect con_del,localhost,root,,; +SET DEBUG_SYNC = 'now WAIT_FOR ins_set_locks'; +SET DEBUG_SYNC = 'lock_wait_suspend_thread_enter SIGNAL del_locked'; +DELETE FROM t WHERE b = 20; +connection default; +SET DEBUG_SYNC = 'now WAIT_FOR del_locked'; +SET DEBUG_SYNC = 'now SIGNAL ins_cont'; +connection con_ins; +disconnect con_ins; +connection con_del; +disconnect con_del; +disconnect prevent_purge; +connection default; +SET DEBUG_SYNC = 'RESET'; +DROP TABLE t; diff --git a/mysql-test/suite/innodb/t/lock_wait_conflict.test b/mysql-test/suite/innodb/t/lock_wait_conflict.test new file mode 100644 index 00000000000..46a29e14b43 --- /dev/null +++ b/mysql-test/suite/innodb/t/lock_wait_conflict.test @@ -0,0 +1,60 @@ +--source include/have_innodb.inc +--source include/count_sessions.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +--echo # +--echo # MDEV-27025 insert-intention lock conflicts with waiting ORDINARY lock +--echo # + +# The test checks the ability to acquire exclusive record lock if the acquiring +# transaction already holds a shared lock on the record and another transaction +# is waiting for a lock. + +CREATE TABLE t (a INT PRIMARY KEY, b INT NOT NULL UNIQUE) ENGINE=InnoDB; + +--connect(prevent_purge,localhost,root,,) +start transaction with consistent snapshot; + +--connection default +INSERT INTO t VALUES (20,20); +DELETE FROM t WHERE b = 20; + +--connect(con_ins,localhost,root,,) +SET DEBUG_SYNC = 'row_ins_sec_index_entry_dup_locks_created SIGNAL ins_set_locks WAIT_FOR ins_cont'; +send +INSERT INTO t VALUES(10, 20); + +--connect(con_del,localhost,root,,) +SET DEBUG_SYNC = 'now WAIT_FOR ins_set_locks'; +SET DEBUG_SYNC = 'lock_wait_suspend_thread_enter SIGNAL del_locked'; +############################################################################### +# This DELETE creates waiting ORDINARY X-lock for heap_no 2 as the record is +# delete-marked, this lock conflicts with ORDINARY S-lock set by the the last +# INSERT. After the last INSERT creates insert-intention lock on +# heap_no 2, this lock will conflict with waiting ORDINARY X-lock of this +# DELETE, what causes DEADLOCK error for this DELETE. +############################################################################### +send +DELETE FROM t WHERE b = 20; + +--connection default +SET DEBUG_SYNC = 'now WAIT_FOR del_locked'; +SET DEBUG_SYNC = 'now SIGNAL ins_cont'; + +--connection con_ins +--reap +--disconnect con_ins + +--connection con_del +# Without the fix, ER_LOCK_DEADLOCK would be reported here. +--reap +--disconnect con_del + +--disconnect prevent_purge + +--connection default + +SET DEBUG_SYNC = 'RESET'; +DROP TABLE t; +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/versioning/r/update.result b/mysql-test/suite/versioning/r/update.result index da893432749..18872928174 100644 --- a/mysql-test/suite/versioning/r/update.result +++ b/mysql-test/suite/versioning/r/update.result @@ -283,7 +283,6 @@ connection default; update t1 set b = 'foo'; connection con1; update t1 set a = 'bar'; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction disconnect con1; connection default; drop table t1; diff --git a/mysql-test/suite/versioning/t/update.test b/mysql-test/suite/versioning/t/update.test index 47a56a71bd3..5e3c5581abf 100644 --- a/mysql-test/suite/versioning/t/update.test +++ b/mysql-test/suite/versioning/t/update.test @@ -186,7 +186,9 @@ send update t1 set b = 'foo'; connection con1; let $wait_condition= select count(*) from information_schema.innodb_lock_waits; source include/wait_condition.inc; -error ER_LOCK_DEADLOCK; +# There must no be DEADLOCK here as con1 transaction already holds locks, and +# default's transaction lock is waiting, so the locks of the following "UPDATE" +# must not conflict with waiting lock. update t1 set a = 'bar'; disconnect con1; connection default; diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h index 981ff5a0814..57c9d42a0e2 100644 --- a/storage/innobase/include/hash0hash.h +++ b/storage/innobase/include/hash0hash.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, 2020, MariaDB Corporation. +Copyright (c) 2018, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,8 +28,29 @@ Created 5/20/1997 Heikki Tuuri #include "ut0rnd.h" struct hash_table_t; -struct hash_cell_t{ - void* node; /*!< hash chain node, NULL if none */ +struct hash_cell_t +{ + /** singly-linked, nullptr terminated list of hash buckets */ + void *node; + + /** Insert an element after another. + @tparam T type of the element + @param after the element after which to insert + @param insert the being-inserted element + @param next the next-element pointer in T */ + template + void insert_after(T &after, T &insert, T *T::*next) + { +#ifdef UNIV_DEBUG + for (const T *c= static_cast(node); c; c= c->*next) + if (c == &after) + goto found; + ut_error; + found: +#endif + insert.*next= after.*next; + after.*next= &insert; + } }; typedef void* hash_node_t; diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 225c246f4e7..4f19575d3fe 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -832,26 +832,29 @@ public: /*********************************************************************//** Creates a new record lock and inserts it to the lock queue. Does NOT check for deadlocks or lock compatibility! +@param[in] c_lock conflicting lock +@param[in] thr thread owning trx +@param[in] type_mode lock mode and wait flag, type is ignored and replaced by +LOCK_REC +@param[in] block buffer block containing the record +@param[in] heap_no heap number of the record +@param[in] index index of record +@param[in,out] trx transaction +@param[in] caller_owns_trx_mutex TRUE if caller owns trx mutex +@param[in] insert_before_waiting if true, inserts new B-tree record lock +just after the last non-waiting lock of the current transaction which is +located before the first waiting for the current transaction lock, otherwise +the lock is inserted at the end of the queue @return created lock */ UNIV_INLINE -lock_t* -lock_rec_create( -/*============*/ +lock_t *lock_rec_create(lock_t *c_lock, #ifdef WITH_WSREP - lock_t* c_lock, /*!< conflicting lock */ - que_thr_t* thr, /*!< thread owning trx */ + que_thr_t *thr, #endif - unsigned type_mode,/*!< in: lock mode and wait - flag, type is ignored and - replaced by LOCK_REC */ - const buf_block_t* block, /*!< in: buffer block containing - the record */ - ulint heap_no,/*!< in: heap number of the record */ - dict_index_t* index, /*!< in: index of record */ - trx_t* trx, /*!< in,out: transaction */ - bool caller_owns_trx_mutex); - /*!< in: true if caller owns - trx mutex */ + unsigned type_mode, const buf_block_t *block, + ulint heap_no, dict_index_t *index, trx_t *trx, + bool caller_owns_trx_mutex, + bool insert_before_waiting= false); /*************************************************************//** Removes a record lock request, waiting or granted, from the queue. */ @@ -864,6 +867,7 @@ lock_rec_discard( /** Create a new record lock and inserts it to the lock queue, without checking for deadlocks or conflicts. +@param[in] c_lock conflicting lock @param[in] type_mode lock mode and wait flag; type will be replaced with LOCK_REC @param[in] page_id index page number @@ -872,11 +876,15 @@ without checking for deadlocks or conflicts. @param[in] index the index tree @param[in,out] trx transaction @param[in] holds_trx_mutex whether the caller holds trx->mutex +@param[in] insert_before_waiting if true, inserts new B-tree record lock +just after the last non-waiting lock of the current transaction which is +located before the first waiting for the current transaction lock, otherwise +the lock is inserted at the end of the queue @return created lock */ lock_t* lock_rec_create_low( + lock_t* c_lock, #ifdef WITH_WSREP - lock_t* c_lock, /*!< conflicting lock */ que_thr_t* thr, /*!< thread owning trx */ #endif unsigned type_mode, @@ -885,7 +893,9 @@ lock_rec_create_low( ulint heap_no, dict_index_t* index, trx_t* trx, - bool holds_trx_mutex); + bool holds_trx_mutex, + bool insert_before_waiting = false); + /** Enqueue a waiting request for a lock which cannot be granted immediately. Check for deadlocks. @param[in] type_mode the requested lock mode (LOCK_S or LOCK_X) @@ -906,9 +916,7 @@ Check for deadlocks. (or it happened to commit) */ dberr_t lock_rec_enqueue_waiting( -#ifdef WITH_WSREP lock_t* c_lock, /*!< conflicting lock */ -#endif unsigned type_mode, const buf_block_t* block, ulint heap_no, diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic index 2d5b6ff37f1..2d35796e3c4 100644 --- a/storage/innobase/include/lock0lock.ic +++ b/storage/innobase/include/lock0lock.ic @@ -72,32 +72,35 @@ lock_hash_get( /*********************************************************************//** Creates a new record lock and inserts it to the lock queue. Does NOT check for deadlocks or lock compatibility! +@param[in] c_lock conflicting lock +@param[in] thr thread owning trx +@param[in] type_mode lock mode and wait flag, type is ignored and replaced by +LOCK_REC +@param[in] block buffer block containing the record +@param[in] heap_no heap number of the record +@param[in] index index of record +@param[in,out] trx transaction +@param[in] caller_owns_trx_mutex TRUE if caller owns trx mutex +@param[in] insert_before_waiting if true, inserts new B-tree record lock +just after the last non-waiting lock of the current transaction which is +located before the first waiting for the current transaction lock, otherwise +the lock is inserted at the end of the queue @return created lock */ UNIV_INLINE -lock_t* -lock_rec_create( -/*============*/ +lock_t *lock_rec_create(lock_t *c_lock, #ifdef WITH_WSREP - lock_t* c_lock, /*!< conflicting lock */ - que_thr_t* thr, /*!< thread owning trx */ + que_thr_t *thr, #endif - unsigned type_mode,/*!< in: lock mode and wait - flag, type is ignored and - replaced by LOCK_REC */ - const buf_block_t* block, /*!< in: buffer block containing - the record */ - ulint heap_no,/*!< in: heap number of the record */ - dict_index_t* index, /*!< in: index of record */ - trx_t* trx, /*!< in,out: transaction */ - bool caller_owns_trx_mutex) - /*!< in: TRUE if caller owns - trx mutex */ + unsigned type_mode, const buf_block_t *block, + ulint heap_no, dict_index_t *index, trx_t *trx, + bool caller_owns_trx_mutex, + bool insert_before_waiting) { btr_assert_not_corrupted(block, index); - return lock_rec_create_low( + return lock_rec_create_low(c_lock, #ifdef WITH_WSREP - c_lock, thr, + thr, #endif type_mode, block->page.id(), block->frame, heap_no, - index, trx, caller_owns_trx_mutex); + index, trx, caller_owns_trx_mutex, insert_before_waiting); } diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h index 1b2f9d0f0d3..17200138ada 100644 --- a/storage/innobase/include/lock0priv.h +++ b/storage/innobase/include/lock0priv.h @@ -562,14 +562,13 @@ lock_rec_get_next_const( /*********************************************************************//** Gets the first explicit lock request on a record. +@param[in] hash hash chain the lock on +@param[in] page_id page id +@param[in] heap_no heap number of the record @return first lock, NULL if none exists */ UNIV_INLINE -lock_t* -lock_rec_get_first( -/*===============*/ - hash_table_t* hash, /*!< in: hash chain the lock on */ - const buf_block_t* block, /*!< in: block containing the record */ - ulint heap_no);/*!< in: heap number of the record */ +lock_t *lock_rec_get_first(hash_table_t *hash, page_id_t page_id, + ulint heap_no); /*********************************************************************//** Gets the mode of a lock. @@ -623,15 +622,26 @@ lock_table_has( /** Set the wait status of a lock. @param[in,out] lock lock that will be waited for -@param[in,out] trx transaction that will wait for the lock */ -inline void lock_set_lock_and_trx_wait(lock_t* lock, trx_t* trx) +@param[in,out] trx transaction that will wait for the lock +@param[in] c_lock conflicting lock */ +inline void lock_set_lock_and_trx_wait(lock_t* lock, trx_t* trx, + const lock_t *c_lock) { ut_ad(lock); ut_ad(lock->trx == trx); - ut_ad(trx->lock.wait_lock == NULL); ut_ad(lock_mutex_own()); ut_ad(trx_mutex_own(trx)); + if (trx->lock.wait_trx) { + ut_ad(!c_lock || trx->lock.wait_trx == c_lock->trx); + ut_ad(trx->lock.wait_lock); + ut_ad((*trx->lock.wait_lock).trx == trx); + } else { + ut_ad(c_lock); + trx->lock.wait_trx = c_lock->trx; + ut_ad(!trx->lock.wait_lock); + } + trx->lock.wait_lock = lock; lock->type_mode |= LOCK_WAIT; } @@ -644,6 +654,7 @@ inline void lock_reset_lock_and_trx_wait(lock_t* lock) ut_ad(lock_mutex_own()); ut_ad(lock->trx->lock.wait_lock == NULL || lock->trx->lock.wait_lock == lock); + lock->trx->lock.wait_trx= nullptr; lock->trx->lock.wait_lock = NULL; lock->type_mode &= ~LOCK_WAIT; } diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic index e16949a4917..966c8a22028 100644 --- a/storage/innobase/include/lock0priv.ic +++ b/storage/innobase/include/lock0priv.ic @@ -156,16 +156,15 @@ lock_rec_get_next_const( /*********************************************************************//** Gets the first explicit lock request on a record. -@return first lock, NULL if none exists */ +@param[in] hash hash chain the lock on +@param[in] page_id page id +@param[in] heap_no heap number of the record +@return first lock, NULL if none exists */ UNIV_INLINE -lock_t* -lock_rec_get_first( -/*===============*/ - hash_table_t* hash, /*!< in: hash chain the lock on */ - const buf_block_t* block, /*!< in: block containing the record */ - ulint heap_no)/*!< in: heap number of the record */ +lock_t *lock_rec_get_first(hash_table_t *hash, page_id_t page_id, + ulint heap_no) { - for (lock_t *lock= lock_sys.get_first(*hash, block->page.id()); + for (lock_t *lock= lock_sys.get_first(*hash, page_id); lock; lock= lock_rec_get_next_on_page(lock)) if (lock_rec_get_nth_bit(lock, heap_no)) return lock; diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 09132e7cb98..00eb4c4c03a 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -426,7 +426,9 @@ struct trx_lock_t { trx_que_t que_state; /*!< valid when trx->state == TRX_STATE_ACTIVE: TRX_QUE_RUNNING, TRX_QUE_LOCK_WAIT, ... */ - + /** Transaction being waited for; protected by the same mutexes as + wait_lock */ + trx_t* wait_trx; lock_t* wait_lock; /*!< if trx execution state is TRX_QUE_LOCK_WAIT, this points to the lock request, otherwise this is diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 07fff9b21ad..3b3ec1f0824 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2021, MariaDB Corporation. +Copyright (c) 2014, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -970,7 +970,8 @@ lock_rec_has_expl( || (precise_mode & LOCK_MODE_MASK) == LOCK_X); ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); - for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no); + for (lock = lock_rec_get_first(&lock_sys.rec_hash, block->page.id(), + heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -1024,7 +1025,7 @@ lock_rec_other_has_expl_req( } for (lock_t* lock = lock_rec_get_first(&lock_sys.rec_hash, - block, heap_no); + block->page.id(), heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -1075,19 +1076,18 @@ static void wsrep_kill_victim(const trx_t * const trx, const lock_t *lock) /*********************************************************************//** Checks if some other transaction has a conflicting explicit lock request in the queue, so that we have to wait. +@param[in] mode LOCK_S or LOCK_X, possibly ORed to LOCK_GAP or LOC_REC_NOT_GAP, +LOCK_INSERT_INTENTION +@param[in] block buffer block containing the record +@param[in] heap_no heap number of the record +@param[in] trx our transaction +@param[out] was_ignored true if conflicting locks waiting for the current +transaction were ignored @return lock or NULL */ -static -lock_t* -lock_rec_other_has_conflicting( -/*===========================*/ - unsigned mode, /*!< in: LOCK_S or LOCK_X, - possibly ORed to LOCK_GAP or - LOC_REC_NOT_GAP, - LOCK_INSERT_INTENTION */ - const buf_block_t* block, /*!< in: buffer block containing - the record */ - ulint heap_no,/*!< in: heap number of the record */ - const trx_t* trx) /*!< in: our transaction */ +static lock_t *lock_rec_other_has_conflicting(unsigned mode, + const buf_block_t *block, + ulint heap_no, const trx_t *trx, + bool *was_ignored= nullptr) { lock_t* lock; @@ -1095,10 +1095,21 @@ lock_rec_other_has_conflicting( bool is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM); - for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no); + for (lock = lock_rec_get_first(&lock_sys.rec_hash, block->page.id(), + heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { + /* There can't be lock loops for one record, because + all waiting locks of the record will always wait for the same + lock of the record in a cell array, and check for + conflicting lock will always start with the first lock for the + heap_no, and go ahead with the same order(the order of the + locks in the cell array) */ + if (lock_get_wait(lock) && lock->trx->lock.wait_trx == trx) { + if (was_ignored) *was_ignored= true; + continue; + } if (lock_rec_has_to_wait(true, trx, mode, lock, is_supremum)) { #ifdef WITH_WSREP if (trx->is_wsrep()) { @@ -1255,6 +1266,7 @@ static void check_trx_state(const trx_t *trx) /** Create a new record lock and inserts it to the lock queue, without checking for deadlocks or conflicts. +@param[in] c_lock conflicting lock @param[in] type_mode lock mode and wait flag; type will be replaced with LOCK_REC @param[in] page_id index page number @@ -1263,11 +1275,15 @@ without checking for deadlocks or conflicts. @param[in] index the index tree @param[in,out] trx transaction @param[in] holds_trx_mutex whether the caller holds trx->mutex +@param[in] insert_before_waiting if true, inserts new B-tree record lock +just after the last non-waiting lock of the current transaction which is +located before the first waiting for the current transaction lock, otherwise +the lock is inserted at the end of the queue @return created lock */ lock_t* lock_rec_create_low( + lock_t* c_lock, #ifdef WITH_WSREP - lock_t* c_lock, /*!< conflicting lock */ que_thr_t* thr, /*!< thread owning trx */ #endif unsigned type_mode, @@ -1276,7 +1292,8 @@ lock_rec_create_low( ulint heap_no, dict_index_t* index, trx_t* trx, - bool holds_trx_mutex) + bool holds_trx_mutex, + bool insert_before_waiting) { lock_t* lock; ulint n_bits; @@ -1385,7 +1402,7 @@ lock_rec_create_low( } trx->lock.que_state = TRX_QUE_LOCK_WAIT; - lock_set_lock_and_trx_wait(lock, trx); + lock_set_lock_and_trx_wait(lock, trx, c_lock); UT_LIST_ADD_LAST(trx->lock.trx_locks, lock); trx->lock.wait_thr = thr; @@ -1413,15 +1430,46 @@ lock_rec_create_low( trx_mutex_exit(c_lock->trx); } else #endif /* WITH_WSREP */ - if (!(type_mode & (LOCK_WAIT | LOCK_PREDICATE | LOCK_PRDT_PAGE)) - && innodb_lock_schedule_algorithm - == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS - && !thd_is_replication_slave_thread(trx->mysql_thd)) { - HASH_PREPEND(lock_t, hash, &lock_sys.rec_hash, - page_id.fold(), lock); - } else { - HASH_INSERT(lock_t, hash, lock_hash_get(type_mode), - page_id.fold(), lock); + if (insert_before_waiting + && !(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))) { + /* Try to insert the lock just after the last non-waiting + lock of the current transaction which immediately + precedes the first waiting lock request. */ + hash_table_t *lock_hash = lock_hash_get(type_mode); + hash_cell_t& cell = lock_hash->array[lock_hash->calc_hash( + page_id.fold())]; + + lock_t* last_non_waiting = nullptr; + + for (lock_t* l = lock_rec_get_first(lock_hash, page_id, + heap_no); l; l = lock_rec_get_next(heap_no, l)) { + if (lock_get_wait(lock) + && l->trx->lock.wait_trx == trx) { + break; + } + if (l->trx == trx) { + last_non_waiting = l; + } + } + + if (!last_non_waiting) { + goto append_last; + } + + cell.insert_after(*last_non_waiting, *lock, &lock_t::hash); + } + else { +append_last: + if (!(type_mode & (LOCK_WAIT | LOCK_PREDICATE | LOCK_PRDT_PAGE)) + && innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS + && !thd_is_replication_slave_thread(trx->mysql_thd)) { + HASH_PREPEND(lock_t, hash, &lock_sys.rec_hash, + page_id.fold(), lock); + } else { + HASH_INSERT(lock_t, hash, lock_hash_get(type_mode), + page_id.fold(), lock); + } } if (!holds_trx_mutex) { @@ -1429,7 +1477,7 @@ lock_rec_create_low( } ut_ad(trx_mutex_own(trx)); if (type_mode & LOCK_WAIT) { - lock_set_lock_and_trx_wait(lock, trx); + lock_set_lock_and_trx_wait(lock, trx, c_lock); } UT_LIST_ADD_LAST(trx->lock.trx_locks, lock); if (!holds_trx_mutex) { @@ -1576,6 +1624,7 @@ lock_rec_insert_to_head( /** Enqueue a waiting request for a lock which cannot be granted immediately. Check for deadlocks. +@param[in] c_lock conflicting lock @param[in] type_mode the requested lock mode (LOCK_S or LOCK_X) possibly ORed with LOCK_GAP or LOCK_REC_NOT_GAP, ORed with @@ -1594,9 +1643,7 @@ Check for deadlocks. (or it happened to commit) */ dberr_t lock_rec_enqueue_waiting( -#ifdef WITH_WSREP - lock_t* c_lock, /*!< conflicting lock */ -#endif + lock_t* c_lock, unsigned type_mode, const buf_block_t* block, ulint heap_no, @@ -1634,9 +1681,9 @@ lock_rec_enqueue_waiting( /* Enqueue the lock request that will wait to be granted, note that we already own the trx mutex. */ - lock_t* lock = lock_rec_create( + lock_t* lock = lock_rec_create(c_lock, #ifdef WITH_WSREP - c_lock, thr, + thr, #endif type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE); @@ -1730,22 +1777,20 @@ on the record, and the request to be added is not a waiting request, we can reuse a suitable record lock object already existing on the same page, just setting the appropriate bit in its bitmap. This is a low-level function which does NOT check for deadlocks or lock compatibility! +@param[in] type_mode lock mode, wait, gap etc. flags; type is ignored and +replaced by LOCK_REC +@param[in] block buffer block containing the record +@param[in] heap_no heap number of the record +@param[in] index index of record +@param[in/out] trx transaction +@param[in] caller_owns_trx_mutex, TRUE if caller owns the transaction mutex +@param[in] insert_before_waiting true=insert B-tree record lock right before +a waiting lock request; false=insert the lock at the end of the queue @return lock where the bit was set */ -static -void -lock_rec_add_to_queue( -/*==================*/ - unsigned type_mode,/*!< in: lock mode, wait, gap - etc. flags; type is ignored - and replaced by LOCK_REC */ - const buf_block_t* block, /*!< in: buffer block containing - the record */ - ulint heap_no,/*!< in: heap number of the record */ - dict_index_t* index, /*!< in: index of record */ - trx_t* trx, /*!< in/out: transaction */ - bool caller_owns_trx_mutex) - /*!< in: TRUE if caller owns the - transaction mutex */ +static void lock_rec_add_to_queue(unsigned type_mode, const buf_block_t *block, + ulint heap_no, dict_index_t *index, + trx_t *trx, bool caller_owns_trx_mutex, + bool insert_before_waiting= false) { #ifdef UNIV_DEBUG ut_ad(lock_mutex_own()); @@ -1834,11 +1879,16 @@ lock_rec_add_to_queue( } } - lock_rec_create( + /* Note: We will not pass any conflicting lock to lock_rec_create(), + because we should be moving an existing waiting lock request. */ + ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx); + + lock_rec_create(NULL, #ifdef WITH_WSREP - NULL, NULL, + NULL, #endif - type_mode, block, heap_no, index, trx, caller_owns_trx_mutex); + type_mode, block, heap_no, index, trx, caller_owns_trx_mutex, + insert_before_waiting); } /*********************************************************************//** @@ -1896,28 +1946,23 @@ lock_rec_lock( /* Do nothing if the trx already has a strong enough lock on rec */ if (!lock_rec_has_expl(mode, block, heap_no, trx)) { - if ( -#ifdef WITH_WSREP - lock_t *c_lock= -#endif - lock_rec_other_has_conflicting(mode, block, heap_no, trx)) + bool was_ignored = false; + if (lock_t *c_lock= lock_rec_other_has_conflicting( + mode, block, heap_no, trx, &was_ignored)) { /* If another transaction has a non-gap conflicting request in the queue, as this transaction does not have a lock strong enough already granted on the record, we have to wait. */ - err = lock_rec_enqueue_waiting( -#ifdef WITH_WSREP - c_lock, -#endif /* WITH_WSREP */ - mode, block, heap_no, index, thr, NULL); + err = lock_rec_enqueue_waiting(c_lock, mode, block, heap_no, index, + thr, NULL); } else if (!impl) { /* Set the requested lock on the record. */ lock_rec_add_to_queue(LOCK_REC | mode, block, heap_no, index, trx, - true); + true, was_ignored); err= DB_SUCCESS_LOCKED_REC; } } @@ -1943,9 +1988,9 @@ lock_rec_lock( Note that we don't own the trx mutex. */ if (!impl) - lock_rec_create( + lock_rec_create(NULL, #ifdef WITH_WSREP - NULL, NULL, + NULL, #endif mode, block, heap_no, index, trx, false); @@ -2168,8 +2213,17 @@ static void lock_rec_dequeue_from_page(lock_t* in_lock) if (!lock_get_wait(lock)) { continue; } - const lock_t* c = lock_rec_has_to_wait_in_queue(lock); - if (!c) { + + ut_ad(lock->trx->lock.wait_trx); + ut_ad(lock->trx->lock.wait_lock); + + if (const lock_t* c = lock_rec_has_to_wait_in_queue( + lock)) { + trx_mutex_enter(lock->trx); + lock->trx->lock.wait_trx = c->trx; + trx_mutex_exit(lock->trx); + } + else { /* Grant the lock */ ut_ad(lock->trx != in_lock->trx); lock_grant(lock); @@ -2259,7 +2313,7 @@ lock_rec_reset_and_release_wait_low( ut_ad(lock_mutex_own()); - for (lock = lock_rec_get_first(hash, block, heap_no); + for (lock = lock_rec_get_first(hash, block->page.id(), heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -2321,7 +2375,8 @@ lock_rec_inherit_to_gap( DO want S-locks/X-locks(taken for replace) set by a consistency constraint to be inherited also then. */ - for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no); + for (lock = lock_rec_get_first(&lock_sys.rec_hash, block->page.id(), + heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -2357,7 +2412,8 @@ lock_rec_inherit_to_gap_if_gap_lock( lock_mutex_enter(); - for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no); + for (lock = lock_rec_get_first(&lock_sys.rec_hash, block->page.id(), + heap_no); lock != NULL; lock = lock_rec_get_next(heap_no, lock)) { @@ -2400,12 +2456,12 @@ lock_rec_move_low( /* If the lock is predicate lock, it resides on INFIMUM record */ ut_ad(lock_rec_get_first( - lock_hash, receiver, receiver_heap_no) == NULL + lock_hash, receiver->page.id(), receiver_heap_no) == NULL || lock_hash == &lock_sys.prdt_hash || lock_hash == &lock_sys.prdt_page_hash); for (lock = lock_rec_get_first(lock_hash, - donator, donator_heap_no); + donator->page.id(), donator_heap_no); lock != NULL; lock = lock_rec_get_next(donator_heap_no, lock)) { @@ -2414,7 +2470,8 @@ lock_rec_move_low( lock_rec_reset_nth_bit(lock, donator_heap_no); if (type_mode & LOCK_WAIT) { - lock_reset_lock_and_trx_wait(lock); + ut_ad(lock->trx->lock.wait_lock == lock); + lock->type_mode &= ~LOCK_WAIT; } /* Note that we FIRST reset the bit, and then set the lock: @@ -2426,7 +2483,7 @@ lock_rec_move_low( } ut_ad(!lock_rec_get_first(&lock_sys.rec_hash, - donator, donator_heap_no)); + donator->page.id(), donator_heap_no)); } /** Move all the granted locks to the front of the given lock list. @@ -2531,8 +2588,8 @@ lock_move_reorganize_page( lock_rec_bitmap_reset(lock); if (lock_get_wait(lock)) { - - lock_reset_lock_and_trx_wait(lock); + ut_ad(lock->trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; } lock = lock_rec_get_next_on_page(lock); @@ -2708,7 +2765,9 @@ lock_move_rec_list_end( ut_ad(!page_rec_is_metadata(orec)); if (type_mode & LOCK_WAIT) { - lock_reset_lock_and_trx_wait(lock); + ut_ad(lock->trx->lock.wait_lock == + lock); + lock->type_mode&= ~LOCK_WAIT; } lock_rec_add_to_queue( @@ -2806,7 +2865,9 @@ lock_move_rec_list_start( ut_ad(!page_rec_is_metadata(prev)); if (type_mode & LOCK_WAIT) { - lock_reset_lock_and_trx_wait(lock); + ut_ad(lock->trx->lock.wait_lock + == lock); + lock->type_mode&= ~LOCK_WAIT; } lock_rec_add_to_queue( @@ -2902,7 +2963,9 @@ lock_rtr_move_rec_list( if (rec1_heap_no < lock->un_member.rec_lock.n_bits && lock_rec_reset_nth_bit(lock, rec1_heap_no)) { if (type_mode & LOCK_WAIT) { - lock_reset_lock_and_trx_wait(lock); + ut_ad(lock->trx->lock.wait_lock + == lock); + lock->type_mode&= ~LOCK_WAIT; } lock_rec_add_to_queue( @@ -3348,10 +3411,8 @@ lock_table_create( in dictionary cache */ unsigned type_mode,/*!< in: lock mode possibly ORed with LOCK_WAIT */ - trx_t* trx /*!< in: trx */ -#ifdef WITH_WSREP - , lock_t* c_lock = NULL /*!< in: conflicting lock */ -#endif + trx_t* trx, /*!< in: trx */ + lock_t* c_lock = NULL /*!< in: conflicting lock */ ) { lock_t* lock; @@ -3434,8 +3495,7 @@ lock_table_create( ut_list_append(table->locks, lock, TableLockGetNode()); if (type_mode & LOCK_WAIT) { - - lock_set_lock_and_trx_wait(lock, trx); + lock_set_lock_and_trx_wait(lock, trx, c_lock); } lock->trx->lock.table_locks.push_back(lock); @@ -3590,10 +3650,8 @@ lock_table_enqueue_waiting( unsigned mode, /*!< in: lock mode this transaction is requesting */ dict_table_t* table, /*!< in/out: table */ - que_thr_t* thr /*!< in: query thread */ -#ifdef WITH_WSREP - , lock_t* c_lock /*!< in: conflicting lock or NULL */ -#endif + que_thr_t* thr, /*!< in: query thread */ + lock_t* c_lock /*!< in: conflicting lock or NULL */ ) { trx_t* trx; @@ -3624,11 +3682,7 @@ lock_table_enqueue_waiting( #endif /* WITH_WSREP */ /* Enqueue the lock request that will wait to be granted */ - lock = lock_table_create(table, mode | LOCK_WAIT, trx -#ifdef WITH_WSREP - , c_lock -#endif - ); + lock = lock_table_create(table, mode | LOCK_WAIT, trx, c_lock); const trx_t* victim_trx = DeadlockChecker::check_and_resolve(lock, trx); @@ -3784,11 +3838,7 @@ lock_table( if (wait_for != NULL) { err = lock_table_enqueue_waiting(flags | mode, table, - thr -#ifdef WITH_WSREP - , wait_for -#endif - ); + thr, wait_for); } else { lock_table_create(table, flags | mode, trx); @@ -3836,7 +3886,7 @@ lock_table_ix_resurrect( Checks if a waiting table lock request still has to wait in a queue. @return TRUE if still has to wait */ static -bool +const lock_t* lock_table_has_to_wait_in_queue( /*============================*/ const lock_t* wait_lock) /*!< in: waiting table lock */ @@ -3855,11 +3905,11 @@ lock_table_has_to_wait_in_queue( if (lock_has_to_wait(wait_lock, lock)) { - return(true); + return(lock); } } - return(false); + return(NULL); } /*************************************************************//** @@ -3888,9 +3938,17 @@ lock_table_dequeue( lock != NULL; lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) { - if (lock_get_wait(lock) - && !lock_table_has_to_wait_in_queue(lock)) { + if (!lock_get_wait(lock)) + continue; + ut_ad(lock->trx->lock.wait_trx); + ut_ad(lock->trx->lock.wait_lock); + + if (const lock_t *c = lock_table_has_to_wait_in_queue(lock)) { + trx_mutex_enter(lock->trx); + lock->trx->lock.wait_trx = c->trx; + trx_mutex_exit(lock->trx); + } else { /* Grant the lock */ ut_ad(in_lock->trx != lock->trx); lock_grant(lock); @@ -4035,7 +4093,8 @@ lock_rec_unlock( lock_mutex_enter(); trx_mutex_enter(trx); - first_lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no); + first_lock = lock_rec_get_first(&lock_sys.rec_hash, block->page.id(), + heap_no); /* Find the last lock with the same lock_mode and transaction on the record. */ @@ -4078,8 +4137,16 @@ released: if (!lock_get_wait(lock)) { continue; } - const lock_t* c = lock_rec_has_to_wait_in_queue(lock); - if (!c) { + ut_ad(lock->trx->lock.wait_trx); + ut_ad(lock->trx->lock.wait_lock); + if (const lock_t* c = lock_rec_has_to_wait_in_queue( + lock)) { + if (lock->trx != trx) + trx_mutex_enter(lock->trx); + lock->trx->lock.wait_trx = c->trx; + if (lock->trx != trx) + trx_mutex_exit(lock->trx); + } else { /* Grant the lock */ ut_ad(trx != lock->trx); lock_grant(lock); @@ -4773,7 +4840,7 @@ lock_rec_queue_validate( if (!page_rec_is_user_rec(rec)) { for (lock = lock_rec_get_first(&lock_sys.rec_hash, - block, heap_no); + block->page.id(), heap_no); lock != NULL; lock = lock_rec_get_next_const(heap_no, lock)) { @@ -4847,7 +4914,7 @@ func_exit: wsrep_report_bf_lock_wait(impl_trx->mysql_thd, impl_trx->id); wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id); - if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + if (!lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, block, heap_no, impl_trx)) { ib::info() << "WSREP impl BF lock conflict"; @@ -4856,7 +4923,20 @@ func_exit: #endif /* WITH_WSREP */ { ut_ad(lock_get_wait(other_lock)); - ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + /* After MDEV-27025 fix the following case is + possible: + 1. trx 1 acquires S-lock; + 2. trx 2 creates X-lock waiting for trx 1; + 3. trx 1 creates implicit lock, as + lock_rec_other_has_conflicting() returns no + conflicting trx 2 X-lock, the explicit lock + will not be created; + 4. trx 3 creates waiting X-lock, + it will wait for S-lock of trx 1. + That is why we relaxing the condition here and + check only for S-lock. + */ + ut_ad(lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, block, heap_no, impl_trx)); } } @@ -4864,7 +4944,8 @@ func_exit: mutex_exit(&impl_trx->mutex); } - for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no); + for (lock = lock_rec_get_first(&lock_sys.rec_hash, block->page.id(), + heap_no); lock != NULL; lock = lock_rec_get_next_const(heap_no, lock)) { ut_ad(!lock->trx->read_only @@ -5199,7 +5280,8 @@ lock_rec_insert_check_and_lock( BTR_NO_LOCKING_FLAG and skip the locking altogether. */ ut_ad(lock_table_has(trx, index->table, LOCK_IX)); - lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no); + lock = lock_rec_get_first(&lock_sys.rec_hash, block->page.id(), + heap_no); if (lock == NULL) { /* We optimize CPU time usage in the simplest case */ @@ -5238,19 +5320,13 @@ lock_rec_insert_check_and_lock( const unsigned type_mode = LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION; - if ( -#ifdef WITH_WSREP - lock_t* c_lock = -#endif /* WITH_WSREP */ + if (lock_t* c_lock = lock_rec_other_has_conflicting(type_mode, block, heap_no, trx)) { /* Note that we may get DB_SUCCESS also here! */ trx_mutex_enter(trx); - err = lock_rec_enqueue_waiting( -#ifdef WITH_WSREP - c_lock, -#endif /* WITH_WSREP */ - type_mode, block, heap_no, index, thr, NULL); + err = lock_rec_enqueue_waiting(c_lock, type_mode, block, + heap_no, index, thr, NULL); trx_mutex_exit(trx); } else { @@ -5327,7 +5403,7 @@ lock_rec_convert_impl_to_expl_for_trx( && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no, trx)) { lock_rec_add_to_queue(LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP, - block, heap_no, index, trx, true); + block, heap_no, index, trx, true, true); } lock_mutex_exit(); diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc index 1eb96a0dcf0..0ebf3b9fd1e 100644 --- a/storage/innobase/lock/lock0prdt.cc +++ b/storage/innobase/lock/lock0prdt.cc @@ -243,7 +243,7 @@ lock_prdt_has_lock( ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); for (lock = lock_rec_get_first( - lock_hash_get(type_mode), block, PRDT_HEAPNO); + lock_hash_get(type_mode), block->page.id(), PRDT_HEAPNO); lock != NULL; lock = lock_rec_get_next(PRDT_HEAPNO, lock)) { ut_ad(lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)); @@ -298,7 +298,7 @@ lock_prdt_other_has_conflicting( ut_ad(lock_mutex_own()); for (lock_t* lock = lock_rec_get_first( - lock_hash_get(mode), block, PRDT_HEAPNO); + lock_hash_get(mode), block->page.id(), PRDT_HEAPNO); lock != NULL; lock = lock_rec_get_next(PRDT_HEAPNO, lock)) { @@ -489,9 +489,13 @@ lock_prdt_add_to_queue( } } - lock = lock_rec_create( + /* Note: We will not pass any conflicting lock to lock_rec_create(), + because we should be moving an existing waiting lock request. */ + ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx); + + lock = lock_rec_create(NULL, #ifdef WITH_WSREP - NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */ + NULL, /* FIXME: replicate SPATIAL INDEX locks */ #endif type_mode, block, PRDT_HEAPNO, index, trx, caller_owns_trx_mutex); @@ -543,7 +547,8 @@ lock_prdt_insert_check_and_lock( lock_t* lock; /* Only need to check locks on prdt_hash */ - lock = lock_rec_get_first(&lock_sys.prdt_hash, block, PRDT_HEAPNO); + lock = lock_rec_get_first(&lock_sys.prdt_hash, block->page.id(), + PRDT_HEAPNO); if (lock == NULL) { lock_mutex_exit(); @@ -581,9 +586,7 @@ lock_prdt_insert_check_and_lock( trx_mutex_enter(trx); err = lock_rec_enqueue_waiting( -#ifdef WITH_WSREP NULL, /* FIXME: replicate SPATIAL INDEX locks */ -#endif LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION, block, PRDT_HEAPNO, index, thr, prdt); @@ -822,9 +825,9 @@ lock_prdt_lock( lock_t* lock = lock_sys.get_first(hash, block->page.id()); if (lock == NULL) { - lock = lock_rec_create( + lock = lock_rec_create(NULL, #ifdef WITH_WSREP - NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */ + NULL, /* FIXME: replicate SPATIAL INDEX locks */ #endif prdt_mode, block, PRDT_HEAPNO, index, trx, FALSE); @@ -854,10 +857,8 @@ lock_prdt_lock( if (wait_for != NULL) { err = lock_rec_enqueue_waiting( -#ifdef WITH_WSREP NULL, /* FIXME: replicate SPATIAL INDEX locks */ -#endif prdt_mode, block, PRDT_HEAPNO, index, thr, prdt); @@ -937,9 +938,9 @@ lock_place_prdt_page_lock( } if (lock == NULL) { - lock = lock_rec_create_low( + lock = lock_rec_create_low(NULL, #ifdef WITH_WSREP - NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */ + NULL, /* FIXME: replicate SPATIAL INDEX locks */ #endif mode, page_id, NULL, PRDT_HEAPNO, index, trx, FALSE); @@ -985,7 +986,7 @@ lock_prdt_rec_move( lock_mutex_enter(); for (lock_t *lock = lock_rec_get_first(&lock_sys.prdt_hash, - donator, PRDT_HEAPNO); + donator->page.id(), PRDT_HEAPNO); lock != NULL; lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {