Merge 11.2 into 11.4

2025-01-15 19:42:28 +01:00 · 2024-09-13 13:09:23 +03:00 · 2024-09-13 13:09:23 +03:00 · 762ad01c7f
commit 762ad01c7f
parent 2c3b298337 f0de610d0c
17 changed files with 846 additions and 664 deletions
--- a/mysql-test/suite/galera/r/MDEV-33133.result
+++ b/mysql-test/suite/galera/r/MDEV-33133.result
@ -0,0 +1,34 @@
+connection node_2;
+connection node_1;
+connect node_1a,127.0.0.1,root,,test,$NODE_MYPORT_1;
+connection node_1;
+CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
+SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_rollback_mdl_release';
+connection node_2;
+SET SESSION wsrep_trx_fragment_size = 1;
+START TRANSACTION;
+INSERT INTO t1 VALUES (1);
+connection node_1a;
+SELECT COUNT(*) FROM t1;
+COUNT(*)
+0
+SET SESSION wsrep_retry_autocommit = 0;
+SET DEBUG_SYNC = 'ha_write_row_start SIGNAL may_toi WAIT_FOR bf_abort';
+INSERT INTO t1 VALUES (2);
+connection node_1;
+SET DEBUG_SYNC = 'now WAIT_FOR may_toi';
+SET DEBUG_SYNC = 'after_wsrep_thd_abort WAIT_FOR sync.wsrep_rollback_mdl_release_reached';
+TRUNCATE TABLE t1;
+connection node_1a;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+SET DEBUG_SYNC = 'now SIGNAL signal.wsrep_rollback_mdl_release';
+connection node_2;
+INSERT INTO t1 VALUES (3);
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+connection node_1;
+SET GLOBAL DEBUG_DBUG = '';
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t1;
+disconnect node_1a;
+disconnect node_2;
+disconnect node_1;
--- a/mysql-test/suite/galera/t/MDEV-33133.test
+++ b/mysql-test/suite/galera/t/MDEV-33133.test
@ -0,0 +1,80 @@
+#
+# MDEV-33133: MDL conflict handling code should skip transactions
+# BF-aborted before.
+#
+# It's possible that MDL conflict handling code is called more
+# than once for a transaction when:
+# - it holds more than one conflicting MDL lock
+# - reschedule_waiters() is executed,
+# which results in repeated attempts to BF-abort an already aborted
+# transaction.
+# In such situations, it might be that BF-aborting logic sees
+# a partially rolled back transaction and erroneously decides
+# on future actions for such a transaction.
+#
+# The specific situation tested and fixed is when a SR transaction
+# applied in the node gets BF-aborted by a started TOI operation.
+# It's then caught with the server transaction already rolled back,
+# but with no MDL locks yet released. This caused wrong state
+# detection for such a transaction during repeated MDL conflict
+# handling code execution.
+#
+
+--source include/galera_cluster.inc
+--source include/have_debug_sync.inc
+--source include/have_debug.inc
+
+--connect node_1a,127.0.0.1,root,,test,$NODE_MYPORT_1
+
+--connection node_1
+CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
+SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_rollback_mdl_release';
+
+--connection node_2
+SET SESSION wsrep_trx_fragment_size = 1;
+START TRANSACTION;
+INSERT INTO t1 VALUES (1);
+
+--connection node_1a
+# Sync wait for SR transaction to replicate and apply fragment.
+SELECT COUNT(*) FROM t1;
+SET SESSION wsrep_retry_autocommit = 0;
+SET DEBUG_SYNC = 'ha_write_row_start SIGNAL may_toi WAIT_FOR bf_abort';
+--send
+  INSERT INTO t1 VALUES (2);
+
+--connection node_1
+SET DEBUG_SYNC = 'now WAIT_FOR may_toi';
+# BF-abort SR transaction and wait until it reaches the point
+# prior to releasing MDL locks.
+# Then abort local INSERT, which will go through reschedule_waiters()
+# and see SR transaction holding MDL locks but already rolled back.
+# In this case SR transaction should be skipped in MDL conflict
+# handling code.
+SET DEBUG_SYNC = 'after_wsrep_thd_abort WAIT_FOR sync.wsrep_rollback_mdl_release_reached';
+--send
+  TRUNCATE TABLE t1;
+
+--connection node_1a
+# Local INSERT gets aborted.
+--error ER_LOCK_DEADLOCK
+--reap
+# Let the aborted SR transaction continue and finally release MDL locks,
+# which in turn allows TRUNCATE to complete.
+SET DEBUG_SYNC = 'now SIGNAL signal.wsrep_rollback_mdl_release';
+
+--connection node_2
+# SR transaction has been BF-aborted.
+--error ER_LOCK_DEADLOCK
+INSERT INTO t1 VALUES (3);
+
+--connection node_1
+# TRUNCATE completes.
+--reap
+
+# Cleanup
+SET GLOBAL DEBUG_DBUG = '';
+SET DEBUG_SYNC = 'RESET';
+DROP TABLE t1;
+--disconnect node_1a
+--source include/galera_end.inc
--- a/scripts/sys_schema/README.md
+++ b/scripts/sys_schema/README.md
--- a/sql/wsrep_high_priority_service.cc
+++ b/sql/wsrep_high_priority_service.cc
@ -393,6 +393,18 @@ int Wsrep_high_priority_service::rollback(const wsrep::ws_handle& ws_handle,
              wsrep_thd_transaction_state_str(m_thd),
              m_thd->killed);

+#ifdef ENABLED_DEBUG_SYNC
+  DBUG_EXECUTE_IF("sync.wsrep_rollback_mdl_release",
+                  {
+                    const char act[]=
+                      "now "
+                      "SIGNAL sync.wsrep_rollback_mdl_release_reached "
+                      "WAIT_FOR signal.wsrep_rollback_mdl_release";
+                    DBUG_ASSERT(!debug_sync_set_action(m_thd,
+                                                       STRING_WITH_LEN(act)));
+                  };);
+#endif
+
  m_thd->release_transactional_locks();

  free_root(m_thd->mem_root, MYF(MY_KEEP_PREALLOC));
--- a/sql/wsrep_mysqld.cc
+++ b/sql/wsrep_mysqld.cc
@ -3207,8 +3207,13 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx,
    mysql_mutex_lock(&granted_thd->LOCK_thd_kill);
    mysql_mutex_lock(&granted_thd->LOCK_thd_data);

-    if (wsrep_thd_is_toi(granted_thd) ||
-        wsrep_thd_is_applying(granted_thd))
+    if (granted_thd->wsrep_aborter != 0)
+    {
+      DBUG_ASSERT(granted_thd->wsrep_aborter == request_thd->thread_id);
+      WSREP_DEBUG("BF thread waiting for a victim to release locks");
+    }
+    else if (wsrep_thd_is_toi(granted_thd) ||
+             wsrep_thd_is_applying(granted_thd))
    {
      if (wsrep_thd_is_aborting(granted_thd))
      {
@ -3298,6 +3303,7 @@ void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx,
    }
    mysql_mutex_unlock(&granted_thd->LOCK_thd_data);
    mysql_mutex_unlock(&granted_thd->LOCK_thd_kill);
+    DEBUG_SYNC(request_thd, "after_wsrep_thd_abort");
  }
  else
  {
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@ -1126,7 +1126,7 @@ void btr_drop_temporary_table(const dict_table_t &table)
  {
    if (buf_block_t *block= buf_page_get_gen({SRV_TMP_SPACE_ID, index->page},
                                             0, RW_X_LATCH, nullptr, BUF_GET,
-                                             &mtr, nullptr, nullptr))
+                                             &mtr, nullptr))
    {
      btr_free_but_not_root(block, MTR_LOG_NO_REDO);
      mtr.set_log_mode(MTR_LOG_NO_REDO);
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@ -926,24 +926,21 @@ static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode)

 MY_ATTRIBUTE((nonnull,warn_unused_result))
 /** Acquire a latch on the previous page without violating the latching order.
-@param block    index page
-@param page_id  page identifier with valid space identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param rw_latch the latch on block (RW_S_LATCH or RW_X_LATCH)
-@param mtr      mini-transaction
+@param page_id  page identifier with valid space identifier
@param err      error code
+@param mtr      mini-transaction
@retval 0  if an error occurred
@retval 1  if the page could be latched in the wrong order
@retval -1 if the latch on block was temporarily released */
-static int btr_latch_prev(buf_block_t *block, page_id_t page_id,
-                          ulint zip_size,
-                          rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err)
+static int btr_latch_prev(rw_lock_type_t rw_latch,
+                          page_id_t page_id, dberr_t *err, mtr_t *mtr)
 {
  ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
-  ut_ad(page_id.space() == block->page.id().space());

-  const auto prev_savepoint= mtr->get_savepoint();
-  ut_ad(block == mtr->at_savepoint(prev_savepoint - 1));
+  buf_block_t *block= mtr->at_savepoint(mtr->get_savepoint() - 1);
+
+  ut_ad(page_id.space() == block->page.id().space());

  const page_t *const page= block->page.frame;
  page_id.set_page_no(btr_page_get_prev(page));
@ -959,68 +956,78 @@ static int btr_latch_prev(buf_block_t *block, page_id_t page_id,
  buffer-fixes on both blocks will prevent eviction. */

 retry:
-  /* Pass no_wait pointer to ensure that we don't wait on the current page
-  latch while holding the next page latch to avoid latch ordering violation. */
-  bool no_wait= false;
  int ret= 1;
-
-  buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr,
-                                      BUF_GET, mtr, err,  &no_wait);
+  buf_block_t *prev= buf_pool.page_fix(page_id, err, buf_pool_t::FIX_NOWAIT);
  if (UNIV_UNLIKELY(!prev))
-  {
-    /* Check if we had to return because we couldn't wait on latch. */
-    if (no_wait)
-      goto ordered_latch;
    return 0;
+  if (prev == reinterpret_cast<buf_block_t*>(-1))
+  {
+    /* The block existed in buf_pool.page_hash, but not in a state that is
+    safe to access without waiting for some pending operation, such as
+    buf_page_t::read_complete() or buf_pool_t::unzip().
+
+    Retry while temporarily releasing the successor block->page.lock
+    (but retaining a buffer-fix so that the block cannot be evicted). */
+
+    if (rw_latch == RW_S_LATCH)
+      block->page.lock.s_unlock();
+    else
+      block->page.lock.x_unlock();
+
+    prev= buf_pool.page_fix(page_id, err, buf_pool_t::FIX_WAIT_READ);
+
+    if (!prev)
+    {
+      ut_ad(*err != DB_SUCCESS);
+      if (rw_latch == RW_S_LATCH)
+        block->page.lock.s_lock();
+      else
+        block->page.lock.x_lock();
+      return 0;
+    }
+    else if (rw_latch == RW_S_LATCH)
+      goto wait_for_s;
+    else
+      goto wait_for_x;
  }

  static_assert(MTR_MEMO_PAGE_S_FIX == mtr_memo_type_t(BTR_SEARCH_LEAF), "");
  static_assert(MTR_MEMO_PAGE_X_FIX == mtr_memo_type_t(BTR_MODIFY_LEAF), "");

  if (rw_latch == RW_S_LATCH
-      ? prev->page.lock.s_lock_try() : prev->page.lock.x_lock_try())
-  {
-    mtr->lock_register(prev_savepoint, mtr_memo_type_t(rw_latch));
-    if (UNIV_UNLIKELY(prev->page.id() != page_id))
-    {
-    fail:
-      /* the page was just read and found to be corrupted */
-      mtr->rollback_to_savepoint(prev_savepoint);
-      return 0;
-    }
-  }
+      ? prev->page.lock.s_lock_try()
+      : prev->page.lock.x_lock_try())
+    mtr->memo_push(prev, mtr_memo_type_t(rw_latch));
  else
  {
-    ut_ad(mtr->at_savepoint(mtr->get_savepoint() - 1)->page.id() == page_id);
-    mtr->release_last_page();
-ordered_latch:
    if (rw_latch == RW_S_LATCH)
+    {
      block->page.lock.s_unlock();
-    else
-      block->page.lock.x_unlock();
-
-    prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev,
-                           BUF_GET, mtr, err);
-    if (rw_latch == RW_S_LATCH)
+    wait_for_s:
+      prev->page.lock.s_lock();
      block->page.lock.s_lock();
+    }
    else
+    {
+      block->page.lock.x_unlock();
+    wait_for_x:
+      prev->page.lock.x_lock();
      block->page.lock.x_lock();
+    }

+    ut_ad(block == mtr->at_savepoint(mtr->get_savepoint() - 1));
+    mtr->memo_push(prev, mtr_memo_type_t(rw_latch));
    const page_id_t prev_page_id= page_id;
    page_id.set_page_no(btr_page_get_prev(page));
+    ret= -1;

    if (UNIV_UNLIKELY(page_id != prev_page_id))
    {
      mtr->release_last_page();
      if (page_id.page_no() == FIL_NULL)
-        return -1;
+        return ret;
      goto retry;
    }
-
-    if (UNIV_UNLIKELY(!prev))
-      goto fail;
-
-    ret= -1;
  }

  const page_t *const p= prev->page.frame;
@ -1047,11 +1054,11 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
  btr_intention_t lock_intention;
  bool detected_same_key_root= false;

-  mem_heap_t*	heap		= NULL;
-  rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
-  rec_offs*	offsets		= offsets_;
-  rec_offs	offsets2_[REC_OFFS_NORMAL_SIZE];
-  rec_offs*	offsets2	= offsets2_;
+  mem_heap_t *heap= nullptr;
+  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+  rec_offs *offsets= offsets_;
+  rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+  rec_offs *offsets2= offsets2_;
  rec_offs_init(offsets_);
  rec_offs_init(offsets2_);

@ -1208,7 +1215,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
    ut_a(page_zip_validate(page_zip, block->page.frame, index()));
 #endif /* UNIV_ZIP_DEBUG */

-  const uint32_t page_level= btr_page_get_level(block->page.frame);
+  uint32_t page_level= btr_page_get_level(block->page.frame);

  if (height == ULINT_UNDEFINED)
  {
@ -1216,6 +1223,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
 #ifdef BTR_CUR_ADAPT
    info->root_guess= block;
 #endif
+  reached_root:
    height= page_level;
    tree_height= height + 1;

@ -1225,35 +1233,53 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
      We may have to reacquire the page latch in a different mode. */
      switch (rw_latch) {
      case RW_S_LATCH:
-        if ((latch_mode & ~12) != RW_S_LATCH)
+        if (!(latch_mode & BTR_SEARCH_LEAF))
        {
+          rw_latch= RW_X_LATCH;
          ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH);
-          goto relatch_x;
+          mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_X_FIX);
+          if (!block->page.lock.s_x_upgrade_try())
+          {
+            block->page.lock.s_unlock();
+            block->page.lock.x_lock();
+            /* Dropping the index tree (and freeing the root page)
+            should be impossible while we hold index()->lock. */
+            ut_ad(!block->page.is_freed());
+            page_level= btr_page_get_level(block->page.frame);
+            if (UNIV_UNLIKELY(page_level != 0))
+            {
+              /* btr_root_raise_and_insert() was executed meanwhile */
+              ut_ad(mtr->memo_contains_flagged(&index()->lock,
+                                               MTR_MEMO_S_LOCK));
+              block->page.lock.x_u_downgrade();
+              block->page.lock.u_s_downgrade();
+              rw_latch= RW_S_LATCH;
+              mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_S_FIX);
+              goto reached_root;
+            }
+          }
        }
-        else
-        {
-          if (!latch_by_caller)
-            /* Release the tree s-latch */
-            mtr->rollback_to_savepoint(savepoint, savepoint + 1);
-          goto reached_latched_leaf;
-        }
-        /* fall through */
+        if (rw_latch != RW_S_LATCH)
+          break;
+        if (!latch_by_caller)
+          /* Release the tree s-latch */
+          mtr->rollback_to_savepoint(savepoint, savepoint + 1);
+        goto reached_latched_leaf;
      case RW_SX_LATCH:
-        ut_ad(rw_latch == RW_S_LATCH ||
-              latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
-      relatch_x:
-        mtr->rollback_to_savepoint(block_savepoint);
-        height= ULINT_UNDEFINED;
+        ut_ad(latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
+        static_assert(int{BTR_MODIFY_ROOT_AND_LEAF} == int{RW_SX_LATCH}, "");
        rw_latch= RW_X_LATCH;
-        goto search_loop;
+        mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_X_FIX);
+        block->page.lock.u_x_upgrade();
+        break;
      case RW_X_LATCH:
        if (latch_mode == BTR_MODIFY_TREE)
          goto reached_index_root_and_leaf;
-        goto reached_root_and_leaf;
+        break;
      case RW_NO_LATCH:
        ut_ad(0);
      }
-      goto reached_leaf;
+      goto reached_root_and_leaf;
    }
  }
  else if (UNIV_UNLIKELY(height != page_level))
@ -1277,7 +1303,6 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,

  if (!height)
  {
-  reached_leaf:
    /* We reached the leaf level. */
    ut_ad(block == mtr->at_savepoint(block_savepoint));

@ -1307,7 +1332,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,

      /* latch also siblings from left to right */
      if (page_has_prev(block->page.frame) &&
-          !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err))
+          !btr_latch_prev(rw_latch, page_id, &err, mtr))
        goto func_exit;
      if (page_has_next(block->page.frame) &&
          !btr_block_get(*index(), btr_page_get_next(block->page.frame),
@ -1333,7 +1358,7 @@ release_tree:
      ut_ad(rw_latch == RW_X_LATCH);
      /* x-latch also siblings from left to right */
      if (page_has_prev(block->page.frame) &&
-          !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err))
+          !btr_latch_prev(rw_latch, page_id, &err, mtr))
        goto func_exit;
      if (page_has_next(block->page.frame) &&
          !btr_block_get(*index(), btr_page_get_next(block->page.frame),
@ -1480,7 +1505,7 @@ release_tree:
      ut_ad(rw_latch == RW_S_LATCH);

      if (!not_first_access)
-        buf_read_ahead_linear(page_id, zip_size);
+        buf_read_ahead_linear(page_id);

      if (page_has_prev(block->page.frame) &&
          page_rec_is_first(page_cur.rec, block->page.frame))
@ -1489,7 +1514,7 @@ release_tree:

        /* Latch the previous page if the node pointer is the leftmost
        of the current page. */
-        int ret= btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err);
+        int ret= btr_latch_prev(rw_latch, page_id, &err, mtr);
        if (!ret)
          goto func_exit;
        ut_ad(block_savepoint + 2 == mtr->get_savepoint());
@ -1515,7 +1540,7 @@ release_tree:
    case BTR_SEARCH_LEAF:
      rw_latch= rw_lock_type_t(latch_mode);
      if (!not_first_access)
-        buf_read_ahead_linear(page_id, zip_size);
+        buf_read_ahead_linear(page_id);
      break;
    case BTR_MODIFY_TREE:
      ut_ad(rw_latch == RW_X_LATCH);
@ -1666,8 +1691,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
 #endif /* UNIV_ZIP_DEBUG */

  if (page_has_prev(block->page.frame) &&
-      !btr_latch_prev(block, page_id, block->zip_size(),
-                      RW_X_LATCH, mtr, &err))
+      !btr_latch_prev(RW_X_LATCH, page_id, &err, mtr))
    goto func_exit;
  if (page_has_next(block->page.frame) &&
      !btr_block_get(*index(), btr_page_get_next(block->page.frame),
@ -1880,7 +1904,6 @@ index_locked:
  page_cur.index = index;

  uint32_t page= index->page;
-  const auto zip_size= index->table->space->zip_size();

  for (ulint height= ULINT_UNDEFINED;;)
  {
@ -1931,8 +1954,7 @@ index_locked:
        {
          /* x-latch also siblings from left to right */
          if (page_has_prev(block->page.frame) &&
-              !btr_latch_prev(block, block->page.id(), zip_size, RW_X_LATCH,
-                              mtr, &err))
+              !btr_latch_prev(RW_X_LATCH, block->page.id(), &err, mtr))
            break;
          if (page_has_next(block->page.frame) &&
              !btr_block_get(*index, btr_page_get_next(block->page.frame),
@ -1986,8 +2008,7 @@ index_locked:
    if (latch_mode != BTR_MODIFY_TREE)
    {
      if (!height && first && first_access)
-        buf_read_ahead_linear(page_id_t(block->page.id().space(), page),
-                              block->page.zip_size());
+        buf_read_ahead_linear(page_id_t(block->page.id().space(), page));
    }
    else if (btr_cur_need_opposite_intention(block->page, index->is_clust(),
                                             lock_intention,
@ -2012,7 +2033,8 @@ index_locked:
    {
      if (!btr_cur_will_modify_tree(index, block->page.frame,
                                    lock_intention, page_cur.rec,
-                                    node_ptr_max_size, zip_size, mtr))
+                                    node_ptr_max_size,
+                                    index->table->space->zip_size(), mtr))
      {
        ut_ad(n_blocks);
        /* release buffer-fixes on pages that will not be modified
@ -6408,7 +6430,7 @@ btr_copy_blob_prefix(
 			return copied_len;
 		}
 		if (!buf_page_make_young_if_needed(&block->page)) {
-			buf_read_ahead_linear(id, 0);
+			buf_read_ahead_linear(id);
 		}

 		page = buf_block_get_frame(block);
@ -6487,7 +6509,7 @@ btr_copy_zblob_prefix(
 		bpage is protected by the B-tree page latch that
 		is being held on the clustered index record, or,
 		in row_merge_copy_blobs(), by an exclusive table lock. */
-		bpage = buf_page_get_zip(id, zip_size);
+		bpage = buf_page_get_zip(id);

 		if (UNIV_UNLIKELY(!bpage)) {
 			ib::error() << "Cannot load compressed BLOB " << id;
--- a/storage/innobase/btr/btr0pcur.cc
+++ b/storage/innobase/btr/btr0pcur.cc
@ -533,8 +533,7 @@ btr_pcur_move_to_next_page(
 	const auto s = mtr->get_savepoint();
 	mtr->rollback_to_savepoint(s - 2, s - 1);
 	if (first_access) {
-		buf_read_ahead_linear(next_block->page.id(),
-				      next_block->zip_size());
+		buf_read_ahead_linear(next_block->page.id());
 	}
 	return DB_SUCCESS;
 }
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@ -2349,13 +2349,10 @@ be implemented at a higher level.  In other words, all possible
 accesses to a given page through this function must be protected by
 the same set of mutexes or latches.
@param page_id   page identifier
-@param zip_size  ROW_FORMAT=COMPRESSED page size in bytes
@return pointer to the block, s-latched */
 TRANSACTIONAL_TARGET
-buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
+buf_page_t* buf_page_get_zip(const page_id_t page_id)
 {
-  ut_ad(zip_size);
-  ut_ad(ut_is_2pow(zip_size));
  ha_handler_stats *const stats= mariadb_stats;
  buf_inc_get(stats);

@ -2456,7 +2453,7 @@ lookup:
  return bpage;

 must_read_page:
-  switch (dberr_t err= buf_read_page(page_id, zip_size, chain)) {
+  switch (dberr_t err= buf_read_page(page_id, chain)) {
  case DB_SUCCESS:
  case DB_SUCCESS_LOCKED_REC:
    mariadb_increment_pages_read(stats);
@ -2491,8 +2488,8 @@ buf_block_init_low(

 /********************************************************************//**
 Decompress a block.
-@return TRUE if successful */
-ibool
+@return true if successful */
+bool
 buf_zip_decompress(
 /*===============*/
 	buf_block_t*	block,	/*!< in/out: block */
@ -2536,7 +2533,7 @@ func_exit:
 			if (space) {
 				space->release();
 			}
-			return(TRUE);
+			return true;
 		}

 		ib::error() << "Unable to decompress "
@ -2570,10 +2567,101 @@ err_exit:
 		space->release();
 	}

-	return(FALSE);
+	return false;
 }

-buf_block_t* buf_pool_t::page_fix(const page_id_t id)
+ATTRIBUTE_COLD
+buf_block_t *buf_pool_t::unzip(buf_page_t *b, buf_pool_t::hash_chain &chain)
+{
+  buf_block_t *block= buf_LRU_get_free_block(have_no_mutex);
+  buf_block_init_low(block);
+  page_hash_latch &hash_lock= page_hash.lock_get(chain);
+ wait_for_unfix:
+  mysql_mutex_lock(&mutex);
+  hash_lock.lock();
+
+  /* b->lock implies !b->can_relocate() */
+  ut_ad(b->lock.have_x());
+  ut_ad(b == page_hash.get(b->id(), chain));
+
+  /* Wait for b->unfix() in any other threads. */
+  uint32_t state= b->state();
+  ut_ad(buf_page_t::buf_fix_count(state));
+  ut_ad(!buf_page_t::is_freed(state));
+
+  switch (state) {
+  case buf_page_t::UNFIXED + 1:
+  case buf_page_t::REINIT + 1:
+    break;
+  default:
+    ut_ad(state < buf_page_t::READ_FIX);
+
+    if (state < buf_page_t::UNFIXED + 1)
+    {
+      ut_ad(state > buf_page_t::FREED);
+      b->lock.x_unlock();
+      hash_lock.unlock();
+      buf_LRU_block_free_non_file_page(block);
+      mysql_mutex_unlock(&mutex);
+      b->unfix();
+      return nullptr;
+    }
+
+    mysql_mutex_unlock(&mutex);
+    hash_lock.unlock();
+    std::this_thread::sleep_for(std::chrono::microseconds(100));
+    goto wait_for_unfix;
+  }
+
+  /* Ensure that another buf_page_get_low() or buf_pool_t::page_fix()
+  will wait for block->page.lock.x_unlock(). buf_relocate() will
+  copy the state from b to block and replace b with block in page_hash. */
+  b->set_state(buf_page_t::READ_FIX);
+
+  mysql_mutex_lock(&flush_list_mutex);
+  buf_relocate(b, &block->page);
+
+  /* X-latch the block for the duration of the decompression. */
+  block->page.lock.x_lock();
+
+  buf_flush_relocate_on_flush_list(b, &block->page);
+  mysql_mutex_unlock(&flush_list_mutex);
+
+  /* Insert at the front of unzip_LRU list */
+  buf_unzip_LRU_add_block(block, false);
+
+  mysql_mutex_unlock(&mutex);
+  hash_lock.unlock();
+
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+  b->lock.x_unlock();
+  b->lock.free();
+#endif
+  ut_free(b);
+
+  n_pend_unzip++;
+  const bool ok{buf_zip_decompress(block, false)};
+  n_pend_unzip--;
+
+  if (UNIV_UNLIKELY(!ok))
+  {
+    mysql_mutex_lock(&mutex);
+    block->page.read_unfix(state);
+    block->page.lock.x_unlock();
+    if (!buf_LRU_free_page(&block->page, true))
+      ut_ad(0);
+    mysql_mutex_unlock(&mutex);
+    return nullptr;
+  }
+  else
+    block->page.read_unfix(state);
+
+  return block;
+}
+
+buf_block_t *buf_pool_t::page_fix(const page_id_t id,
+                                  dberr_t *err,
+                                  buf_pool_t::page_fix_conflicts c)
 {
  ha_handler_stats *const stats= mariadb_stats;
  buf_inc_get(stats);
@ -2585,35 +2673,83 @@ buf_block_t* buf_pool_t::page_fix(const page_id_t id)
    buf_page_t *b= page_hash.get(id, chain);
    if (b)
    {
-      uint32_t state= b->fix();
-      hash_lock.unlock_shared();
+      uint32_t state= b->fix() + 1;
      ut_ad(!b->in_zip_hash);
-      ut_ad(b->frame);
-      ut_ad(state >= buf_page_t::FREED);
+      hash_lock.unlock_shared();
+
+      if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED))
+      {
+        ut_ad(state > buf_page_t::FREED);
+        if (c == FIX_ALSO_FREED && b->id() == id)
+        {
+          ut_ad(state == buf_page_t::FREED + 1);
+          return reinterpret_cast<buf_block_t*>(b);
+        }
+        /* The page was marked as freed or corrupted. */
+        b->unfix();
+      corrupted:
+        if (err)
+          *err= DB_CORRUPTION;
+        return nullptr;
+      }
+
      if (state >= buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX)
      {
+        if (c == FIX_NOWAIT)
+        {
+        would_block:
+          b->unfix();
+          return reinterpret_cast<buf_block_t*>(-1);
+        }
+
+        if (UNIV_UNLIKELY(!b->frame))
+        {
+        wait_for_unzip:
+          b->unfix();
+          std::this_thread::sleep_for(std::chrono::microseconds(100));
+          continue;
+        }
        b->lock.s_lock();
        state= b->state();
        ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
+
        b->lock.s_unlock();
      }
-      if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED))
+
+      if (UNIV_UNLIKELY(!b->frame))
      {
-        /* The page was marked as freed or corrupted. */
-        b->unfix();
-        b= nullptr;
+        if (b->lock.x_lock_try());
+        else if (c == FIX_NOWAIT)
+          goto would_block;
+        else
+          goto wait_for_unzip;
+
+        buf_block_t *block= unzip(b, chain);
+        if (!block)
+          goto corrupted;
+
+        b= &block->page;
+        state= b->state();
+        b->lock.x_unlock();
      }
+
      return reinterpret_cast<buf_block_t*>(b);
    }

    hash_lock.unlock_shared();
-    switch (buf_read_page(id, 0, chain)) {
+
+    if (c == FIX_NOWAIT)
+      return reinterpret_cast<buf_block_t*>(-1);
+
+    switch (dberr_t local_err= buf_read_page(id, chain)) {
    default:
+      if (err)
+        *err= local_err;
      return nullptr;
    case DB_SUCCESS:
    case DB_SUCCESS_LOCKED_REC:
      mariadb_increment_pages_read(stats);
-      buf_read_ahead_random(id, 0);
+      buf_read_ahead_random(id);
    }
  }
 }
@ -2621,27 +2757,24 @@ buf_block_t* buf_pool_t::page_fix(const page_id_t id)
 /** Low level function used to get access to a database page.
@param[in]	page_id			page id
@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	rw_latch		latch mode
@param[in]	guess			guessed block or NULL
@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
 or BUF_PEEK_IF_IN_POOL
@param[in]	mtr			mini-transaction
@param[out]	err			DB_SUCCESS or error code
-@param[in,out]	no_wait			If not NULL on input, then we must not
-wait for current page latch. On output, the value is set to true if we had to
-return because we could not wait on page latch.
-@return pointer to the block or NULL */
+@return pointer to the block
+@retval nullptr	if the block is corrupted or unavailable */
 TRANSACTIONAL_TARGET
 buf_block_t*
 buf_page_get_gen(
 	const page_id_t		page_id,
 	ulint			zip_size,
-	ulint			rw_latch,
+	rw_lock_type_t		rw_latch,
 	buf_block_t*		guess,
 	ulint			mode,
 	mtr_t*			mtr,
-	dberr_t*		err,
-        bool*			no_wait)
+	dberr_t*		err)
 {
 	ulint		retries = 0;

@ -2660,12 +2793,7 @@ buf_page_get_gen(
 	      || log_sys.get_lsn() == recv_sys.lsn + SIZE_OF_FILE_CHECKPOINT
 	      || ibuf_upgrade_was_needed
 	      : !recv_recovery_is_on() || recv_sys.after_apply);
-	ut_ad(!mtr || mtr->is_active());
-	ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL);
-	ut_ad((rw_latch == RW_S_LATCH)
-	      || (rw_latch == RW_X_LATCH)
-	      || (rw_latch == RW_SX_LATCH)
-	      || (rw_latch == RW_NO_LATCH));
+	ut_ad(mtr->is_active());

 	if (err) {
 		*err = DB_SUCCESS;
@ -2747,11 +2875,11 @@ loop:
 	corrupted, or if an encrypted page with a valid
 	checksum cannot be decypted. */

-	switch (dberr_t local_err = buf_read_page(page_id, zip_size, chain)) {
+	switch (dberr_t local_err = buf_read_page(page_id, chain)) {
 	case DB_SUCCESS:
 	case DB_SUCCESS_LOCKED_REC:
 		mariadb_increment_pages_read(stats);
-		buf_read_ahead_random(page_id, zip_size);
+		buf_read_ahead_random(page_id);
 		break;
 	default:
 		if (mode != BUF_GET_POSSIBLY_FREED
@ -2795,18 +2923,7 @@ ignore_unfixed:
 		in buf_page_t::read_complete() or
 		buf_pool_t::corrupted_evict(), or
 		after buf_zip_decompress() in this function. */
-		if (!no_wait) {
-			block->page.lock.s_lock();
-		} else if (!block->page.lock.s_lock_try()) {
-			ut_ad(rw_latch == RW_NO_LATCH);
-			/* We should not wait trying to acquire S latch for
-			current page while holding latch for the next page.
-			It would violate the latching order resulting in
-			possible deadlock. Caller must handle the failure. */
-			block->page.unfix();
-			*no_wait= true;
-			return nullptr;
-		}
+		block->page.lock.s_lock();
 		state = block->page.state();
 		ut_ad(state < buf_page_t::READ_FIX
 		      || state >= buf_page_t::WRITE_FIX);
@ -2836,18 +2953,6 @@ ignore_unfixed:
 		}
 		ut_ad(id == page_id);
 	} else if (mode != BUF_PEEK_IF_IN_POOL) {
-	} else if (!mtr) {
-		ut_ad(!block->page.oldest_modification());
-		mysql_mutex_lock(&buf_pool.mutex);
-		block->unfix();
-
-free_unfixed_block:
-		if (!buf_LRU_free_page(&block->page, true)) {
-			ut_ad(0);
-		}
-
-		mysql_mutex_unlock(&buf_pool.mutex);
-		return nullptr;
 	} else if (UNIV_UNLIKELY(!block->page.frame)) {
 		/* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an
 		adaptive hash index. There cannot be an
@ -2858,120 +2963,35 @@ free_unfixed_block:
 	ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL
 	      || block->zip_size() == zip_size);

-	if (UNIV_UNLIKELY(!block->page.frame)) {
-		if (!block->page.lock.x_lock_try()) {
-wait_for_unzip:
-			/* The page is being read or written, or
-			another thread is executing buf_zip_decompress()
-			in buf_page_get_gen() on it. */
-			block->page.unfix();
-			std::this_thread::sleep_for(
-				std::chrono::microseconds(100));
-			goto loop;
-		}
-
-		buf_block_t *new_block = buf_LRU_get_free_block(have_no_mutex);
-		buf_block_init_low(new_block);
-
-wait_for_unfix:
-		mysql_mutex_lock(&buf_pool.mutex);
-		page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain);
-
-		/* It does not make sense to use
-		transactional_lock_guard here, because buf_relocate()
-		would likely make a  memory transaction too large. */
-		hash_lock.lock();
-
-		/* block->page.lock implies !block->page.can_relocate() */
-		ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain));
-
-		/* Wait for any other threads to release their buffer-fix
-		on the compressed-only block descriptor. */
-		state = block->page.state();
-
-		switch (state) {
-		case buf_page_t::UNFIXED + 1:
-		case buf_page_t::REINIT + 1:
-			break;
-		default:
-			ut_ad(state < buf_page_t::READ_FIX);
-
-			if (state < buf_page_t::UNFIXED + 1) {
-				ut_ad(state > buf_page_t::FREED);
-				block->page.lock.x_unlock();
-				hash_lock.unlock();
-				buf_LRU_block_free_non_file_page(new_block);
-				mysql_mutex_unlock(&buf_pool.mutex);
-				goto ignore_block;
-			}
-
-			mysql_mutex_unlock(&buf_pool.mutex);
-			hash_lock.unlock();
-			std::this_thread::sleep_for(
-				std::chrono::microseconds(100));
-			goto wait_for_unfix;
-		}
-
-		/* Ensure that another buf_page_get_gen() will wait for
-		new_block->page.lock.x_unlock(). */
-		block->page.set_state(buf_page_t::READ_FIX);
-
-		/* Move the compressed page from block->page to new_block,
-		and uncompress it. */
-
-		mysql_mutex_lock(&buf_pool.flush_list_mutex);
-		buf_relocate(&block->page, &new_block->page);
-
-		/* X-latch the block for the duration of the decompression. */
-		new_block->page.lock.x_lock();
-		ut_d(block->page.lock.x_unlock());
-
-		buf_flush_relocate_on_flush_list(&block->page,
-						 &new_block->page);
-		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
-
-		/* Insert at the front of unzip_LRU list */
-		buf_unzip_LRU_add_block(new_block, FALSE);
-
-		mysql_mutex_unlock(&buf_pool.mutex);
-		hash_lock.unlock();
-
-#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
-		block->page.lock.free();
-#endif
-		ut_free(reinterpret_cast<buf_page_t*>(block));
-		block = new_block;
-
-		buf_pool.n_pend_unzip++;
-
-		/* Decompress the page while not holding
-		buf_pool.mutex. */
-		const auto ok = buf_zip_decompress(block, false);
-		--buf_pool.n_pend_unzip;
-		if (!ok) {
-			if (err) {
-				*err = DB_PAGE_CORRUPTED;
-			}
-			mysql_mutex_lock(&buf_pool.mutex);
-		}
-		state = block->page.read_unfix(state);
-		block->page.lock.x_unlock();
-
-		if (!ok) {
-			goto free_unfixed_block;
-		}
-	}
-
 	if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
 		goto ignore_block;
 	}
 	ut_ad((~buf_page_t::LRU_MASK) & state);
 	ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX);

+	if (UNIV_UNLIKELY(!block->page.frame)) {
+		if (!block->page.lock.x_lock_try()) {
+wait_for_unzip:
+			/* The page is being read or written, or
+			another thread is executing buf_pool.unzip() on it. */
+			block->page.unfix();
+			std::this_thread::sleep_for(
+				std::chrono::microseconds(100));
+			goto loop;
+		}
+
+		block = buf_pool.unzip(&block->page, chain);
+
+		if (!block) {
+			goto ignore_unfixed;
+		}
+
+		block->page.lock.x_unlock();
+	}
+
 #ifdef UNIV_DEBUG
 	if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
 #endif /* UNIV_DEBUG */
-	ut_ad(block->page.frame);

 	/* The state = block->page.state() may be stale at this point,
 	and in fact, at any point of time if we consider its
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@ -2682,12 +2682,12 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
 /** Flush the buffer pool on shutdown. */
 ATTRIBUTE_COLD void buf_flush_buffer_pool()
 {
-  ut_ad(!os_aio_pending_reads());
  ut_ad(!buf_page_cleaner_is_active);
  ut_ad(!buf_flush_sync_lsn);

  service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
                                 "Waiting to flush the buffer pool");
+  os_aio_wait_until_no_pending_reads(false);

  mysql_mutex_lock(&buf_pool.flush_list_mutex);

--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@ -303,10 +303,9 @@ pages: to avoid deadlocks this function must be written such that it cannot
 end up waiting for these latches!
@param[in]	page_id		page id of a page which the current thread
 wants to access
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@return number of page read requests issued */
 TRANSACTIONAL_TARGET
-ulint buf_read_ahead_random(const page_id_t page_id, ulint zip_size)
+ulint buf_read_ahead_random(const page_id_t page_id)
 {
  if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID)
    /* Disable the read-ahead for temporary tablespace */
@ -353,6 +352,7 @@ read_ahead:

  /* Read all the suitable blocks within the area */
  buf_block_t *block= nullptr;
+  unsigned zip_size{space->zip_size()};
  if (UNIV_LIKELY(!zip_size))
  {
  allocate_block:
@ -405,15 +405,14 @@ if it is not already there. Sets the io_fix and an exclusive lock
 on the buffer frame. The flag is cleared and the x-lock
 released by the i/o-handler thread.
@param page_id    page id
-@param zip_size   ROW_FORMAT=COMPRESSED page size, or 0
@param chain      buf_pool.page_hash cell for page_id
-@retval DB_SUCCESS if the page was read and is not corrupted,
+@retval DB_SUCCESS if the page was read and is not corrupted
@retval DB_SUCCESS_LOCKED_REC if the page was not read
@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
 after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
-dberr_t buf_read_page(const page_id_t page_id, ulint zip_size,
+dberr_t buf_read_page(const page_id_t page_id,
                      buf_pool_t::hash_chain &chain)
 {
  fil_space_t *space= fil_space_t::get(page_id.space());
@ -427,6 +426,8 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size,
  /* Our caller should already have ensured that the page does not
  exist in buf_pool.page_hash. */
  buf_block_t *block= nullptr;
+  unsigned zip_size= space->zip_size();
+
  if (UNIV_LIKELY(!zip_size))
  {
  allocate_block:
@ -511,10 +512,9 @@ NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
 function must be written such that it cannot end up waiting for these
 latches!
@param[in]	page_id		page id; see NOTE 3 above
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@return number of page read requests issued */
 TRANSACTIONAL_TARGET
-ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size)
+ulint buf_read_ahead_linear(const page_id_t page_id)
 {
  /* check if readahead is disabled.
  Disable the read ahead logic for temporary tablespace */
@ -553,6 +553,11 @@ fail:
    return 0;
  }

+  if (trx_sys_hdr_page(page_id))
+    /* If it is the trx sys hdr page (the only page checked here), we do
+    no read-ahead, as that could break the ibuf page access order */
+    goto fail;
+
  /* How many out of order accessed pages can we ignore
  when working out the access pattern for linear readahead */
  ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
@ -647,6 +652,7 @@ failed:

  /* If we got this far, read-ahead can be sensible: do it */
  buf_block_t *block= nullptr;
+  unsigned zip_size{space->zip_size()};
  if (UNIV_LIKELY(!zip_size))
  {
  allocate_block:
--- a/storage/innobase/gis/gis0sea.cc
+++ b/storage/innobase/gis/gis0sea.cc
@ -647,7 +647,7 @@ dberr_t rtr_search_to_nth_level(btr_cur_t *cur, que_thr_t *thr,

 search_loop:
  auto buf_mode= BUF_GET;
-  ulint rw_latch= RW_NO_LATCH;
+  rw_lock_type_t rw_latch= RW_NO_LATCH;

  if (height)
  {
@ -658,7 +658,7 @@ dberr_t rtr_search_to_nth_level(btr_cur_t *cur, que_thr_t *thr,
      rw_latch= upper_rw_latch;
  }
  else if (latch_mode <= BTR_MODIFY_LEAF)
-    rw_latch= latch_mode;
+    rw_latch= rw_lock_type_t(latch_mode);

  dberr_t err;
  auto block_savepoint= mtr->get_savepoint();
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@ -191,33 +191,29 @@ be implemented at a higher level.  In other words, all possible
 accesses to a given page through this function must be protected by
 the same set of mutexes or latches.
@param page_id   page identifier
-@param zip_size  ROW_FORMAT=COMPRESSED page size in bytes
@return pointer to the block, s-latched */
-buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size);
+buf_page_t *buf_page_get_zip(const page_id_t page_id);

 /** Get access to a database page. Buffered redo log may be applied.
@param[in]	page_id			page id
@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
-@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	rw_latch		latch mode
@param[in]	guess			guessed block or NULL
@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
 or BUF_PEEK_IF_IN_POOL
@param[in,out]	mtr			mini-transaction
@param[out]	err			DB_SUCCESS or error code
-@param[in,out]	no_wait			If not NULL on input, then we must not
-wait for current page latch. On output, the value is set to true if we had to
-return because we could not wait on page latch.
-@return pointer to the block or NULL */
+@return pointer to the block
+@retval nullptr	if the block is corrupted or unavailable */
 buf_block_t*
 buf_page_get_gen(
 	const page_id_t		page_id,
 	ulint			zip_size,
-	ulint			rw_latch,
+	rw_lock_type_t		rw_latch,
 	buf_block_t*		guess,
 	ulint			mode,
 	mtr_t*			mtr,
-	dberr_t*		err = nullptr,
-        bool*			no_wait = nullptr);
+	dberr_t*		err = nullptr);

 /** Initialize a page in the buffer pool. The page is usually not read
 from a file even if it cannot be found in the buffer buf_pool. This is one
@ -357,8 +353,8 @@ void buf_page_print(const byte* read_buf, ulint zip_size = 0)
 	ATTRIBUTE_COLD __attribute__((nonnull));
 /********************************************************************//**
 Decompress a block.
-@return TRUE if successful */
-ibool
+@return true if successful */
+bool
 buf_zip_decompress(
 /*===============*/
 	buf_block_t*	block,	/*!< in/out: block */
@ -627,30 +623,42 @@ public:
 public:
  const page_id_t &id() const { return id_; }
  uint32_t state() const { return zip.fix; }
-  uint32_t buf_fix_count() const
-  {
-    uint32_t f= state();
-    ut_ad(f >= FREED);
-    return f < UNFIXED ? (f - FREED) : (~LRU_MASK & f);
-  }
+  static uint32_t buf_fix_count(uint32_t s)
+  { ut_ad(s >= FREED); return s < UNFIXED ? (s - FREED) : (~LRU_MASK & s); }
+
+  uint32_t buf_fix_count() const { return buf_fix_count(state()); }
+  /** Check if a file block is io-fixed.
+  @param s   state()
+  @return whether s corresponds to an io-fixed block */
+  static bool is_io_fixed(uint32_t s)
+  { ut_ad(s >= FREED); return s >= READ_FIX; }
+  /** Check if a file block is read-fixed.
+  @param s   state()
+  @return whether s corresponds to a read-fixed block */
+  static bool is_read_fixed(uint32_t s)
+  { return is_io_fixed(s) && s < WRITE_FIX; }
+  /** Check if a file block is write-fixed.
+  @param s   state()
+  @return whether s corresponds to a write-fixed block */
+  static bool is_write_fixed(uint32_t s)
+  { ut_ad(s >= FREED); return s >= WRITE_FIX; }
+
  /** @return whether this block is read or write fixed;
  read_complete() or write_complete() will always release
  the io-fix before releasing U-lock or X-lock */
-  bool is_io_fixed() const
-  { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; }
+  bool is_io_fixed() const { return is_io_fixed(state()); }
  /** @return whether this block is write fixed;
  write_complete() will always release the write-fix before releasing U-lock */
-  bool is_write_fixed() const { return state() >= WRITE_FIX; }
-  /** @return whether this block is read fixed; this should never hold
-  when a thread is holding the block lock in any mode */
-  bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); }
+  bool is_write_fixed() const { return is_write_fixed(state()); }
+  /** @return whether this block is read fixed */
+  bool is_read_fixed() const { return is_read_fixed(state()); }

  /** @return if this belongs to buf_pool.unzip_LRU */
  bool belongs_to_unzip_LRU() const
  { return UNIV_LIKELY_NULL(zip.data) && frame; }

-  bool is_freed() const
-  { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; }
+  static bool is_freed(uint32_t s) { ut_ad(s >= FREED); return s < UNFIXED; }
+  bool is_freed() const { return is_freed(state()); }
  bool is_reinit() const { return !(~state() & REINIT); }

  void set_reinit(uint32_t prev_state)
@ -1358,11 +1366,43 @@ public:
  }

 public:
+  /** page_fix() mode of operation */
+  enum page_fix_conflicts{
+    /** Fetch if in the buffer pool, also blocks marked as free */
+    FIX_ALSO_FREED= -1,
+    /** Fetch, waiting for page read completion */
+    FIX_WAIT_READ,
+    /** Fetch, but avoid any waits (such as for page read completion) */
+    FIX_NOWAIT
+  };
+
  /** Look up and buffer-fix a page.
+  Note: If the page is read-fixed (being read into the buffer pool),
+  we would have to wait for the page latch before determining if the page
+  is accessible (it could be corrupted and have been evicted again).
+  If the caller is holding other page latches so that waiting for this
+  page latch could lead to lock order inversion (latching order violation),
+  the mode c=FIX_WAIT_READ must not be used.
  @param id        page identifier
+  @param err       error code (will only be assigned when returning nullptr)
+  @param c         how to handle conflicts
   @return the page, buffer-fixed
+  @retval -1       if c=FIX_NOWAIT and buffer-fixing would require waiting
   @retval nullptr  if the page was corrupted, or freed and c!=FIX_ALSO_FREED
-  buf_block_t *page_fix(const page_id_t id);
+  buf_block_t *page_fix(const page_id_t id, dberr_t *err,
+                        page_fix_conflicts c);
+
+  buf_block_t *page_fix(const page_id_t id)
+  { return page_fix(id, nullptr, FIX_WAIT_READ); }
+
+
+  /** Decompress a page and relocate the block descriptor
+  @param b      buffer-fixed compressed-only ROW_FORMAT=COMPRESSED page
+  @param chain  hash table chain for b->id().fold()
+  @return the decompressed block, x-latched and read-fixed
+  @retval nullptr if the decompression failed (b->unfix() will be invoked) */
+  ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result))
+  buf_block_t *unzip(buf_page_t *b, hash_chain &chain);

  /** @return whether the buffer pool contains a page
  @param page_id       page identifier
@ -1572,8 +1612,8 @@ public:
  /** map of block->frame to buf_block_t blocks that belong
  to buf_buddy_alloc(); protected by buf_pool.mutex */
  hash_table_t zip_hash;
-	Atomic_counter<ulint>
-			n_pend_unzip;	/*!< number of pending decompressions */
+  /** number of pending unzip() */
+  Atomic_counter<ulint> n_pend_unzip;

 	time_t		last_printout_time;
 					/*!< when buf_print_io was last time
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@ -33,15 +33,14 @@ buffer buf_pool if it is not already there. Sets the io_fix flag and sets
 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
 released by the i/o-handler thread.
@param page_id    page id
-@param zip_size   ROW_FORMAT=COMPRESSED page size, or 0
@param chain      buf_pool.page_hash cell for page_id
-@retval DB_SUCCESS if the page was read and is not corrupted,
+@retval DB_SUCCESS if the page was read and is not corrupted
@retval DB_SUCCESS_LOCKED_REC if the page was not read
@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
 after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
-dberr_t buf_read_page(const page_id_t page_id, ulint zip_size,
+dberr_t buf_read_page(const page_id_t page_id,
                      buf_pool_t::hash_chain &chain);

 /** High-level function which reads a page asynchronously from a file to the
@ -63,9 +62,8 @@ pages: to avoid deadlocks this function must be written such that it cannot
 end up waiting for these latches!
@param[in]	page_id		page id of a page which the current thread
 wants to access
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@return number of page read requests issued */
-ulint buf_read_ahead_random(const page_id_t page_id, ulint zip_size);
+ulint buf_read_ahead_random(const page_id_t page_id);

 /** Applies linear read-ahead if in the buf_pool the page is a border page of
 a linear read-ahead area and all the pages in the area have been accessed.
@ -87,9 +85,8 @@ NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
 function must be written such that it cannot end up waiting for these
 latches!
@param[in]	page_id		page id; see NOTE 3 above
-@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@return number of page read requests issued */
-ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size);
+ulint buf_read_ahead_linear(const page_id_t page_id);

 /** Schedule a page for recovery.
@param space    tablespace
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@ -2178,38 +2178,43 @@ updated then its state must be set to BUF_PAGE_NOT_USED.
@retval DB_SUCCESS or error code. */
 dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
 {
-	/* If we already had an old page with matching number
-	in the buffer pool, evict it now, because
-	we no longer evict the pages on DISCARD TABLESPACE. */
-	buf_page_get_gen(block->page.id(), get_zip_size(), RW_NO_LATCH,
-			 nullptr, BUF_PEEK_IF_IN_POOL,
-			 nullptr, nullptr, nullptr);
+  /* If we already had an old page with matching number in the buffer
+  pool, evict it now, because we no longer evict the pages on
+  DISCARD TABLESPACE. */
+  if (buf_block_t *b= buf_pool.page_fix(block->page.id(), nullptr,
+                                        buf_pool_t::FIX_ALSO_FREED))
+  {
+    ut_ad(!b->page.oldest_modification());
+    mysql_mutex_lock(&buf_pool.mutex);
+    b->unfix();

-	uint16_t page_type;
+    if (!buf_LRU_free_page(&b->page, true))
+      ut_ad(0);

-	if (dberr_t err = update_page(block, page_type)) {
-		return err;
-	}
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }

-	const bool full_crc32 = fil_space_t::full_crc32(get_space_flags());
-	byte* frame = get_frame(block);
-	memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8);
+  uint16_t page_type;

-	if (!block->page.zip.data) {
-		buf_flush_init_for_writing(
-			NULL, block->page.frame, NULL, full_crc32);
-	} else if (fil_page_type_is_index(page_type)) {
-		buf_flush_init_for_writing(
-			NULL, block->page.zip.data, &block->page.zip,
-			full_crc32);
-	} else {
-		/* Calculate and update the checksum of non-index
-		pages for ROW_FORMAT=COMPRESSED tables. */
-		buf_flush_update_zip_checksum(
-			block->page.zip.data, block->zip_size());
-	}
+  if (dberr_t err= update_page(block, page_type))
+    return err;

-	return DB_SUCCESS;
+  const bool full_crc32= fil_space_t::full_crc32(get_space_flags());
+  byte *frame= get_frame(block);
+  memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8);
+
+  if (!block->page.zip.data)
+    buf_flush_init_for_writing(nullptr, block->page.frame, nullptr,
+                               full_crc32);
+  else if (fil_page_type_is_index(page_type))
+    buf_flush_init_for_writing(nullptr, block->page.zip.data, &block->page.zip,
+                               full_crc32);
+  else
+    /* Calculate and update the checksum of non-index
+    pages for ROW_FORMAT=COMPRESSED tables. */
+    buf_flush_update_zip_checksum(block->page.zip.data, block->zip_size());
+
+  return DB_SUCCESS;
 }

 static void reload_fts_table(row_prebuilt_t *prebuilt,
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@ -2157,38 +2157,6 @@ corrupted_rec:
 			mem_heap_empty(row_heap);

 			if (!mtr_started) {
-				goto scan_next;
-			}
-
-			if (clust_index->lock.is_waiting()) {
-				/* There are waiters on the clustered
-				index tree lock, likely the purge
-				thread. Store and restore the cursor
-				position, and yield so that scanning a
-				large table will not starve other
-				threads. */
-
-				/* Store the cursor position on the last user
-				record on the page. */
-				if (!btr_pcur_move_to_prev_on_page(&pcur)) {
-					goto corrupted_index;
-				}
-				/* Leaf pages must never be empty, unless
-				this is the only page in the index tree. */
-				if (!btr_pcur_is_on_user_rec(&pcur)
-				    && btr_pcur_get_block(&pcur)->page.id()
-				    .page_no() != clust_index->page) {
-					goto corrupted_index;
-				}
-
-				btr_pcur_store_position(&pcur, &mtr);
-				mtr.commit();
-				mtr_started = false;
-
-				/* Give the waiters a chance to proceed. */
-				std::this_thread::yield();
-scan_next:
-				ut_ad(!mtr_started);
 				ut_ad(!mtr.is_active());
 				mtr.start();
 				mtr_started = true;
@ -2201,7 +2169,7 @@ scan_next:
 corrupted_index:
 					err = DB_CORRUPTION;
 					goto func_exit;
-                                }
+				}
 				/* Move to the successor of the
 				original record. */
 				if (!btr_pcur_move_to_next_user_rec(
@ -2236,14 +2204,14 @@ end_of_index:

 				buf_page_make_young_if_needed(&block->page);

+				const auto s = mtr.get_savepoint();
+				mtr.rollback_to_savepoint(s - 2, s - 1);
+
 				page_cur_set_before_first(block, cur);
 				if (!page_cur_move_to_next(cur)
 				    || page_cur_is_after_last(cur)) {
 					goto corrupted_rec;
 				}
-
-				const auto s = mtr.get_savepoint();
-				mtr.rollback_to_savepoint(s - 2, s - 1);
 			}
 		} else {
 			mem_heap_empty(row_heap);
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@ -185,7 +185,7 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
    return nullptr;

  if (!buf_page_make_young_if_needed(&block->page))
-    buf_read_ahead_linear(block->page.id(), 0);
+    buf_read_ahead_linear(block->page.id());
  return trx_undo_page_get_last_rec(block, page_no, offset);
 }

@ -242,7 +242,7 @@ trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
 static trx_undo_rec_t*
 trx_undo_get_next_rec_from_next_page(const buf_block_t *&block,
                                     uint32_t page_no, uint16_t offset,
-                                     ulint mode, mtr_t *mtr)
+                                     rw_lock_type_t mode, mtr_t *mtr)
 {
  if (page_no == block->page.id().page_no() &&
      mach_read_from_2(block->page.frame + offset + TRX_UNDO_NEXT_LOG))
@ -272,7 +272,8 @@ trx_undo_get_next_rec_from_next_page(const buf_block_t *&block,
@retval nullptr if none */
 static trx_undo_rec_t*
 trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
-                       uint16_t offset, ulint mode, const buf_block_t*& block,
+                       uint16_t offset, rw_lock_type_t mode,
+                       const buf_block_t *&block,
                       mtr_t *mtr, dberr_t *err)
 {
  buf_block_t *b= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode,
@ -282,7 +283,7 @@ trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
    return nullptr;

  if (!buf_page_make_young_if_needed(&b->page))
-    buf_read_ahead_linear(b->page.id(), 0);
+    buf_read_ahead_linear(b->page.id());

  if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(b, page_no, offset))
    return rec;