/*****************************************************************************

Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2015, 2023, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0cur.cc
The index tree cursor

All changes that row operations make to a B-tree or to the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

			NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve as many pages in the tablespace
as 2 x the height of the index tree before we start the operation,
because if leaf splitting has been started, it is difficult to undo,
except by crashing the database and doing a roll-forward.

Created 10/16/1994 Heikki Tuuri
*******************************************************/
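
/* A worked example of the reservation rule above: for an index tree of
height 3, a pessimistic insert or update should be preceded by reserving
2 * 3 = 6 pages in the tablespace.  In the worst case the operation may
have to split every page on the path from the root to the leaf, and
each split allocates a new page; the factor 2 leaves headroom for the
accompanying allocations.  If the reservation fails, the operation can
be refused before any leaf split has started, so the difficult undo
described above is never needed. */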

#include "btr0cur.h"
#include "row0upd.h"
#include "mtr0log.h"
#include "page0page.h"
#include "page0zip.h"
#include "rem0rec.h"
#include "rem0cmp.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "row0log.h"
#include "row0purge.h"
#include "trx0rec.h"
#include "trx0roll.h"
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
#include "lock0lock.h"
#include "zlib.h"
#include "srv0start.h"
#include "mysql_com.h"
#include "dict0stats.h"
#include "row0ins.h"
#ifdef WITH_WSREP
#include "mysql/service_wsrep.h"
#endif /* WITH_WSREP */
#include "log.h"

/** Modification types for the B-tree operation.
    Note that the order must be DELETE, BOTH, INSERT !!
 */
enum btr_intention_t {
	BTR_INTENTION_DELETE,
	BTR_INTENTION_BOTH,
	BTR_INTENTION_INSERT
};

/** For the index->lock scalability improvement, the only clear
performance regression that was observed was caused by a hugely grown
history list. That is because the previous exclusive use of index->lock
also had the effect of reserving free blocks and read I/O bandwidth
with priority. To keep the history list from growing as huge as with
the previous implementation, we prioritize pessimistic tree operations
issued by purge, as before, when the history list appears to be growing
too large.

Experimentally, the history list length starts to clearly affect
performance throughput from about 100000. */
#define BTR_CUR_FINE_HISTORY_LENGTH	100000
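
/* A sketch of how this threshold is consulted; compare the
BTR_MODIFY_TREE case in btr_cur_t::search_leaf() below:

	if (trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) {
		// prioritize pessimistic tree operations issued by purge
	}
*/
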
#ifdef BTR_CUR_HASH_ADAPT
/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */
ib_counter_t<ulint, ib_counter_element_t>	btr_cur_n_non_sea;
/** Old value of btr_cur_n_non_sea.  Copied by
srv_refresh_innodb_monitor_stats().  Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_non_sea_old;
/** Number of successful adaptive hash index lookups in
btr_cur_t::search_leaf(). */
ib_counter_t<ulint, ib_counter_element_t>	btr_cur_n_sea;
/** Old value of btr_cur_n_sea.  Copied by
srv_refresh_innodb_monitor_stats().  Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_sea_old;
#endif /* BTR_CUR_HASH_ADAPT */

#ifdef UNIV_DEBUG
/* Flag to limit optimistic insert records */
uint	btr_cur_limit_optimistic_insert_debug;
#endif /* UNIV_DEBUG */

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganize, then it is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(srv_page_size / 32)

/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */

/* @} */
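
/* Layout of a BLOB part page, as implied by the offsets above
(FIL_PAGE_DATA marks the start of the page payload):

	FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN      4 bytes: BLOB part length
	FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO  4 bytes: next part page,
	                                           or FIL_NULL if none
	FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE          first byte of BLOB data */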

/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	buf_block_t*	block,	/*!< in/out: index page */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	buf_block_t*	block,	/*!< in: index page of rec */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
	buf_block_t*	block,	/*!< in: index page of rec */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */

/*==================== B-TREE SEARCH =========================*/

/** Load the instant ALTER TABLE metadata from the clustered index
when loading a table definition.
@param[in,out]	index	clustered index definition
@param[in,out]	mtr	mini-transaction
@return	error code
@retval	DB_SUCCESS	if no error occurred
@retval	DB_CORRUPTION	if any corruption was noticed */
static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
{
	ut_ad(index->is_primary());
	ut_ad(index->table->is_readable());

	if (!index->table->supports_instant()) {
		return DB_SUCCESS;
	}

	ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);

	dberr_t err;
	const fil_space_t* space = index->table->space;
	if (!space) {
corrupted:
		err = DB_CORRUPTION;
unreadable:
		ib::error() << "Table " << index->table->name
			    << " has an unreadable root page";
		index->table->corrupted = true;
		index->table->file_unreadable = true;
		return err;
	}

	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err);
	if (!root) {
		goto unreadable;
	}

	if (btr_cur_instant_root_init(index, root->page.frame)) {
		goto corrupted;
	}

	ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);

	if (fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX) {
		ut_ad(!index->is_instant());
		return DB_SUCCESS;
	}

	btr_cur_t cur;
	/* Relax the assertion in rec_init_offsets(). */
	ut_ad(!index->in_instant_init);
	ut_d(index->in_instant_init = true);
	err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr);
	ut_d(index->in_instant_init = false);
	if (err != DB_SUCCESS) {
		index->table->file_unreadable = true;
		index->table->corrupted = true;
		return err;
	}

	ut_ad(page_cur_is_before_first(&cur.page_cur));
	ut_ad(page_is_leaf(btr_cur_get_page(&cur)));

	const rec_t* rec = page_cur_move_to_next(&cur.page_cur);
	const ulint comp = dict_table_is_comp(index->table);
	const ulint info_bits = rec ? rec_get_info_bits(rec, comp) : 0;

	if (page_rec_is_supremum(rec)
	    || !(info_bits & REC_INFO_MIN_REC_FLAG)) {
		if (rec && !index->is_instant()) {
			/* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be
			assigned even if instant ADD COLUMN was not
			committed. Changes to these page header fields are not
			undo-logged, but changes to the hidden metadata record
			are. If the server is killed and restarted, the page
			header fields could remain set even though no metadata
			record is present. */
			return DB_SUCCESS;
		}

		ib::error() << "Table " << index->table->name
			    << " is missing instant ALTER metadata";
		index->table->corrupted = true;
		return DB_CORRUPTION;
	}

	if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
	    || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
incompatible:
		ib::error() << "Table " << index->table->name
			<< " contains unrecognizable instant ALTER metadata";
		index->table->corrupted = true;
		return DB_CORRUPTION;
	}

	/* Read the metadata. We can get here on server restart
	or when the table was evicted from the data dictionary cache
	and is now being accessed again.

	Here, READ COMMITTED and REPEATABLE READ should be equivalent.
	Committing the ADD COLUMN operation would acquire
	MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any
	concurrent operations on the table, including table eviction
	from the cache. */

	if (info_bits & REC_INFO_DELETED_FLAG) {
		/* This metadata record includes a BLOB that identifies
		any dropped or reordered columns. */
		ulint trx_id_offset = index->trx_id_offset;
		/* If !index->trx_id_offset, the PRIMARY KEY contains
		variable-length columns. For the metadata record,
		variable-length columns should be written with zero
		length. However, before MDEV-21088 was fixed, for
		variable-length encoded PRIMARY KEY column of type
		CHAR, we wrote more than zero bytes. That is why we
		must determine the actual length of each PRIMARY KEY
		column.  The DB_TRX_ID will start right after any
		PRIMARY KEY columns. */
		ut_ad(index->n_uniq);

		/* We cannot invoke rec_get_offsets() before
		index->table->deserialise_columns(). Therefore,
		we must duplicate some logic here. */
		if (trx_id_offset) {
		} else if (index->table->not_redundant()) {
			/* The PRIMARY KEY contains variable-length columns.
			For the metadata record, variable-length columns are
			always written with zero length. The DB_TRX_ID will
			start right after any fixed-length columns. */

			/* OK, before MDEV-21088 was fixed, for
			variable-length encoded PRIMARY KEY column of
			type CHAR, we wrote more than zero bytes. In
			order to allow affected tables to be accessed,
			it would be nice to determine the actual
			length of each PRIMARY KEY column. However, to
			be able to do that, we should determine the
			size of the null-bit bitmap in the metadata
			record. And we cannot know that before reading
			the metadata BLOB, whose starting point we are
			trying to find here. (Although the PRIMARY KEY
			columns cannot be NULL, we would have to know
			where the lengths of variable-length PRIMARY KEY
			columns start.)

			So, unfortunately we cannot help users who
			were affected by MDEV-21088 on a ROW_FORMAT=COMPACT
			or ROW_FORMAT=DYNAMIC table. */

			for (uint i = index->n_uniq; i--; ) {
				trx_id_offset += index->fields[i].fixed_len;
			}
		} else if (rec_get_1byte_offs_flag(rec)) {
			trx_id_offset = rec_1_get_field_end_info(
				rec, index->n_uniq - 1);
			ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK));
			trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK;
		} else {
			trx_id_offset = rec_2_get_field_end_info(
				rec, index->n_uniq - 1);
			ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK));
			trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK;
		}

		const byte* ptr = rec + trx_id_offset
			+ (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);

		if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) {
			goto incompatible;
		}

		uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
		if (!len
		    || mach_read_from_4(ptr + BTR_EXTERN_OFFSET)
		    != FIL_PAGE_DATA
		    || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
		    != space->id) {
			goto incompatible;
		}

		buf_block_t* block = buf_page_get(
			page_id_t(space->id,
				  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
			0, RW_S_LATCH, mtr);
		if (!block) {
			goto incompatible;
		}

		btr_search_drop_page_hash_index(block, index);

		if (fil_page_get_type(block->page.frame) != FIL_PAGE_TYPE_BLOB
		    || mach_read_from_4(&block->page.frame
					[FIL_PAGE_DATA
					 + BTR_BLOB_HDR_NEXT_PAGE_NO])
		    != FIL_NULL
		    || mach_read_from_4(&block->page.frame
					[FIL_PAGE_DATA
					 + BTR_BLOB_HDR_PART_LEN])
		    != len) {
			goto incompatible;
		}

		/* The unused part of the BLOB page should be zero-filled. */
		for (const byte* b = block->page.frame
		       + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
		       * const end = block->page.frame + srv_page_size
		       - BTR_EXTERN_LEN;
		     b < end; ) {
			if (*b++) {
				goto incompatible;
			}
		}

		if (index->table->deserialise_columns(
			    &block->page.frame
			    [FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) {
			goto incompatible;
		}

		/* Proceed to initialize the default values of
		any instantly added columns. */
	}

	mem_heap_t* heap = NULL;
	rec_offs* offsets = rec_get_offsets(rec, index, NULL,
					    index->n_core_fields,
					    ULINT_UNDEFINED, &heap);
	if (rec_offs_any_default(offsets)) {
inconsistent:
		mem_heap_free(heap);
		goto incompatible;
	}

	/* In fact, because we only ever append fields to the metadata
	record, it is also OK to perform READ UNCOMMITTED and
	then ignore any extra fields, provided that
	trx_sys.is_registered(DB_TRX_ID). */
	if (rec_offs_n_fields(offsets)
	    > ulint(index->n_fields) + !!index->table->instant
	    && !trx_sys.is_registered(current_trx(),
				      row_get_rec_trx_id(rec, index,
							 offsets))) {
		goto inconsistent;
	}

	for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
		dict_col_t* col = index->fields[i].col;
		const unsigned o = i + !!index->table->instant;
		ulint len;
		const byte* data = rec_get_nth_field(rec, offsets, o, &len);
		ut_ad(!col->is_added());
		ut_ad(!col->def_val.data);
		col->def_val.len = len;
		switch (len) {
		case UNIV_SQL_NULL:
			continue;
		case 0:
			col->def_val.data = field_ref_zero;
			continue;
		}
		ut_ad(len != UNIV_SQL_DEFAULT);
		if (!rec_offs_nth_extern(offsets, o)) {
			col->def_val.data = mem_heap_dup(
				index->table->heap, data, len);
		} else if (len < BTR_EXTERN_FIELD_REF_SIZE
			   || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
				      field_ref_zero,
				      BTR_EXTERN_FIELD_REF_SIZE)) {
			col->def_val.len = UNIV_SQL_DEFAULT;
			goto inconsistent;
		} else {
			col->def_val.data = btr_copy_externally_stored_field(
				&col->def_val.len, data,
				cur.page_cur.block->zip_size(),
				len, index->table->heap);
		}
	}

	mem_heap_free(heap);
	return DB_SUCCESS;
}

/** Load the instant ALTER TABLE metadata from the clustered index
when loading a table definition.
@param[in,out]	table	table definition from the data dictionary
@return	error code
@retval	DB_SUCCESS	if no error occurred */
dberr_t btr_cur_instant_init(dict_table_t *table)
{
  mtr_t mtr;
  dict_index_t *index= dict_table_get_first_index(table);
  mtr.start();
  dberr_t err = index ? btr_cur_instant_init_low(index, &mtr) : DB_CORRUPTION;
  mtr.commit();
  if (err == DB_SUCCESS && index->is_gen_clust())
  {
    btr_cur_t cur;
    mtr.start();
    err= cur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr);
    if (err != DB_SUCCESS);
    else if (const rec_t *rec= page_rec_get_prev(btr_cur_get_rec(&cur)))
      if (page_rec_is_user_rec(rec))
        table->row_id= mach_read_from_6(rec);
    mtr.commit();
  }
  return(err);
}

/** Initialize n_core_null_bytes on the first access to a clustered
index root page.
@param[in]	index	clustered index that is on its first access
@param[in]	page	clustered index root page
@return	whether the page is corrupted */
bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
{
	ut_ad(!index->is_dummy);
	ut_ad(index->is_primary());
	ut_ad(!index->is_instant());
	ut_ad(index->table->supports_instant());

	if (page_has_siblings(page)) {
		return true;
	}

	/* This is normally executed as part of btr_cur_instant_init()
	when dict_load_table_one() is loading a table definition.
	Other threads should not access or modify n_core_null_bytes or
	n_core_fields before dict_load_table_one() returns.

	This can also be executed during IMPORT TABLESPACE, where the
	table definition is exclusively locked. */

	switch (fil_page_get_type(page)) {
	default:
		return true;
	case FIL_PAGE_INDEX:
		/* The field PAGE_INSTANT is guaranteed 0 on clustered
		index root pages of ROW_FORMAT=COMPACT or
		ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
		if (page_is_comp(page) && page_get_instant(page)) {
			return true;
		}
		index->n_core_null_bytes = static_cast<uint8_t>(
			UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
		return false;
	case FIL_PAGE_TYPE_INSTANT:
		break;
	}

	const uint16_t n = page_get_instant(page);

	if (n < index->n_uniq + DATA_ROLL_PTR) {
		/* The PRIMARY KEY (or hidden DB_ROW_ID) and
		DB_TRX_ID,DB_ROLL_PTR columns must always be present
		as 'core' fields. */
		return true;
	}

	if (n > REC_MAX_N_FIELDS) {
		return true;
	}

	index->n_core_fields = n & dict_index_t::MAX_N_FIELDS;

	const rec_t* infimum = page_get_infimum_rec(page);
	const rec_t* supremum = page_get_supremum_rec(page);

	if (!memcmp(infimum, "infimum", 8)
	    && !memcmp(supremum, "supremum", 8)) {
		if (n > index->n_fields) {
			/* All fields, including those for instantly
			added columns, must be present in the
			data dictionary. */
			return true;
		}

		ut_ad(!index->is_dummy);
		ut_d(index->is_dummy = true);
		index->n_core_null_bytes = static_cast<uint8_t>(
			UT_BITS_IN_BYTES(index->get_n_nullable(n)));
		ut_d(index->is_dummy = false);
		return false;
	}

	if (memcmp(infimum, field_ref_zero, 8)
	    || memcmp(supremum, field_ref_zero, 7)) {
		/* The infimum and supremum records must either contain
		the original strings, or they must be filled with zero
		bytes, except for the bytes that we have repurposed. */
		return true;
	}

	index->n_core_null_bytes = supremum[7];
	return index->n_core_null_bytes > 128;
}
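
/* Example of the repurposed bytes checked above: on a valid
FIL_PAGE_TYPE_INSTANT root page, the infimum and supremum either still
carry the canonical "infimum\0" and "supremum" strings, or they have
been zero-filled except for supremum[7], which then stores
n_core_null_bytes.  A value above 128 is rejected, because 128 bytes of
null flags already cover 1024 nullable fields. */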

/**
Gets the intention as btr_intention_t from latch_mode, and clears the
intention flags in latch_mode.
@param latch_mode	in/out: pointer to latch_mode
@return intention for latching the tree */
static
btr_intention_t btr_cur_get_and_clear_intention(btr_latch_mode *latch_mode)
{
	btr_intention_t	intention;

	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
	case BTR_LATCH_FOR_INSERT:
		intention = BTR_INTENTION_INSERT;
		break;
	case BTR_LATCH_FOR_DELETE:
		intention = BTR_INTENTION_DELETE;
		break;
	default:
		/* both or unknown */
		intention = BTR_INTENTION_BOTH;
	}
	*latch_mode = btr_latch_mode(
		*latch_mode & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));

	return(intention);
}
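
/* For example, a caller passing a latch_mode of
BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE obtains BTR_INTENTION_DELETE,
and *latch_mode is reduced to plain BTR_MODIFY_TREE for the rest of
the descent. */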

/** @return whether the distance between two records is at most the
specified value */
template<bool comp>
static bool
page_rec_distance_is_at_most(const page_t *page, const rec_t *left,
                             const rec_t *right, ulint val)
  noexcept
{
  do
  {
    if (left == right)
      return true;
    left= page_rec_next_get<comp>(page, left);
  }
  while (left && val--);
  return false;
}
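
/* Usage sketch: btr_cur_will_modify_tree() below uses this predicate to
check whether the cursor record is within max_nodes_deleted records of
either end of the page, for example:

	page_rec_distance_is_at_most<true>(page, infimum, rec,
					   max_nodes_deleted)
*/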

/** Detects whether modifying the given record might require modifying
the tree structure.
@param[in]	index		index
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@param[in]	rec_size	size of the record or max size of node_ptr
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	mtr		mtr
@return true if tree modification is needed */
static
bool
btr_cur_will_modify_tree(
	dict_index_t*	index,
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec,
	ulint		rec_size,
	ulint		zip_size,
	mtr_t*		mtr)
{
	ut_ad(!page_is_leaf(page));
	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
					 | MTR_MEMO_SX_LOCK));

	/* A pessimistic delete of the first record causes a delete &
	insert of the node_ptr at the upper level, and a subsequent
	page shrink is possible, which causes a delete of the node_ptr
	at the upper level. So we should pay attention not only to the
	first and last records but also to the 2nd record, because if
	the "delete & insert" is done for a different page, the 2nd
	record becomes the first record, and a following compress might
	delete that record, causing an upper-level node_ptr
	modification. */

	const ulint n_recs = page_get_n_recs(page);

	if (lock_intention <= BTR_INTENTION_BOTH) {
		compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH);
		compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT);

		if (!page_has_siblings(page)) {
			return true;
		}

		ulint margin = rec_size;

		if (lock_intention == BTR_INTENTION_BOTH) {
			ulint	level = btr_page_get_level(page);

			/* This value is the worst-case expectation of
			how many node_ptr records could be deleted from
			this page. It is used to estimate whether the
			cursor position can become the leftmost record
			in this page. */
			ulint   max_nodes_deleted = 0;

			/* Tree-modifying operations from below this
			level can logically cause at most
			(2 ^ (level - 1)) record deletions on this
			level, even in the rarest worst case. */
			if (level > 7) {
				/* TODO: adjust this practical limit. */
				max_nodes_deleted = 64;
			} else if (level > 0) {
				max_nodes_deleted = (ulint)1 << (level - 1);
			}
			/* Check what a delete would cause
			(BTR_INTENTION_BOTH or BTR_INTENTION_DELETE). */
			if (n_recs <= max_nodes_deleted * 2) {
				/* The cursor record can be the leftmost
				record in this page. */
				return true;
			}

			if (page_is_comp(page)) {
				const rec_t *const infimum
					= page + PAGE_NEW_INFIMUM;
				if (page_rec_next_get<true>(page, infimum)
				    == rec) {
					return true;
				}
				if (page_has_prev(page)
				    && page_rec_distance_is_at_most<true>(
					    page, infimum, rec,
					    max_nodes_deleted)) {
					return true;
				}
				if (page_has_next(page)
				    && page_rec_distance_is_at_most<true>(
					    page, rec,
					    page + PAGE_NEW_SUPREMUM,
					    max_nodes_deleted)) {
					return true;
				}
			} else {
				const rec_t *const infimum
					= page + PAGE_OLD_INFIMUM;
				if (page_rec_next_get<false>(page, infimum)
				    == rec) {
					return true;
				}
				if (page_has_prev(page)
				    && page_rec_distance_is_at_most<false>(
					    page, infimum, rec,
					    max_nodes_deleted)) {
					return true;
				}
				if (page_has_next(page)
				    && page_rec_distance_is_at_most<false>(
					    page, rec,
					    page + PAGE_OLD_SUPREMUM,
					    max_nodes_deleted)) {
					return true;
				}
			}

			/* A delete of the leftmost record in a page
			causes a delete & insert at its parent page.
			After that, the delete might cause btr_compress()
			and a record delete at its parent page. Thus we
			should consider the maximum number of deletes. */
			margin *= max_nodes_deleted;
		}

		/* Safe because we already have the SX latch of the
		index tree */
		if (page_get_data_size(page)
		    < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) {
			return(true);
		}
	}

	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* Check what an insert would cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT). */

		/* Once btr_cur_limit_optimistic_insert_debug is in
		effect, we should check it here in advance, since the
		maximum allowed number of records in a page is
		limited. */
		LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true);

		/* We need space for 2 records, for the case where a
		single split and insert cannot fit.
		page_get_max_insert_size_after_reorganize() includes
		space for the page directory already. */
		ulint	max_size
			= page_get_max_insert_size_after_reorganize(page, 2);

		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
		    || max_size < rec_size * 2) {
			return(true);
		}

		/* TODO: optimize this condition for ROW_FORMAT=COMPRESSED.
		This is based on the worst case, and we could invoke
		page_zip_available() on the block->page.zip. */
		/* We need space for 2 records also at the worst
		compression rate. */
		if (zip_size
		    && page_zip_empty_size(index->n_fields, zip_size)
		    <= rec_size * 2 + page_get_data_size(page)
		    + page_dir_calc_reserved_space(n_recs + 2)) {
			return(true);
		}
	}

	return(false);
}
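
/* Worked example of the estimate above: for a node pointer page at
level 3, max_nodes_deleted = 1 << (3 - 1) = 4.  A page holding at most
4 * 2 = 8 records, or a cursor positioned within 4 records of the page
infimum or supremum, is then treated as potentially modifying the
tree. */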

/** Detects whether modifying the given record might require a
modification opposite to the intention.
@param bpage             buffer pool page
@param is_clust          whether this is a clustered index
@param lock_intention    lock intention for the tree operation
@param node_ptr_max_size the maximum size of a node pointer
@param compress_limit    BTR_CUR_PAGE_COMPRESS_LIMIT(index)
@param rec               record (current node_ptr)
@return true if tree modification is needed */
static bool btr_cur_need_opposite_intention(const buf_page_t &bpage,
                                            bool is_clust,
                                            btr_intention_t lock_intention,
                                            ulint node_ptr_max_size,
                                            ulint compress_limit,
                                            const rec_t *rec)
{
  ut_ad(bpage.frame == page_align(rec));
  if (UNIV_LIKELY_NULL(bpage.zip.data) &&
      !page_zip_available(&bpage.zip, is_clust, node_ptr_max_size, 1))
    return true;
  const page_t *const page= bpage.frame;
  if (lock_intention != BTR_INTENTION_INSERT)
  {
    /* We compensate also for btr_cur_compress_recommendation() */
    if (!page_has_siblings(page) ||
        page_rec_is_first(rec, page) || page_rec_is_last(rec, page) ||
        page_get_data_size(page) < node_ptr_max_size + compress_limit)
      return true;
    if (lock_intention == BTR_INTENTION_DELETE)
      return false;
  }
  else if (page_has_next(page) && page_rec_is_last(rec, page))
    return true;
  LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true);
  const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2);
  return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size ||
    max_size < node_ptr_max_size * 2;
}

/**
@param[in]	index	B-tree index
@return maximum size of a node pointer record in bytes */
static ulint btr_node_ptr_max_size(const dict_index_t* index)
{
	/* Each record has page_no, length of page_no and header. */
	ulint comp = dict_table_is_comp(index->table);
	ulint rec_max_size = comp
		? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES
		+ UT_BITS_IN_BYTES(index->n_nullable)
		: REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES
		+ 2 * index->n_fields;

	/* Compute the maximum possible record size. */
	for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) {
		const dict_field_t*	field
			= dict_index_get_nth_field(index, i);
		const dict_col_t*	col
			= dict_field_get_col(field);
		ulint			field_max_size;
		ulint			field_ext_max_size;

		/* Determine the maximum length of the index field. */

		field_max_size = dict_col_get_fixed_size(col, comp);
		if (field_max_size && field->fixed_len) {
			/* dict_index_add_col() should guarantee this */
			ut_ad(!field->prefix_len
			      || field->fixed_len == field->prefix_len);
			/* Fixed lengths are not encoded
			in ROW_FORMAT=COMPACT. */
			rec_max_size += field_max_size;
			continue;
		}

		field_max_size = dict_col_get_max_size(col);
		if (UNIV_UNLIKELY(!field_max_size)) {
			switch (col->mtype) {
			case DATA_VARCHAR:
				if (!comp
				    && (!strcmp(index->table->name.m_name,
						"SYS_FOREIGN")
					|| !strcmp(index->table->name.m_name,
						   "SYS_FOREIGN_COLS"))) {
					break;
				}
				/* fall through */
			case DATA_FIXBINARY:
			case DATA_BINARY:
			case DATA_VARMYSQL:
			case DATA_CHAR:
			case DATA_MYSQL:
				/* BINARY(0), VARBINARY(0),
				CHAR(0) and VARCHAR(0) are possible
				data type definitions in MariaDB.
				The InnoDB internal SQL parser maps
				CHAR to DATA_VARCHAR, so DATA_CHAR (or
				DATA_MYSQL) is only coming from the
				MariaDB SQL layer. */
				if (comp) {
					/* Add a length byte, because
					fixed-length empty fields are
					encoded as variable-length.
					For ROW_FORMAT=REDUNDANT,
					these bytes were added to
					rec_max_size before this loop. */
					rec_max_size++;
				}
				continue;
			}

			/* SYS_FOREIGN.ID is defined as CHAR in the
			InnoDB internal SQL parser, which translates
			into the incorrect VARCHAR(0).  InnoDB does
			not enforce maximum lengths of columns, so
			that is why any data can be inserted in the
			first place.

			Likewise, SYS_FOREIGN.FOR_NAME,
			SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are
			defined as CHAR, and also they are part of a key. */

			ut_ad(!strcmp(index->table->name.m_name,
				      "SYS_FOREIGN")
			      || !strcmp(index->table->name.m_name,
					 "SYS_FOREIGN_COLS"));
			ut_ad(!comp);
			ut_ad(col->mtype == DATA_VARCHAR);

			rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX)
				? REDUNDANT_REC_MAX_DATA_SIZE
				: page_get_free_space_of_empty(FALSE) / 2;
		} else if (field_max_size == NAME_LEN && i == 1
			   && (!strcmp(index->table->name.m_name,
				       TABLE_STATS_NAME)
			       || !strcmp(index->table->name.m_name,
					  INDEX_STATS_NAME))) {
			/* Interpret "table_name" as VARCHAR(199) even
			if it was incorrectly defined as VARCHAR(64).
			While the caller of ha_innobase enforces the
			maximum length on any data written, the InnoDB
			internal SQL parser will happily write as much
			data as is provided. The purpose of this hack
			is to avoid InnoDB hangs after persistent
			statistics on partitioned tables are
			deleted. */
			field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN;
		}
		field_ext_max_size = field_max_size < 256 ? 1 : 2;

		if (field->prefix_len
		    && field->prefix_len < field_max_size) {
			field_max_size = field->prefix_len;
		}

		if (comp) {
			/* Add the extra size for ROW_FORMAT=COMPACT.
			For ROW_FORMAT=REDUNDANT, these bytes were
			added to rec_max_size before this loop. */
			rec_max_size += field_ext_max_size;
		}

		rec_max_size += field_max_size;
	}

	return rec_max_size;
}
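
/* Worked example, assuming a ROW_FORMAT=COMPACT index whose node
pointer key is a single NOT NULL 4-byte INT column: the base size is
REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES with no null-flag bytes,
and the fixed-length key adds 4 bytes without any length byte, because
fixed lengths are not encoded in ROW_FORMAT=COMPACT.  With the usual
constants REC_NODE_PTR_SIZE = 4 and REC_N_NEW_EXTRA_BYTES = 5, the
estimate is 4 + 1 + 5 + 4 = 14 bytes. */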

/** @return a B-tree search mode suitable for non-leaf pages
@param mode  leaf page search mode */
static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode)
{
  if (mode > PAGE_CUR_GE)
  {
    ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
    return mode;
  }
  if (mode == PAGE_CUR_GE)
    return PAGE_CUR_L;
  ut_ad(mode == PAGE_CUR_G);
  return PAGE_CUR_LE;
}
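
/* The complete mapping implemented above:

	leaf search mode    non-leaf search mode
	PAGE_CUR_G      ->  PAGE_CUR_LE
	PAGE_CUR_GE     ->  PAGE_CUR_L
	PAGE_CUR_L      ->  PAGE_CUR_L   (unchanged)
	PAGE_CUR_LE     ->  PAGE_CUR_LE  (unchanged)

On a non-leaf level the search must descend to the child whose node
pointer key does not exceed the search tuple, hence the G/GE modes are
turned into their "less than" counterparts. */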

MY_ATTRIBUTE((nonnull,warn_unused_result))
/** Acquire a latch on the previous page without violating the latching order.
@param rw_latch the latch on block (RW_S_LATCH or RW_X_LATCH)
@param page_id  page identifier with valid space identifier
@param err      error code
@param mtr      mini-transaction
@retval 0  if an error occurred
@retval 1  if the page could be latched in the wrong order
@retval -1 if the latch on block was temporarily released */
static int btr_latch_prev(rw_lock_type_t rw_latch,
                          page_id_t page_id, dberr_t *err, mtr_t *mtr)
{
  ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);

  buf_block_t *block= mtr->at_savepoint(mtr->get_savepoint() - 1);

  ut_ad(page_id.space() == block->page.id().space());

  const page_t *const page= block->page.frame;
  page_id.set_page_no(btr_page_get_prev(page));
  /* We are holding a latch on the current page.

  We will start by buffer-fixing the left sibling. Waiting for a latch
  on it while holding a latch on the current page could lead to a
  deadlock, because another thread could hold that latch and wait for
  a right sibling page latch (the current page).

  If there is a conflict, we will temporarily release our latch on the
  current block while waiting for a latch on the left sibling.  The
  buffer-fixes on both blocks will prevent eviction. */

 retry:
  int ret= 1;
  buf_block_t *prev= buf_pool.page_fix(page_id, err, buf_pool_t::FIX_NOWAIT);
  if (UNIV_UNLIKELY(!prev))
    return 0;
  if (prev == reinterpret_cast<buf_block_t*>(-1))
  {
    /* The block existed in buf_pool.page_hash, but not in a state that is
    safe to access without waiting for some pending operation, such as
    buf_page_t::read_complete() or buf_pool_t::unzip().

    Retry while temporarily releasing the successor block->page.lock
    (but retaining a buffer-fix so that the block cannot be evicted). */

    if (rw_latch == RW_S_LATCH)
      block->page.lock.s_unlock();
    else
      block->page.lock.x_unlock();

    prev= buf_pool.page_fix(page_id, err, buf_pool_t::FIX_WAIT_READ);

    if (!prev)
    {
      ut_ad(*err != DB_SUCCESS);
      if (rw_latch == RW_S_LATCH)
        block->page.lock.s_lock();
      else
        block->page.lock.x_lock();
      return 0;
    }
    else if (rw_latch == RW_S_LATCH)
      goto wait_for_s;
    else
      goto wait_for_x;
  }

  static_assert(MTR_MEMO_PAGE_S_FIX == mtr_memo_type_t(BTR_SEARCH_LEAF), "");
  static_assert(MTR_MEMO_PAGE_X_FIX == mtr_memo_type_t(BTR_MODIFY_LEAF), "");

  if (rw_latch == RW_S_LATCH
      ? prev->page.lock.s_lock_try()
      : prev->page.lock.x_lock_try())
    mtr->memo_push(prev, mtr_memo_type_t(rw_latch));
  else
  {
    if (rw_latch == RW_S_LATCH)
    {
      block->page.lock.s_unlock();
    wait_for_s:
      prev->page.lock.s_lock();
      block->page.lock.s_lock();
    }
    else
    {
      block->page.lock.x_unlock();
    wait_for_x:
      prev->page.lock.x_lock();
      block->page.lock.x_lock();
    }

    ut_ad(block == mtr->at_savepoint(mtr->get_savepoint() - 1));
    mtr->memo_push(prev, mtr_memo_type_t(rw_latch));
    const page_id_t prev_page_id= page_id;
    page_id.set_page_no(btr_page_get_prev(page));
    ret= -1;

    if (UNIV_UNLIKELY(page_id != prev_page_id))
    {
      mtr->release_last_page();
      if (page_id.page_no() == FIL_NULL)
        return ret;
      goto retry;
    }
  }

  const page_t *const p= prev->page.frame;
  if (memcmp_aligned<4>(FIL_PAGE_NEXT + p, FIL_PAGE_OFFSET + page, 4) ||
      memcmp_aligned<2>(FIL_PAGE_TYPE + p, FIL_PAGE_TYPE + page, 2) ||
      memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + p,
                        PAGE_HEADER + PAGE_INDEX_ID + page, 8) ||
      page_is_comp(p) != page_is_comp(page))
  {
    ut_ad("corrupted" == 0); // FIXME: remove this
    *err= DB_CORRUPTION;
    ret= 0;
  }

  return ret;
}
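
/* Deadlock-avoidance sketch for the latching order above: suppose
thread A holds a latch on page P and wants the left sibling Q, while
thread B holds Q and waits for P.  If A simply waited for Q while still
holding P, neither thread could proceed.  Instead, the code first
attempts a non-blocking lock_try() on Q; only on conflict does it
release the latch on P (keeping the buffer-fix), wait for Q, re-latch
P, and then re-validate via FIL_PAGE_NEXT that Q is still the left
sibling of P. */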
 | 
						|
 | 
						|
dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
 | 
						|
                               btr_latch_mode latch_mode, mtr_t *mtr)
 | 
						|
{
 | 
						|
  ut_ad(index()->is_btree());
 | 
						|
 | 
						|
  buf_block_t *guess;
 | 
						|
  btr_intention_t lock_intention;
 | 
						|
  bool detected_same_key_root= false;
 | 
						|
 | 
						|
  mem_heap_t *heap= nullptr;
 | 
						|
  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
 | 
						|
  rec_offs *offsets= offsets_;
 | 
						|
  rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
 | 
						|
  rec_offs *offsets2= offsets2_;
 | 
						|
  rec_offs_init(offsets_);
 | 
						|
  rec_offs_init(offsets2_);
 | 
						|
 | 
						|
  ut_ad(dict_index_check_search_tuple(index(), tuple));
 | 
						|
  ut_ad(dtuple_check_typed(tuple));
 | 
						|
  ut_ad(index()->page != FIL_NULL);
 | 
						|
 | 
						|
  MEM_UNDEFINED(&up_match, sizeof up_match);
 | 
						|
  MEM_UNDEFINED(&up_bytes, sizeof up_bytes);
 | 
						|
  MEM_UNDEFINED(&low_match, sizeof low_match);
 | 
						|
  MEM_UNDEFINED(&low_bytes, sizeof low_bytes);
 | 
						|
  ut_d(up_match= low_match= uint16_t(~0u));
 | 
						|
 | 
						|
  ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED) ||
 | 
						|
        mtr->memo_contains_flagged(&index()->lock,
 | 
						|
                                   MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK |
 | 
						|
                                   MTR_MEMO_X_LOCK));
 | 
						|
 | 
						|
  const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
 | 
						|
  lock_intention= btr_cur_get_and_clear_intention(&latch_mode);
 | 
						|
  latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
 | 
						|
 | 
						|
  ut_ad(!latch_by_caller
 | 
						|
        || latch_mode == BTR_SEARCH_LEAF
 | 
						|
        || latch_mode == BTR_MODIFY_LEAF
 | 
						|
        || latch_mode == BTR_MODIFY_TREE
 | 
						|
        || latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
 | 
						|
 | 
						|
#ifndef BTR_CUR_ADAPT
 | 
						|
  guess= nullptr;
 | 
						|
#else
 | 
						|
  auto info= &index()->search_info;
 | 
						|
  guess= info->root_guess;
 | 
						|
 | 
						|
# ifdef BTR_CUR_HASH_ADAPT
 | 
						|
  flag= BTR_CUR_BINARY;
 | 
						|
#  ifdef UNIV_SEARCH_PERF_STAT
 | 
						|
  info->n_searches++;
 | 
						|
#  endif
 | 
						|
  if (latch_mode > BTR_MODIFY_LEAF)
 | 
						|
    /* The adaptive hash index cannot be useful for these searches. */;
 | 
						|
  else if (mode != PAGE_CUR_LE && mode != PAGE_CUR_GE)
 | 
						|
    ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_G);
 | 
						|
  /* We do a dirty read of btr_search.enabled below,
 | 
						|
  and btr_search_guess_on_hash() will have to check it again. */
 | 
						|
  else if (!btr_search.enabled);
 | 
						|
  else if (btr_search_guess_on_hash(index(), tuple, mode != PAGE_CUR_LE,
 | 
						|
                                    latch_mode, this, mtr))
 | 
						|
  {
 | 
						|
    /* Search using the hash index succeeded */
 | 
						|
    ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_GE);
 | 
						|
    ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_LE);
 | 
						|
    ut_ad(low_match != uint16_t(~0U) || mode != PAGE_CUR_LE);
 | 
						|
    ++btr_cur_n_sea;
 | 
						|
 | 
						|
    return DB_SUCCESS;
 | 
						|
  }
 | 
						|
  else
 | 
						|
    ++btr_cur_n_non_sea;
 | 
						|
# endif
 | 
						|
#endif
 | 
						|
 | 
						|
  /* If the hash search did not succeed, do binary search down the
 | 
						|
     tree */
 | 
						|
 | 
						|
  /* Store the position of the tree latch we push to mtr so that we
 | 
						|
     know how to release it when we have latched leaf node(s) */
 | 
						|
 | 
						|
  const ulint savepoint= mtr->get_savepoint();
 | 
						|
 | 
						|
  ulint node_ptr_max_size= 0, compress_limit= 0;
 | 
						|
  rw_lock_type_t rw_latch= RW_S_LATCH;
 | 
						|
 | 
						|
  switch (latch_mode) {
 | 
						|
  case BTR_MODIFY_TREE:
 | 
						|
    rw_latch= RW_X_LATCH;
 | 
						|
    node_ptr_max_size= btr_node_ptr_max_size(index());
 | 
						|
    if (latch_by_caller)
 | 
						|
    {
 | 
						|
      ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK));
 | 
						|
      break;
 | 
						|
    }
 | 
						|
    if (lock_intention == BTR_INTENTION_DELETE)
 | 
						|
    {
 | 
						|
      compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index());
 | 
						|
      if (os_aio_pending_reads_approx() &&
 | 
						|
          trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH)
 | 
						|
      {
 | 
						|
        /* Most delete-intended operations are due to the purge of history.
 | 
						|
        Prioritize them when the history list is growing huge. */
 | 
						|
        mtr_x_lock_index(index(), mtr);
 | 
						|
        break;
 | 
						|
      }
 | 
						|
    }
 | 
						|
    mtr_sx_lock_index(index(), mtr);
 | 
						|
    break;
 | 
						|
#ifdef UNIV_DEBUG
 | 
						|
  case BTR_CONT_MODIFY_TREE:
 | 
						|
    ut_ad("invalid mode" == 0);
 | 
						|
    break;
 | 
						|
#endif
 | 
						|
  case BTR_MODIFY_ROOT_AND_LEAF:
 | 
						|
    rw_latch= RW_SX_LATCH;
 | 
						|
    /* fall through */
 | 
						|
  default:
 | 
						|
    if (!latch_by_caller)
 | 
						|
      mtr_s_lock_index(index(), mtr);
 | 
						|
  }
 | 
						|
 | 
						|
  dberr_t err;
 | 
						|
 | 
						|
  if (!index()->table->space)
 | 
						|
  {
 | 
						|
  corrupted:
 | 
						|
    ut_ad("corrupted" == 0); // FIXME: remove this
 | 
						|
    err= DB_CORRUPTION;
  func_exit:
    if (UNIV_LIKELY_NULL(heap))
      mem_heap_free(heap);
    return err;
  }

  const ulint zip_size= index()->table->space->zip_size();

  /* Start with the root page. */
  page_id_t page_id(index()->table->space_id, index()->page);

  const page_cur_mode_t page_mode= btr_cur_nonleaf_mode(mode);
  ulint height= ULINT_UNDEFINED;
  up_match= 0;
  up_bytes= 0;
  low_match= 0;
  low_bytes= 0;
 search_loop:
  auto block_savepoint= mtr->get_savepoint();
  buf_block_t *block=
    buf_page_get_gen(page_id, zip_size, rw_latch, guess, BUF_GET, mtr, &err);
  if (!block)
  {
    btr_read_failed(err, *index());
    goto func_exit;
  }

  btr_search_drop_page_hash_index(block, index());

  if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() ||
      btr_page_get_index_id(block->page.frame) != index()->id ||
      fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
      !fil_page_index_page_check(block->page.frame))
    goto corrupted;

  page_cur.block= block;
  ut_ad(block == mtr->at_savepoint(block_savepoint));
  const bool not_first_access{buf_page_make_young_if_needed(&block->page)};
#ifdef UNIV_ZIP_DEBUG
  if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
    ut_a(page_zip_validate(page_zip, block->page.frame, index()));
#endif /* UNIV_ZIP_DEBUG */

  uint32_t page_level= btr_page_get_level(block->page.frame);

  if (height == ULINT_UNDEFINED)
  {
    /* We are in the B-tree index root page. */
#ifdef BTR_CUR_ADAPT
    info->root_guess= block;
#endif
  reached_root:
    height= page_level;
    tree_height= height + 1;

    if (!height)
    {
      /* The root page is also a leaf page.
      We may have to reacquire the page latch in a different mode. */
      switch (rw_latch) {
      case RW_S_LATCH:
        if (!(latch_mode & BTR_SEARCH_LEAF))
        {
          rw_latch= RW_X_LATCH;
          ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH);
          mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_X_FIX);
          if (!block->page.lock.s_x_upgrade_try())
          {
            block->page.lock.s_unlock();
            block->page.lock.x_lock();
            /* Dropping the index tree (and freeing the root page)
            should be impossible while we hold index()->lock. */
            ut_ad(!block->page.is_freed());
            page_level= btr_page_get_level(block->page.frame);
            if (UNIV_UNLIKELY(page_level != 0))
            {
              /* btr_root_raise_and_insert() was executed meanwhile */
              ut_ad(mtr->memo_contains_flagged(&index()->lock,
                                               MTR_MEMO_S_LOCK));
              block->page.lock.x_u_downgrade();
              block->page.lock.u_s_downgrade();
              rw_latch= RW_S_LATCH;
              mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_S_FIX);
              goto reached_root;
            }
          }
        }
        if (rw_latch != RW_S_LATCH)
          break;
        if (!latch_by_caller)
          /* Release the tree s-latch */
          mtr->rollback_to_savepoint(savepoint, savepoint + 1);
        goto reached_latched_leaf;
      case RW_SX_LATCH:
        ut_ad(latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
        static_assert(int{BTR_MODIFY_ROOT_AND_LEAF} == int{RW_SX_LATCH}, "");
        rw_latch= RW_X_LATCH;
        mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_X_FIX);
        block->page.lock.u_x_upgrade();
        break;
      case RW_X_LATCH:
        if (latch_mode == BTR_MODIFY_TREE)
          goto reached_index_root_and_leaf;
        break;
      case RW_NO_LATCH:
        ut_ad(0);
      }
      goto reached_root_and_leaf;
    }
  }
  else if (UNIV_UNLIKELY(height != page_level))
    goto corrupted;
  else
    switch (latch_mode) {
    case BTR_MODIFY_TREE:
      break;
    case BTR_MODIFY_ROOT_AND_LEAF:
      ut_ad((mtr->at_savepoint(block_savepoint - 1)->page.id().page_no() ==
             index()->page) == (tree_height <= height + 2));
      if (tree_height <= height + 2)
        /* Retain the root page latch. */
        break;
      /* fall through */
    default:
      ut_ad(block_savepoint > savepoint);
      mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint);
      block_savepoint--;
    }

  if (!height)
  {
    /* We reached the leaf level. */
    ut_ad(block == mtr->at_savepoint(block_savepoint));

    if (latch_mode == BTR_MODIFY_ROOT_AND_LEAF)
    {
    reached_root_and_leaf:
      if (!latch_by_caller)
        mtr->rollback_to_savepoint(savepoint, savepoint + 1);
    reached_index_root_and_leaf:
      ut_ad(rw_latch == RW_X_LATCH);
      btr_search_drop_page_hash_index(block, index());
      if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
                                     &page_cur, nullptr))
        goto corrupted;
      ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_GE);
      ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_LE);
      ut_ad(low_match != uint16_t(~0U) || mode != PAGE_CUR_LE);
      goto func_exit;
    }

    switch (latch_mode) {
    case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */
      ut_ad(!latch_by_caller);
      ut_ad(rw_latch == RW_S_LATCH);

      /* latch also siblings from left to right */
      if (page_has_prev(block->page.frame) &&
          !btr_latch_prev(rw_latch, page_id, &err, mtr))
        goto func_exit;
      if (page_has_next(block->page.frame) &&
          !btr_block_get(*index(), btr_page_get_next(block->page.frame),
                         rw_latch, mtr, &err))
        goto func_exit;
      goto release_tree;
    case BTR_SEARCH_LEAF:
    case BTR_MODIFY_LEAF:
      ut_ad(rw_latch == rw_lock_type_t(latch_mode));
      if (!latch_by_caller)
      {
release_tree:
        /* Release the tree s-latch */
        block_savepoint--;
        mtr->rollback_to_savepoint(savepoint, savepoint + 1);
      }
      /* release upper blocks */
      if (savepoint < block_savepoint)
        mtr->rollback_to_savepoint(savepoint, block_savepoint);
      break;
    default:
      ut_ad(latch_mode == BTR_MODIFY_TREE);
      ut_ad(rw_latch == RW_X_LATCH);
      /* x-latch also siblings from left to right */
      if (page_has_prev(block->page.frame) &&
          !btr_latch_prev(rw_latch, page_id, &err, mtr))
        goto func_exit;
      if (page_has_next(block->page.frame) &&
          !btr_block_get(*index(), btr_page_get_next(block->page.frame),
                         RW_X_LATCH, mtr, &err))
        goto func_exit;
    }

  reached_latched_leaf:
    if (!(tuple->info_bits & REC_INFO_MIN_REC_FLAG))
    {
      if (page_cur_search_with_match_bytes(*tuple, mode, &up_match, &low_match,
                                           &page_cur, &up_bytes, &low_bytes))
        goto corrupted;
    }
    else if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
                                        &page_cur, nullptr))
      goto corrupted;

    ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_GE);
    ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_LE);
    ut_ad(low_match != uint16_t(~0U) || mode != PAGE_CUR_LE);

    if (latch_mode == BTR_MODIFY_TREE &&
        btr_cur_need_opposite_intention(block->page, index()->is_clust(),
                                        lock_intention,
                                        node_ptr_max_size, compress_limit,
                                        page_cur.rec))
      goto need_opposite_intention;

#ifdef BTR_CUR_HASH_ADAPT
    if (flag != BTR_CUR_BINARY)
    {
      ut_ad(!(tuple->info_bits & REC_INFO_MIN_REC_FLAG));
      ut_ad(!index()->table->is_temporary());
      if (!rec_is_metadata(page_cur.rec, *index()) &&
          index()->search_info.hash_analysis_useful())
        search_info_update();
    }
#endif /* BTR_CUR_HASH_ADAPT */

    goto func_exit;
  }

  guess= nullptr;
  if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
                                 &page_cur, nullptr))
    goto corrupted;
  offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
                           &heap);

  ut_ad(block == mtr->at_savepoint(block_savepoint));

  switch (latch_mode) {
  default:
    break;
  case BTR_MODIFY_TREE:
    if (btr_cur_need_opposite_intention(block->page, index()->is_clust(),
                                        lock_intention,
                                        node_ptr_max_size, compress_limit,
                                        page_cur.rec))
      /* If the rec is the first or last in the page for pessimistic
      delete intention, it might cause node_ptr insert for the upper
      level. We should change the intention and retry. */
    need_opposite_intention:
      return pessimistic_search_leaf(tuple, mode, mtr);

    if (detected_same_key_root || lock_intention != BTR_INTENTION_BOTH ||
        index()->is_unique() ||
        (up_match <= rec_offs_n_fields(offsets) &&
         low_match <= rec_offs_n_fields(offsets)))
      break;

    /* If the cursor record is the first or the last record of the page,
    or carries the same key value as the first or the last record, then
    BTR_CONT_MODIFY_TREE might choose a different page.  The parent page
    must not be released, to avoid a deadlock caused by blocking another
    search that uses the same key value. */
    const rec_t *first=
      page_rec_get_next_const(page_get_infimum_rec(block->page.frame));
    ulint matched_fields;

    if (UNIV_UNLIKELY(!first))
      goto corrupted;
    if (page_cur.rec == first ||
        page_rec_is_last(page_cur.rec, block->page.frame))
    {
    same_key_root:
      detected_same_key_root= true;
      break;
    }

    matched_fields= 0;
    offsets2= rec_get_offsets(first, index(), offsets2, 0, ULINT_UNDEFINED,
                              &heap);
    cmp_rec_rec(page_cur.rec, first, offsets, offsets2, index(), false,
                &matched_fields);
    if (matched_fields >= rec_offs_n_fields(offsets) - 1)
      goto same_key_root;
    if (const rec_t* last=
        page_rec_get_prev_const(page_get_supremum_rec(block->page.frame)))
    {
      matched_fields= 0;
      offsets2= rec_get_offsets(last, index(), offsets2, 0, ULINT_UNDEFINED,
                                &heap);
      cmp_rec_rec(page_cur.rec, last, offsets, offsets2, index(), false,
                  &matched_fields);
      if (matched_fields >= rec_offs_n_fields(offsets) - 1)
        goto same_key_root;
    }
    else
      goto corrupted;

    /* Release the non-root parent page unless it may need to be modified. */
    if (tree_height > height + 1 &&
        !btr_cur_will_modify_tree(index(), block->page.frame, lock_intention,
                                  page_cur.rec, node_ptr_max_size,
                                  zip_size, mtr))
    {
      mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint);
      block_savepoint--;
    }
  }

  /* Go to the child node */
  page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets));

  if (!--height)
  {
    /* We are about to access the leaf level. */

    switch (latch_mode) {
    case BTR_MODIFY_ROOT_AND_LEAF:
      rw_latch= RW_X_LATCH;
      break;
    case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */
      ut_ad(rw_latch == RW_S_LATCH);

      if (!not_first_access)
        buf_read_ahead_linear(page_id);

      if (page_has_prev(block->page.frame) &&
          page_rec_is_first(page_cur.rec, block->page.frame))
      {
        ut_ad(block_savepoint + 1 == mtr->get_savepoint());

        /* Latch the previous page if the node pointer is the leftmost
        of the current page. */
        int ret= btr_latch_prev(rw_latch, page_id, &err, mtr);
        if (!ret)
          goto func_exit;
        ut_ad(block_savepoint + 2 == mtr->get_savepoint());
        if (ret < 0)
        {
          up_match= 0, low_match= 0, up_bytes= 0, low_bytes= 0;
          /* While our latch on the level-2 page prevents splits or
          merges of this level-1 block, other threads may have
          modified it due to splitting or merging some level-0 (leaf)
          pages underneath it. Thus, we must search again. */
          if (page_cur_search_with_match(tuple, page_mode,
                                         &up_match, &low_match,
                                         &page_cur, nullptr))
            goto corrupted;
          offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0,
                                   ULINT_UNDEFINED, &heap);
          page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec,
                                                             offsets));
        }
      }
      rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH));
      break;
    case BTR_MODIFY_LEAF:
    case BTR_SEARCH_LEAF:
      rw_latch= rw_lock_type_t(latch_mode);
      if (!not_first_access)
        buf_read_ahead_linear(page_id);
      break;
    case BTR_MODIFY_TREE:
      ut_ad(rw_latch == RW_X_LATCH);

      if (lock_intention == BTR_INTENTION_INSERT &&
          page_has_next(block->page.frame) &&
          page_rec_is_last(page_cur.rec, block->page.frame))
      {
        /* btr_insert_into_right_sibling() might cause a node pointer
        to be deleted at the upper level */
        mtr->rollback_to_savepoint(block_savepoint);
 | 
						|
        goto need_opposite_intention;
 | 
						|
      }
 | 
						|
      break;
 | 
						|
    default:
 | 
						|
      ut_ad(rw_latch == RW_X_LATCH);
 | 
						|
    }
 | 
						|
  }
 | 
						|
 | 
						|
  goto search_loop;
 | 
						|
}
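
/* Rough sketch of the descent implemented by the search loop above
(descriptive note; the steps merely restate the code):

     latch the current page            (block_savepoint)
     search for the pivot record       (page_cur_search_with_match)
     release the parent page unless BTR_MODIFY_TREE may still modify it
     descend to the child page and repeat

At the leaf level, BTR_SEARCH_LEAF and BTR_MODIFY_LEAF release the tree
latch and any remaining upper pages, while BTR_MODIFY_TREE retains them
and latches the leaf siblings from left to right.  Whenever
btr_cur_need_opposite_intention() signals that the operation could
propagate upwards after all, the optimistic descent is abandoned and
pessimistic_search_leaf() restarts it with exclusive latches. */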

ATTRIBUTE_COLD void mtr_t::index_lock_upgrade()
{
  auto &slot= m_memo[get_savepoint() - 1];
  if (slot.type == MTR_MEMO_X_LOCK)
    return;
  ut_ad(slot.type == MTR_MEMO_SX_LOCK);
  index_lock *lock= static_cast<index_lock*>(slot.object);
  lock->u_x_upgrade(SRW_LOCK_CALL);
  slot.type= MTR_MEMO_X_LOCK;
}

/** Keep a non-leaf page from aging out of the buffer pool, but avoid
invoking buf_page_t::set_accessed(), because we do not want linear
read-ahead */
static void btr_cur_nonleaf_make_young(buf_page_t *bpage)
{
  if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage)))
    buf_page_make_young(bpage);
}

ATTRIBUTE_COLD
dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
                                           page_cur_mode_t mode, mtr_t *mtr)
{
  ut_ad(index()->is_btree());

  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
  rec_offs* offsets= offsets_;
  rec_offs_init(offsets_);

  ut_ad(flag == BTR_CUR_BINARY);
  ut_ad(dict_index_check_search_tuple(index(), tuple));
  ut_ad(dtuple_check_typed(tuple));
  buf_block_t *block= mtr->at_savepoint(1);
  ut_ad(block->page.id().page_no() == index()->page);
  block->page.fix();
  mtr->rollback_to_savepoint(1);
  mtr->index_lock_upgrade();

  const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)};

  mtr->page_lock(block, RW_X_LATCH);
  btr_search_drop_page_hash_index(block, index());

  up_match= 0;
  up_bytes= 0;
  low_match= 0;
  low_bytes= 0;
  ulint height= btr_page_get_level(block->page.frame);
  tree_height= height + 1;
  mem_heap_t *heap= nullptr;

 search_loop:
  dberr_t err;
  page_cur.block= block;

  if (UNIV_UNLIKELY(!height))
  {
    if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
                                   &page_cur, nullptr))
    corrupted:
      err= DB_CORRUPTION;
    else
    {
      ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_GE);
      ut_ad(up_match != uint16_t(~0U) || mode != PAGE_CUR_LE);
      ut_ad(low_match != uint16_t(~0U) || mode != PAGE_CUR_LE);

#ifdef BTR_CUR_HASH_ADAPT
      /* We do a dirty read of btr_search.enabled here.  We will recheck in
      btr_search_build_page_hash_index() before building a page hash
      index, while holding search latch. */
      if (!btr_search.enabled);
      else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
        /* This may be a search tuple for btr_pcur_t::restore_position(). */
        ut_ad(tuple->is_metadata() ||
              (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT)));
      else if (index()->table->is_temporary());
      else if (!rec_is_metadata(page_cur.rec, *index()) &&
               index()->search_info.hash_analysis_useful())
        search_info_update();
#endif /* BTR_CUR_HASH_ADAPT */
      err= DB_SUCCESS;
    }

  func_exit:
    if (UNIV_LIKELY_NULL(heap))
      mem_heap_free(heap);
    return err;
  }

  if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
                                 &page_cur, nullptr))
    goto corrupted;

  page_id_t page_id{block->page.id()};

  offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
                           &heap);
  /* Go to the child node */
  page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets));

  block=
    buf_page_get_gen(page_id, block->zip_size(), RW_X_LATCH, nullptr, BUF_GET,
                     mtr, &err);

  if (!block)
  {
    btr_read_failed(err, *index());
    goto func_exit;
  }

  btr_search_drop_page_hash_index(block, index());

  if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() ||
      btr_page_get_index_id(block->page.frame) != index()->id ||
      fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
      !fil_page_index_page_check(block->page.frame))
    goto corrupted;

  if (--height != btr_page_get_level(block->page.frame))
    goto corrupted;

  btr_cur_nonleaf_make_young(&block->page);

#ifdef UNIV_ZIP_DEBUG
  const page_zip_des_t *page_zip= buf_block_get_page_zip(block);
  ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index()));
#endif /* UNIV_ZIP_DEBUG */

  if (page_has_prev(block->page.frame) &&
      !btr_latch_prev(RW_X_LATCH, page_id, &err, mtr))
    goto func_exit;
  if (page_has_next(block->page.frame) &&
      !btr_block_get(*index(), btr_page_get_next(block->page.frame),
                     RW_X_LATCH, mtr, &err))
    goto func_exit;
  goto search_loop;
}
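
/* Note: this slow path re-descends from the root with index()->lock
upgraded to exclusive mode and every visited page latched RW_X_LATCH,
additionally x-latching the left and right siblings of each page below
the root, so that the caller may split or merge any page on the path
without acquiring further latches. */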

/********************************************************************//**
Searches an index tree and positions a tree cursor on a given non-leaf level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
cursor->up_match and cursor->low_match both will have sensible values.
Cursor is left at the place where an insert of the
search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record.
@param level      the tree level of search
@param tuple      data tuple; NOTE: n_fields_cmp in tuple must be set so that
                  it cannot get compared to the node ptr page number field!
@param rw_latch   RW_S_LATCH or RW_X_LATCH
@param cursor     tree cursor; the cursor page is s- or x-latched, but see also
                  above!
@param mtr        mini-transaction
@return DB_SUCCESS on success or error code otherwise */
TRANSACTIONAL_TARGET
dberr_t btr_cur_search_to_nth_level(ulint level,
                                    const dtuple_t *tuple,
                                    rw_lock_type_t rw_latch,
                                    btr_cur_t *cursor, mtr_t *mtr)
{
  dict_index_t *const index= cursor->index();

  ut_ad(index->is_btree());
  mem_heap_t *heap= nullptr;
  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
  rec_offs *offsets= offsets_;
  rec_offs_init(offsets_);
  ut_ad(level);
  ut_ad(dict_index_check_search_tuple(index, tuple));
  ut_ad(dtuple_check_typed(tuple));
  ut_ad(index->page != FIL_NULL);

  MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes);
  MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes);
  cursor->up_match= 0;
  cursor->low_match= 0;
#ifdef BTR_CUR_HASH_ADAPT
  cursor->flag= BTR_CUR_BINARY;
#endif
#ifndef BTR_CUR_ADAPT
  buf_block_t *block= nullptr;
#else
  buf_block_t *block= index->search_info.root_guess;
#endif /* BTR_CUR_ADAPT */

  ut_ad(mtr->memo_contains_flagged(&index->lock,
                                   MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));

  dberr_t err;

  if (!index->table->space)
  {
  corrupted:
    err= DB_CORRUPTION;
  func_exit:
    if (UNIV_LIKELY_NULL(heap))
      mem_heap_free(heap);
    return err;
  }

  const ulint zip_size= index->table->space->zip_size();

  /* Start with the root page. */
  page_id_t page_id(index->table->space_id, index->page);
  ulint height= ULINT_UNDEFINED;

search_loop:
  err= DB_SUCCESS;
  if (buf_block_t *b=
      mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch)))
    block= b;
  else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch,
                                     block, BUF_GET, mtr, &err)))
  {
    btr_read_failed(err, *index);
    goto func_exit;
  }
  else
  {
    btr_search_drop_page_hash_index(block, index);
    btr_cur_nonleaf_make_young(&block->page);
  }

#ifdef UNIV_ZIP_DEBUG
  if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
    ut_a(page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */

  if (!!page_is_comp(block->page.frame) != index->table->not_redundant() ||
      btr_page_get_index_id(block->page.frame) != index->id ||
      fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
      !fil_page_index_page_check(block->page.frame))
    goto corrupted;

  const uint32_t page_level= btr_page_get_level(block->page.frame);

  if (height == ULINT_UNDEFINED)
  {
    /* We are in the root node */
    height= page_level;
    if (!height)
      goto corrupted;
    cursor->tree_height= height + 1;
  }
  else if (height != ulint{page_level})
    goto corrupted;

  cursor->page_cur.block= block;

  /* Search for complete index fields. */
  if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &cursor->up_match,
                                 &cursor->low_match, &cursor->page_cur,
                                 nullptr))
    goto corrupted;

  /* If this is the desired level, leave the loop */
  if (level == height)
    goto func_exit;

  ut_ad(height > level);
  height--;

  offsets = rec_get_offsets(cursor->page_cur.rec, index, offsets, 0,
                            ULINT_UNDEFINED, &heap);
  /* Go to the child node */
  page_id.set_page_no(btr_node_ptr_get_child_page_no(cursor->page_cur.rec,
                                                     offsets));
  block= nullptr;
  goto search_loop;
}
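
/* A minimal usage sketch (illustrative only, not part of the source;
"index", "tuple" and "mtr" stand for a caller's objects).  The caller
must hold index->lock in X or SX mode within the same mini-transaction,
and n_fields_cmp of the tuple must have been adjusted as noted above:

     btr_cur_t cur;
     cur.page_cur.index= index;
     mtr_sx_lock_index(index, &mtr);
     if (btr_cur_search_to_nth_level(1, tuple, RW_X_LATCH, &cur, &mtr) ==
         DB_SUCCESS)
     {
       // cur is positioned on level 1 with PAGE_CUR_LE semantics
     }
*/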

dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
                             btr_latch_mode latch_mode, mtr_t *mtr)
{
  ulint n_blocks= 0;
  mem_heap_t *heap= nullptr;
  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
  rec_offs *offsets= offsets_;
  dberr_t err;

  rec_offs_init(offsets_);

  const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
  latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED);

  btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode);

  /* Store the position of the tree latch we push to mtr so that we
  know how to release it when we have latched the leaf node */

  auto savepoint= mtr->get_savepoint();

  rw_lock_type_t upper_rw_latch= RW_X_LATCH;
  ulint node_ptr_max_size= 0, compress_limit= 0;

  if (latch_mode == BTR_MODIFY_TREE)
  {
    node_ptr_max_size= btr_node_ptr_max_size(index);
    /* Most of delete-intended operations are purging. Free blocks
    and read IO bandwidth should be prioritized for them, when the
    history list is growing huge. */
    savepoint++;
    if (lock_intention == BTR_INTENTION_DELETE)
    {
      compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index);

      if (os_aio_pending_reads_approx() &&
          trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH)
      {
        mtr_x_lock_index(index, mtr);
        goto index_locked;
      }
    }
    mtr_sx_lock_index(index, mtr);
  }
  else
  {
    static_assert(int{BTR_CONT_MODIFY_TREE} == (12 | BTR_MODIFY_LEAF), "");
    ut_ad(!(latch_mode & 8));
    /* This function doesn't need to lock left page of the leaf page */
    static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), "");
    latch_mode= btr_latch_mode(latch_mode & (RW_S_LATCH | RW_X_LATCH));
    ut_ad(!latch_by_caller ||
          mtr->memo_contains_flagged(&index->lock,
                                     MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK));
    upper_rw_latch= RW_S_LATCH;
    if (!latch_by_caller)
    {
      savepoint++;
      mtr_s_lock_index(index, mtr);
    }
  }

index_locked:
  ut_ad(savepoint == mtr->get_savepoint());

  const rw_lock_type_t root_leaf_rw_latch=
    rw_lock_type_t(latch_mode & (RW_S_LATCH | RW_X_LATCH));

  page_cur.index = index;

  uint32_t page= index->page;

  for (ulint height= ULINT_UNDEFINED;;)
  {
    ut_ad(n_blocks < BTR_MAX_LEVELS);
    ut_ad(savepoint + n_blocks == mtr->get_savepoint());

    bool first_access= false;
    buf_block_t* block=
      btr_block_get(*index, page,
                    height ? upper_rw_latch : root_leaf_rw_latch,
                    mtr, &err, &first_access);
    ut_ad(!block == (err != DB_SUCCESS));

    if (!block)
      break;

    if (first)
      page_cur_set_before_first(block, &page_cur);
    else
      page_cur_set_after_last(block, &page_cur);

    const uint32_t l= btr_page_get_level(block->page.frame);

    if (height == ULINT_UNDEFINED)
    {
      /* We are in the root node */
      height= l;
      if (height);
      else if (upper_rw_latch != root_leaf_rw_latch)
      {
        /* We must retry reading the page, because the root page was
        latched in a different mode from the one a leaf page requires. */
        ut_ad(n_blocks == 0);
        ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
        upper_rw_latch= root_leaf_rw_latch;
        mtr->rollback_to_savepoint(savepoint);
        height= ULINT_UNDEFINED;
        continue;
      }
      else
      {
      reached_leaf:
        const auto leaf_savepoint= mtr->get_savepoint();
        ut_ad(leaf_savepoint);
        ut_ad(block == mtr->at_savepoint(leaf_savepoint - 1));

        if (latch_mode == BTR_MODIFY_TREE)
        {
          /* x-latch also siblings from left to right */
          if (page_has_prev(block->page.frame) &&
              !btr_latch_prev(RW_X_LATCH, block->page.id(), &err, mtr))
            break;
          if (page_has_next(block->page.frame) &&
              !btr_block_get(*index, btr_page_get_next(block->page.frame),
                             RW_X_LATCH, mtr, &err))
            break;

          if (!index->lock.have_x() &&
              btr_cur_need_opposite_intention(block->page, index->is_clust(),
                                              lock_intention,
                                              node_ptr_max_size,
                                              compress_limit, page_cur.rec))
            goto need_opposite_intention;
        }
        else
        {
          if (latch_mode != BTR_CONT_MODIFY_TREE)
          {
            ut_ad(latch_mode == BTR_MODIFY_LEAF ||
                  latch_mode == BTR_SEARCH_LEAF);
            /* Release index->lock if needed, and the non-leaf pages. */
            mtr->rollback_to_savepoint(savepoint - !latch_by_caller,
                                       leaf_savepoint - 1);
          }
        }
        break;
      }
    }
    else if (UNIV_UNLIKELY(height != l))
    {
    corrupted:
      err= DB_CORRUPTION;
      break;
    }

    if (!height)
      goto reached_leaf;

    height--;

    if (first
        ? !page_cur_move_to_next(&page_cur)
        : !page_cur_move_to_prev(&page_cur))
      goto corrupted;

    offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED,
                             &heap);
    page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);

    ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH);

    if (latch_mode != BTR_MODIFY_TREE)
    {
      if (!height && first && first_access)
        buf_read_ahead_linear(page_id_t(block->page.id().space(), page));
    }
    else if (btr_cur_need_opposite_intention(block->page, index->is_clust(),
                                             lock_intention,
                                             node_ptr_max_size, compress_limit,
                                             page_cur.rec))
    {
    need_opposite_intention:
      /* If the rec is the first or last in the page for pessimistic
      delete intention, it might cause node_ptr insert for the upper
      level. We should change the intention and retry. */

      mtr->rollback_to_savepoint(savepoint);
      mtr->index_lock_upgrade();
      /* X-latch all pages from now on */
      latch_mode= BTR_CONT_MODIFY_TREE;
      page= index->page;
      height= ULINT_UNDEFINED;
      n_blocks= 0;
      continue;
    }
    else
    {
      if (!btr_cur_will_modify_tree(index, block->page.frame,
                                    lock_intention, page_cur.rec,
                                    node_ptr_max_size,
                                    index->table->space->zip_size(), mtr))
      {
        ut_ad(n_blocks);
        /* release buffer-fixes on pages that will not be modified
        (except the root) */
        if (n_blocks > 1)
        {
          mtr->rollback_to_savepoint(savepoint + 1, savepoint + n_blocks - 1);
          n_blocks= 1;
        }
      }
    }

    /* Go to the child node */
    n_blocks++;
  }

  if (UNIV_LIKELY_NULL(heap))
    mem_heap_free(heap);

  return err;
}
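
/* Illustrative sketch (not part of the source): positioning a cursor
before the first record of the leftmost leaf page, as a full index scan
might do; "index" and "mtr" stand for a caller's objects:

     btr_cur_t cur;
     if (cur.open_leaf(true, index, BTR_SEARCH_LEAF, &mtr) == DB_SUCCESS)
     {
       // cur.page_cur is before the first user record of the first leaf
     }
*/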

/*==================== B-TREE INSERT =========================*/

/*************************************************************//**
Inserts a record if there is enough space, or if enough space can
be freed by reorganizing. Differs from btr_cur_optimistic_insert in
that no heuristic is applied as to whether it pays to use CPU time
for reorganizing the page or not.

@return pointer to the inserted record on success, else NULL */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
rec_t*
btr_cur_insert_if_possible(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
				have been stored to tuple */
	rec_offs**	offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	page_cur_t*	page_cursor;
	rec_t*		rec;

	ut_ad(dtuple_check_typed(tuple));

	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
					 MTR_MEMO_PAGE_X_FIX));
	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */
	rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, n_ext,
				    mtr);

	/* If the record did not fit, reorganize.
	For compressed pages, page_cur_tuple_insert()
	attempted this already. */
	if (!rec && !page_cur_get_page_zip(page_cursor)
	    && btr_page_reorganize(page_cursor, mtr) == DB_SUCCESS) {
		rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap,
					    n_ext, mtr);
	}

	ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets));
	return(rec);
}
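
/* Note (descriptive): a NULL return simply means that the tuple did not
fit even after reorganization; the caller is expected to fall back to a
pessimistic operation such as a page split. */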

/*************************************************************//**
For an insert, checks the locks and does the undo logging if desired.
@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
dberr_t
btr_cur_ins_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if
				not zero, the parameters index and thr
				should be specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	bool*		inherit)/*!< out: true if the inserted new record maybe
				should inherit LOCK_GAP type locks from the
				successor record */
{
	if (!(~flags & (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))) {
		return DB_SUCCESS;
	}

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	rec_t* rec = btr_cur_get_rec(cursor);
	dict_index_t* index = cursor->index();

	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad((flags & BTR_NO_UNDO_LOG_FLAG)
	      || !index->table->skip_alter_undo);

	ut_ad(mtr->is_named_space(index->table->space));

	/* Check if there is predicate or GAP lock preventing the insertion */
	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		const unsigned type = index->type;
		if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
			lock_prdt_t	prdt;
			rtr_mbr_t	mbr;

			rtr_get_mbr_from_tuple(entry, &mbr);

			/* Use on stack MBR variable to test if a lock is
			needed. If so, the predicate (MBR) will be allocated
			from lock heap in lock_prdt_insert_check_and_lock() */
			lock_init_prdt_from_mbr(&prdt, &mbr, 0, nullptr);

			if (dberr_t err = lock_prdt_insert_check_and_lock(
				    rec, btr_cur_get_block(cursor),
				    index, thr, mtr, &prdt)) {
				return err;
			}
			*inherit = false;
		} else {
			ut_ad(!dict_index_is_online_ddl(index)
			      || index->is_primary()
			      || (flags & BTR_CREATE_FLAG));
#ifdef WITH_WSREP
			trx_t* trx= thr_get_trx(thr);
			/* If a transaction scanning a unique secondary
			key runs in a wsrep high priority thread (brute
			force), the scan may involve GAP-locking in the
			index. Because such locking also happens when
			replication events are applied in high priority
			applier threads, lock conflicts between two
			wsrep high priority threads are possible. To
			avoid this GAP-locking, mark here that the
			transaction is using a unique key scan. */
			if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
			    && trx->is_wsrep()
			    && wsrep_thd_is_BF(trx->mysql_thd, false)) {
				trx->wsrep = 3;
			}
#endif /* WITH_WSREP */
			if (dberr_t err = lock_rec_insert_check_and_lock(
				    rec, btr_cur_get_block(cursor),
				    index, thr, mtr, inherit)) {
				return err;
			}
		}
	}

	if (!index->is_primary() || !page_is_leaf(btr_cur_get_page(cursor))) {
		return DB_SUCCESS;
	}

	constexpr roll_ptr_t dummy_roll_ptr = roll_ptr_t{1}
		<< ROLL_PTR_INSERT_FLAG_POS;
	roll_ptr_t roll_ptr = dummy_roll_ptr;

	if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
		if (dberr_t err = trx_undo_report_row_operation(
			    thr, index, entry, NULL, 0, NULL, NULL,
			    &roll_ptr)) {
			return err;
		}

		if (roll_ptr != dummy_roll_ptr) {
			dfield_t* r = dtuple_get_nth_field(entry,
							   index->db_trx_id());
			trx_write_trx_id(static_cast<byte*>(r->data),
					 thr_get_trx(thr)->id);
		}
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		dfield_t* r = dtuple_get_nth_field(
			entry, index->db_roll_ptr());
		ut_ad(r->len == DATA_ROLL_PTR_LEN);
		trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
	}

	return DB_SUCCESS;
}
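
/* Note (descriptive): for a fresh insert into a clustered index leaf
with no undo log record written, DB_ROLL_PTR retains dummy_roll_ptr, a
roll pointer carrying only the insert-flag bit
(roll_ptr_t{1} << ROLL_PTR_INSERT_FLAG_POS, the 0x80 bit of the first
byte).  That bit indicates that no earlier version of the record exists,
so readers need not look up any undo log record for it. */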

/**
Prefetch siblings of the leaf for the pessimistic operation.
@param block	leaf page
@param index    index of the page */
static void btr_cur_prefetch_siblings(const buf_block_t *block,
                                      const dict_index_t *index)
{
  ut_ad(page_is_leaf(block->page.frame));

  const page_t *page= block->page.frame;
  uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
  uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));

  fil_space_t *space= index->table->space;

  if (prev == FIL_NULL);
  else if (space->acquire())
    buf_read_page_background(space, page_id_t(space->id, prev),
                             block->zip_size());
  if (next == FIL_NULL);
  else if (space->acquire())
    buf_read_page_background(space, page_id_t(space->id, next),
                             block->zip_size());
}
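
/* Design note: the siblings are read in the background rather than
latched here, so that if the optimistic operation fails and a
pessimistic one must latch the left and right neighbours (as the
functions above do), those pages are more likely to already reside in
the buffer pool instead of being read synchronously while latches are
held. */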

/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
dberr_t
btr_cur_optimistic_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameters index and thr should be
				specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	rec_offs**	offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				succeed */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
				!(~flags
				& (BTR_NO_LOCKING_FLAG
				| BTR_NO_UNDO_LOG_FLAG)) */
	mtr_t*		mtr)	/*!< in/out: mini-transaction;
				if this function returns DB_SUCCESS on
				a leaf page of a secondary index in a
				compressed tablespace, the caller must
				mtr_commit(mtr) before latching
				any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	page_t*		page;
	rec_t*		dummy;
	bool		leaf;
	bool		reorg __attribute__((unused));
	bool		inherit = true;
	ulint		rec_size;
	dberr_t		err;

	ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = cursor->index();

	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(dtuple_check_typed(entry));

#ifdef HAVE_valgrind
	if (block->page.zip.data) {
		MEM_CHECK_DEFINED(page, srv_page_size);
		MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size());
	}
#endif /* HAVE_valgrind */

	leaf = page_is_leaf(page);

	if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
		ut_ad(leaf);
		goto convert_big_rec;
	}

	/* Calculate the record size when entry is converted to a record */
	rec_size = rec_get_converted_size(index, entry, n_ext);

	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
				   dtuple_get_n_fields(entry),
				   block->zip_size())) {
convert_big_rec:
		/* The record is so big that we have to store some fields
		externally on separate database pages */
		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);

		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			return(DB_TOO_BIG_RECORD);
		}

		rec_size = rec_get_converted_size(index, entry, n_ext);
	}

	if (block->page.zip.data && page_zip_is_too_big(index, entry)) {
		if (big_rec_vec != NULL) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(DB_TOO_BIG_RECORD);
	}

	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail);

	if (block->page.zip.data && leaf
	    && (page_get_data_size(page) + rec_size
		>= dict_index_zip_pad_optimal_page_size(index))) {
		/* If the compression padding heuristic indicates that the
		insertion would make the page so full that a compression
		failure becomes likely, do not attempt an optimistic
		insertion. */
fail:
		err = DB_FAIL;

		/* prefetch siblings of the leaf for the pessimistic
		operation, if the page is leaf. */
		if (leaf) {
			btr_cur_prefetch_siblings(block, index);
		}
fail_err:

		if (big_rec_vec) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(err);
	}

	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);
	if (max_size < rec_size) {
		goto fail;
	}

	const ulint n_recs = page_get_n_recs(page);
	if (UNIV_UNLIKELY(n_recs >= 8189)) {
		ut_ad(srv_page_size == 65536);
		goto fail;
	}

	if (page_has_garbage(page)) {
		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
		    && n_recs > 1
		    && page_get_max_insert_size(page, 1) < rec_size) {

			goto fail;
		}
	}

	/* If there have been many consecutive inserts to the
	clustered index leaf page of an uncompressed table, check if
	we have to split the page to reserve enough free space for
	future updates of records. */

	if (leaf && !block->page.zip.data && dict_index_is_clust(index)
	    && page_get_n_recs(page) >= 2
	    && dict_index_get_space_reserve() + rec_size > max_size
	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
		|| btr_page_get_split_rec_to_left(cursor))) {
		goto fail;
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	DBUG_LOG("ib_cur",
		 "insert " << index->name << " (" << index->id << ") by "
		 << ib::hex(thr ? thr->graph->trx->id : 0)
		 << ' ' << rec_printer(entry).str());
	DBUG_EXECUTE_IF("do_page_reorganize",
			ut_a(!n_recs || btr_page_reorganize(page_cursor, mtr)
			     == DB_SUCCESS););

	/* Now, try the insert */
	{
		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);

		/* Check locks and write to the undo log,
		if specified */
		err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
						thr, mtr, &inherit);
		if (err != DB_SUCCESS) {
			goto fail_err;
		}

#ifdef UNIV_DEBUG
		if (!(flags & BTR_CREATE_FLAG)
		    && leaf && index->is_primary()) {
			const dfield_t* trx_id = dtuple_get_nth_field(
				entry, dict_col_get_clust_pos(
					dict_table_get_sys_col(index->table,
							       DATA_TRX_ID),
					index));

			ut_ad(trx_id->len == DATA_TRX_ID_LEN);
			ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
			ut_ad(*static_cast<const byte*>
			      (trx_id[1].data) & 0x80);
			if (flags & BTR_NO_UNDO_LOG_FLAG) {
				ut_ad(!memcmp(trx_id->data, reset_trx_id,
					      DATA_TRX_ID_LEN));
			} else {
				ut_ad(thr->graph->trx->id);
				ut_ad(thr->graph->trx->bulk_insert
				      || thr->graph->trx->id
				      == trx_read_trx_id(
					      static_cast<const byte*>(
							trx_id->data))
				      || index->table->is_temporary());
			}
		}
#endif

		*rec = page_cur_tuple_insert(page_cursor, entry, offsets, heap,
					     n_ext, mtr);

		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
	}

	if (*rec) {
	} else if (block->page.zip.data) {
		ut_ad(!index->table->is_temporary());
		goto fail;
	} else {
		ut_ad(!reorg);
		reorg = true;

		/* If the record did not fit, reorganize */
		err = btr_page_reorganize(page_cursor, mtr);
		if (err != DB_SUCCESS
		    || page_get_max_insert_size(page, 1) != max_size
		    || !(*rec = page_cur_tuple_insert(page_cursor, entry,
						      offsets, heap, n_ext,
						      mtr))) {
			err = DB_CORRUPTION;
			goto fail_err;
		}
	}

#ifdef BTR_CUR_HASH_ADAPT
	if (!leaf) {
	} else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
		ut_ad(entry->is_metadata());
		ut_ad(index->is_instant());
		ut_ad(flags == BTR_NO_LOCKING_FLAG);
	} else if (!index->table->is_temporary()) {
		btr_search_update_hash_on_insert(cursor, reorg);
	}
#endif /* BTR_CUR_HASH_ADAPT */

	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

		lock_update_insert(block, *rec);
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
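
/* Illustrative arithmetic for the split heuristic above, assuming the
default srv_page_size of 16384 bytes and dict_index_get_space_reserve()
reserving about 1/16 of a page: on an uncompressed clustered index leaf
holding at least two records, if roughly 1024 bytes plus the converted
record size would exceed the space available after reorganization, the
optimistic insert gives up so that a page split can leave room for
future updates of records. */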
 | 
						|
 | 
						|
/*************************************************************//**
 | 
						|
Performs an insert on a page of an index tree. It is assumed that mtr
 | 
						|
holds an x-latch on the tree and on the cursor page. If the insert is
 | 
						|
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
 | 
						|
to brothers of page, if those brothers exist.
 | 
						|
@return DB_SUCCESS or error number */
 | 
						|
dberr_t
 | 
						|
btr_cur_pessimistic_insert(
 | 
						|
/*=======================*/
 | 
						|
	ulint		flags,	/*!< in: undo logging and locking flags: if not
 | 
						|
				zero, the parameter thr should be
 | 
						|
				specified; if no undo logging is specified,
 | 
						|
				then the caller must have reserved enough
 | 
						|
				free extents in the file space so that the
 | 
						|
				insertion will certainly succeed */
 | 
						|
	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
 | 
						|
				cursor stays valid */
 | 
						|
	rec_offs**	offsets,/*!< out: offsets on *rec */
 | 
						|
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
 | 
						|
				that can be emptied */
 | 
						|
	dtuple_t*	entry,	/*!< in/out: entry to insert */
 | 
						|
	rec_t**		rec,	/*!< out: pointer to inserted record if
 | 
						|
				succeed */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
				!(~flags
				& (BTR_NO_LOCKING_FLAG
				| BTR_NO_UNDO_LOG_FLAG)) */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	dict_index_t*	index		= cursor->index();
	big_rec_t*	big_rec_vec	= NULL;
	bool		inherit = false;
	uint32_t	n_reserved	= 0;

	ut_ad(dtuple_check_typed(entry));
	ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));

	*big_rec = NULL;

	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
					 | MTR_MEMO_SX_LOCK));
	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
					 MTR_MEMO_PAGE_X_FIX));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));

#ifdef BTR_CUR_HASH_ADAPT
	cursor->flag = BTR_CUR_BINARY;
#endif

	/* Check locks and write to undo log, if specified */

	dberr_t err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
						thr, mtr, &inherit);

	if (err != DB_SUCCESS) {
		return(err);
	}

	/* First reserve enough free space for the file segments of
	the index tree, so that the insert will not fail because of
	lack of space */
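	/* Illustrative note (added): for example, with
	cursor->tree_height == 3 the call below asks for
	3 / 16 + 3 = 3 extents. */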

	err = fsp_reserve_free_extents(&n_reserved, index->table->space,
				       uint32_t(cursor->tree_height / 16 + 3),
				       FSP_NORMAL, mtr);
	if (err != DB_SUCCESS) {
		return err;
	}

	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
				   index->table->not_redundant(),
				   dtuple_get_n_fields(entry),
				   btr_cur_get_block(cursor)->zip_size())
	    || UNIV_UNLIKELY(entry->is_alter_metadata()
			     && !dfield_is_ext(
				     dtuple_get_nth_field(
					     entry,
					     index->first_user_field())))) {
		/* The record is so big that we have to store some fields
		externally on separate database pages */

		if (UNIV_LIKELY_NULL(big_rec_vec)) {
			/* This should never happen, but we handle
			the situation in a robust manner. */
			ut_ad(0);
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);

		if (big_rec_vec == NULL) {

			index->table->space->release_free_extents(n_reserved);
			return(DB_TOO_BIG_RECORD);
		}
	}

	if (index->page == btr_cur_get_block(cursor)->page.id().page_no()) {
		*rec = index->is_spatial()
			? rtr_root_raise_and_insert(flags, cursor, offsets,
						    heap, entry, n_ext, mtr,
						    &err, thr)
			: btr_root_raise_and_insert(flags, cursor, offsets,
						    heap, entry, n_ext, mtr,
						    &err);
	} else if (index->is_spatial()) {
		*rec = rtr_page_split_and_insert(flags, cursor, offsets, heap,
						 entry, n_ext, mtr, &err, thr);
	} else {
		*rec = btr_page_split_and_insert(flags, cursor, offsets, heap,
						 entry, n_ext, mtr, &err);
	}

	if (!*rec) {
		goto func_exit;
	}

	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
	      || dict_index_is_spatial(index));

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		ut_ad(!index->table->is_temporary());
		if (dict_index_is_spatial(index)) {
			/* Do nothing */
		} else {
			/* The cursor might be moved to the other page
			and the max trx id field should be updated after
			the cursor was fixed. */
			if (!dict_index_is_clust(index)) {
				page_update_max_trx_id(
					btr_cur_get_block(cursor),
					btr_cur_get_page_zip(cursor),
					thr_get_trx(thr)->id, mtr);
			}

			if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
			    || !page_has_prev(btr_cur_get_page(cursor))) {
				/* After a split, lock_update_insert()
				must always be called for the inserted
				record. */
				inherit = true;
			}
		}
	}

	if (!page_is_leaf(btr_cur_get_page(cursor))) {
		ut_ad(!big_rec_vec);
	} else {
#ifdef BTR_CUR_HASH_ADAPT
		if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
			ut_ad(entry->is_metadata());
			ut_ad(index->is_instant());
			ut_ad(flags & BTR_NO_LOCKING_FLAG);
			ut_ad(!(flags & BTR_CREATE_FLAG));
		} else if (!index->table->is_temporary()) {
			btr_search_update_hash_on_insert(cursor, false);
		}
#endif /* BTR_CUR_HASH_ADAPT */
		if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {

			lock_update_insert(btr_cur_get_block(cursor), *rec);
		}
	}

	err = DB_SUCCESS;
func_exit:
	index->table->space->release_free_extents(n_reserved);
	*big_rec = big_rec_vec;

	return err;
}

/*==================== B-TREE UPDATE =========================*/

/*************************************************************//**
For an update, checks the locks and does the undo logging.
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
dberr_t
btr_cur_upd_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
	const rec_offs*	offsets,/*!< in: rec_get_offsets() on cursor */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread
				(can be NULL if BTR_NO_LOCKING_FLAG) */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
{
	dict_index_t*	index;
	const rec_t*	rec;
	dberr_t		err;

	ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));

	rec = btr_cur_get_rec(cursor);
	index = cursor->index();

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr->is_named_space(index->table->space));

	if (!dict_index_is_clust(index)) {
		ut_ad(dict_index_is_online_ddl(index)
		      == !!(flags & BTR_CREATE_FLAG));

		/* We do undo logging only when we update a clustered index
		record */
		return(lock_sec_rec_modify_check_and_lock(
			       flags, btr_cur_get_block(cursor), rec,
			       index, thr, mtr));
	}

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		err = lock_clust_rec_modify_check_and_lock(
			btr_cur_get_block(cursor), rec, index,
			offsets, thr);
		if (err != DB_SUCCESS) {
			return(err);
		}
	}

	/* Append the info about the update in the undo log */

	return((flags & BTR_NO_UNDO_LOG_FLAG)
	       ? DB_SUCCESS
	       : trx_undo_report_row_operation(
		       thr, index, NULL, update,
		       cmpl_info, rec, offsets, roll_ptr));
}

/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
@param[in,out]	entry		clustered index entry
@param[in]	index		clustered index
@param[in]	trx_id		DB_TRX_ID
@param[in]	roll_ptr	DB_ROLL_PTR */
static void btr_cur_write_sys(
	dtuple_t*		entry,
	const dict_index_t*	index,
	trx_id_t		trx_id,
	roll_ptr_t		roll_ptr)
{
	dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
	ut_ad(t->len == DATA_TRX_ID_LEN);
	trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
	dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
	ut_ad(r->len == DATA_ROLL_PTR_LEN);
	trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
}

MY_ATTRIBUTE((warn_unused_result))
/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
@param[in,out]  block           clustered index leaf page
@param[in,out]  rec             clustered index record
@param[in]      index           clustered index
@param[in]      offsets         rec_get_offsets(rec, index)
@param[in]      trx             transaction
@param[in]      roll_ptr        DB_ROLL_PTR value
@param[in,out]  mtr             mini-transaction
@return error code */
static dberr_t btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
                                   dict_index_t *index, const rec_offs *offsets,
                                   const trx_t *trx, roll_ptr_t roll_ptr,
                                   mtr_t *mtr)
{
  ut_ad(index->is_primary());
  ut_ad(rec_offs_validate(rec, index, offsets));

  if (UNIV_LIKELY_NULL(block->page.zip.data))
  {
    page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
                                       trx->id, roll_ptr, mtr);
    return DB_SUCCESS;
  }

  ulint offset= index->trx_id_offset;

  if (!offset)
    offset= row_get_trx_id_offset(index, offsets);

  compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);

  /* During IMPORT the trx id in the record can be in the future, if
  the .ibd file is being imported from another instance. During IMPORT
  roll_ptr will be 0. */
  ut_ad(roll_ptr == 0 ||
        lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
                                 rec, index, offsets));

  byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];

  trx_write_trx_id(sys, trx->id);
  trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);

  ulint d= 0;
  const byte *src= nullptr;
  byte *dest= rec + offset;
  ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;

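  /* Explanatory note (added): the block below checks whether the
  leading bytes of the new DB_TRX_ID,DB_ROLL_PTR already occur at the
  same offset in the preceding record. If enough of them match, the
  redo log can encode the copy as a short intra-page MEMMOVE plus a
  shorter WRITE instead of writing all 13 bytes; see the comment
  inside the block for the exact accounting. */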
  if (UNIV_LIKELY(index->trx_id_offset))
  {
    const rec_t *prev= page_rec_get_prev_const(rec);
    if (UNIV_UNLIKELY(!prev || prev == rec))
      return DB_CORRUPTION;
    else if (page_rec_is_infimum(prev));
    else
      for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
        if (src[d] != sys[d])
          break;
    if (d > 6 && memcmp(dest, sys, d))
    {
      /* We save space by replacing a single record

      WRITE,page_offset(dest),byte[13]

      with two records:

      MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
      WRITE|0x80,0,byte[13-d]

      The single WRITE record would be x+13 bytes long, with x>2.
      The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
      second WRITE would be 1+1+13-d = 15-d bytes.

      The total size is: x+13 versus x+4+15-d = x+19-d bytes.
      To save space, we must have d>6, that is, the complete DB_TRX_ID and
      the first byte(s) of DB_ROLL_PTR must match the previous record. */
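      /* Worked example (added, illustrative): with d = 7 the split
      form costs x+19-7 = x+12 bytes instead of x+13, saving one byte;
      with d = 6 both forms cost x+13 bytes, hence the requirement
      d > 6 above. */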
      memcpy(dest, src, d);
      mtr->memmove(*block, dest - block->page.frame, src - block->page.frame,
                   d);
      dest+= d;
      len-= d;
      /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
      DB_TRX_ID refers to an active transaction. */
      ut_ad(len);
    }
    else
      d= 0;
  }

  if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
    mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len);

  return DB_SUCCESS;
}

/*************************************************************//**
See if there is enough space in the page modification log to log
an update-in-place.

@retval false if out of space
@retval true if there is enough space */
bool
btr_cur_update_alloc_zip_func(
/*==========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
#ifdef UNIV_DEBUG
	rec_offs*	offsets,/*!< in/out: offsets of the cursor record */
#endif /* UNIV_DEBUG */
	ulint		length,	/*!< in: size needed */
	bool		create,	/*!< in: true=delete-and-insert,
				false=update-in-place */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	dict_index_t*	index = cursor->index;

	/* Have a local copy of the variables as these can change
	dynamically. */
	const page_t*	page = page_cur_get_page(cursor);

	ut_ad(page_zip == page_cur_get_page_zip(cursor));
	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));

	if (page_zip_available(page_zip, dict_index_is_clust(index),
			       length, create)) {
		return(true);
	}

	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
		/* The page has been freshly compressed, so
		reorganizing it will not help. */
		return(false);
	}

	if (create && page_is_leaf(page)
	    && (length + page_get_data_size(page)
		>= dict_index_zip_pad_optimal_page_size(index))) {
		return(false);
	}

	if (btr_page_reorganize(cursor, mtr) == DB_SUCCESS) {
		rec_offs_make_valid(page_cur_get_rec(cursor), index,
				    page_is_leaf(page), offsets);

		return page_zip_available(page_zip, dict_index_is_clust(index),
					  length, create);
	}

	return(false);
}

/** Apply an update vector to a record. No field size changes are allowed.

This is usually invoked on a clustered index. The only use case for a
secondary index is row_ins_sec_index_entry_by_modify() or its
counterpart in ibuf_insert_to_index_page().
@param[in,out]  rec     index record
@param[in]      index   the index of the record
@param[in]      offsets rec_get_offsets(rec, index)
@param[in]      update  update vector
@param[in,out]  block   index page
@param[in,out]  mtr     mini-transaction */
void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
                              const rec_offs *offsets, const upd_t *update,
                              buf_block_t *block, mtr_t *mtr)
{
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!index->table->skip_alter_undo);
	ut_ad(!block->page.zip.data || index->table->not_redundant());

#ifdef UNIV_DEBUG
	if (rec_offs_comp(offsets)) {
		switch (rec_get_status(rec)) {
		case REC_STATUS_ORDINARY:
			break;
		case REC_STATUS_INSTANT:
			ut_ad(index->is_instant());
			break;
		case REC_STATUS_NODE_PTR:
		case REC_STATUS_INFIMUM:
		case REC_STATUS_SUPREMUM:
			ut_ad("wrong record status in update" == 0);
		}
	}
#endif /* UNIV_DEBUG */

	static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility");
	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
		ut_ad(rec_offs_comp(offsets));
		byte* info_bits = &rec[-REC_NEW_INFO_BITS];
		const bool flip_del_mark = (*info_bits ^ update->info_bits)
			& REC_INFO_DELETED_FLAG;
		*info_bits &= byte(~REC_INFO_BITS_MASK);
		*info_bits |= update->info_bits;

		if (flip_del_mark) {
			page_zip_rec_set_deleted(block, rec, update->info_bits
						 & REC_INFO_DELETED_FLAG, mtr);
		}
	} else {
		byte* info_bits = &rec[rec_offs_comp(offsets)
				       ? -REC_NEW_INFO_BITS
				       : -REC_OLD_INFO_BITS];

		mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits,
					       (*info_bits
						& ~REC_INFO_BITS_MASK)
					       | update->info_bits);
	}

	for (ulint i = 0; i < update->n_fields; i++) {
		const upd_field_t* uf = upd_get_nth_field(update, i);
		if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) {
			continue;
		}
		const ulint n = uf->field_no;

		ut_ad(!dfield_is_ext(&uf->new_val)
		      == !rec_offs_nth_extern(offsets, n));
		ut_ad(!rec_offs_nth_default(offsets, n));

		if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
			if (rec_offs_nth_sql_null(offsets, n)) {
				ut_ad(index->table->is_instant());
				ut_ad(n >= index->n_core_fields);
				continue;
			}

			ut_ad(!index->table->not_redundant());
			switch (ulint size = rec_get_nth_field_size(rec, n)) {
			case 0:
				break;
			case 1:
				mtr->write<1,mtr_t::MAYBE_NOP>(
					*block,
					rec_get_field_start_offs(rec, n) + rec,
					0U);
				break;
			default:
				mtr->memset(
					block,
					rec_get_field_start_offs(rec, n) + rec
					- block->page.frame,
					size, 0);
			}
			ulint l = rec_get_1byte_offs_flag(rec)
				? (n + 1) : (n + 1) * 2;
			byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
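			/* Explanatory note (added; an assumption about
			the ROW_FORMAT=REDUNDANT header layout): b points
			at the end-offset slot of field n in the old-style
			record header, and its most significant bit serves
			as the SQL NULL flag, which is set below. */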
			compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
					    == REC_2BYTE_SQL_NULL_MASK);
			mtr->write<1>(*block, b,
				      byte(*b | REC_1BYTE_SQL_NULL_MASK));
			continue;
		}

		ulint len;
		byte* data = rec_get_nth_field(rec, offsets, n, &len);
		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
			ut_ad(len == uf->new_val.len);
			memcpy(data, uf->new_val.data, len);
			continue;
		}

		if (UNIV_UNLIKELY(len != uf->new_val.len)) {
			ut_ad(len == UNIV_SQL_NULL);
			ut_ad(!rec_offs_comp(offsets));
			len = uf->new_val.len;
			ut_ad(len == rec_get_nth_field_size(rec, n));
			ulint l = rec_get_1byte_offs_flag(rec)
				? (n + 1) : (n + 1) * 2;
			byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
			compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
					    == REC_2BYTE_SQL_NULL_MASK);
			mtr->write<1>(*block, b,
				      byte(*b & ~REC_1BYTE_SQL_NULL_MASK));
		}

		if (len) {
			mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data,
						      uf->new_val.data, len);
		}
	}

	if (UNIV_LIKELY(!block->page.zip.data)) {
		return;
	}

	switch (update->n_fields) {
	case 0:
		/* We only changed the delete-mark flag. */
		return;
	case 1:
		if (!index->is_clust()
		    || update->fields[0].field_no != index->db_roll_ptr()) {
			break;
		}
		goto update_sys;
	case 2:
		if (!index->is_clust()
		    || update->fields[0].field_no != index->db_trx_id()
		    || update->fields[1].field_no != index->db_roll_ptr()) {
			break;
		}
	update_sys:
		ulint len;
		const byte* sys = rec_get_nth_field(rec, offsets,
						    index->db_trx_id(), &len);
		ut_ad(len == DATA_TRX_ID_LEN);
		page_zip_write_trx_id_and_roll_ptr(
			block, rec, offsets, index->db_trx_id(),
			trx_read_trx_id(sys),
			trx_read_roll_ptr(sys + DATA_TRX_ID_LEN), mtr);
		return;
	}

	page_zip_write_rec(block, rec, index, offsets, 0, mtr);
}

/** Check if a ROW_FORMAT=COMPRESSED page can be updated in place
@param cur     cursor pointing to ROW_FORMAT=COMPRESSED page
@param offsets rec_get_offsets(btr_cur_get_rec(cur))
@param update  index fields being updated
@param mtr     mini-transaction
@return the record in the ROW_FORMAT=COMPRESSED page
@retval nullptr if the page cannot be updated in place */
ATTRIBUTE_COLD static
rec_t *btr_cur_update_in_place_zip_check(btr_cur_t *cur, rec_offs *offsets,
                                         const upd_t& update, mtr_t *mtr)
{
  dict_index_t *index= cur->index();
  ut_ad(!index->table->is_temporary());

  switch (update.n_fields) {
  case 0:
    /* We are only changing the delete-mark flag. */
    break;
  case 1:
    if (!index->is_clust() ||
        update.fields[0].field_no != index->db_roll_ptr())
      goto check_for_overflow;
    /* We are only changing the delete-mark flag and DB_ROLL_PTR. */
    break;
  case 2:
    if (!index->is_clust() ||
        update.fields[0].field_no != index->db_trx_id() ||
        update.fields[1].field_no != index->db_roll_ptr())
      goto check_for_overflow;
    /* We are only changing DB_TRX_ID, DB_ROLL_PTR, and the delete-mark.
    They can be updated in place in the uncompressed part of the
    ROW_FORMAT=COMPRESSED page. */
    break;
  check_for_overflow:
  default:
    if (!btr_cur_update_alloc_zip(btr_cur_get_page_zip(cur),
                                  btr_cur_get_page_cur(cur),
                                  offsets, rec_offs_size(offsets),
                                  false, mtr))
      return nullptr;
  }

  return btr_cur_get_rec(cur);
}

/*************************************************************//**
Updates a record when the update causes no size changes in its fields.
We assume here that the ordering fields of the record do not change.
@return locking or undo log related error code, or
@retval DB_SUCCESS on success
@retval DB_ZIP_OVERFLOW if there is not enough space left
on a ROW_FORMAT=COMPRESSED page */
dberr_t
btr_cur_update_in_place(
/*====================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	rec_offs*	offsets,/*!< in/out: offsets on cursor->page_cur.rec */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	trx_id_t	trx_id,	/*!< in: transaction id */
	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
				is a secondary index, the caller must
				mtr_commit(mtr) before latching any
				further pages */
{
	dict_index_t*	index;
	rec_t*		rec;
	roll_ptr_t	roll_ptr	= 0;
	ulint		was_delete_marked;

	ut_ad(page_is_leaf(cursor->page_cur.block->page.frame));
	rec = btr_cur_get_rec(cursor);
	index = cursor->index();
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
	      || index->table->is_temporary());
	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
	      || index->is_primary());
	ut_ad(thr_get_trx(thr)->id == trx_id
	      || (flags & ulint(~BTR_KEEP_POS_FLAG))
	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
	ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
	ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG));

	DBUG_LOG("ib_cur",
		 "update-in-place " << index->name << " (" << index->id
		 << ") by " << ib::hex(trx_id) << ": "
		 << rec_printer(rec, offsets).str());

	buf_block_t* block = btr_cur_get_block(cursor);
	page_zip_des_t*	page_zip = buf_block_get_page_zip(block);

	/* Check that enough space is available on the compressed page. */
	if (UNIV_LIKELY_NULL(page_zip)
	    && !(rec = btr_cur_update_in_place_zip_check(
			 cursor, offsets, *update, mtr))) {
		return DB_ZIP_OVERFLOW;
	}

	/* Do lock checking and undo logging */
	if (dberr_t err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
						    update, cmpl_info,
						    thr, mtr, &roll_ptr)) {
		return err;
	}

	if (flags & BTR_KEEP_SYS_FLAG) {
	} else if (dberr_t err = btr_cur_upd_rec_sys(block, rec, index, offsets,
						     thr_get_trx(thr),
						     roll_ptr, mtr)) {
		return err;
	}

	was_delete_marked = rec_get_deleted_flag(
		rec, page_is_comp(buf_block_get_frame(block)));
	/* In delete-marked records, DB_TRX_ID must always refer to an
	existing undo log record. */
	ut_ad(!was_delete_marked
	      || !dict_index_is_clust(index)
	      || row_get_rec_trx_id(rec, index, offsets));

#ifdef BTR_CUR_HASH_ADAPT
	{
		auto part = block->index
			? &btr_search.get_part(*index) : nullptr;
		if (part) {
			/* TO DO: Can we skip this if none of the fields
			index->search_info->curr_n_fields
			are being updated? */

			/* The function row_upd_changes_ord_field_binary
			does not work on a secondary index. */

			if (!dict_index_is_clust(index)
			    || row_upd_changes_ord_field_binary(
				    index, update, thr, NULL, NULL)) {
				ut_ad(!(update->info_bits
					& REC_INFO_MIN_REC_FLAG));
				/* Remove possible hash index pointer
				to this record */
				btr_search_update_hash_on_delete(cursor);
			}

			part->latch.wr_lock(SRW_LOCK_CALL);
		}

		assert_block_ahi_valid(block);
#endif /* BTR_CUR_HASH_ADAPT */

		btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
					 mtr);

#ifdef BTR_CUR_HASH_ADAPT
		if (part) {
			part->latch.wr_unlock();
		}
	}
#endif /* BTR_CUR_HASH_ADAPT */

	if (was_delete_marked
	    && !rec_get_deleted_flag(
		    rec, page_is_comp(buf_block_get_frame(block)))) {
		/* The new updated record owns its possible externally
		stored fields */

		btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
	}

	return DB_SUCCESS;
}

/** Trim a metadata record during the rollback of instant ALTER TABLE.
@param[in]	entry	metadata tuple
@param[in]	index	primary key
@param[in]	update	update vector for the rollback */
ATTRIBUTE_COLD
static void btr_cur_trim_alter_metadata(dtuple_t* entry,
					const dict_index_t* index,
					const upd_t* update)
{
	ut_ad(index->is_instant());
	ut_ad(update->is_alter_metadata());
	ut_ad(entry->is_alter_metadata());

	ut_ad(update->fields[0].field_no == index->first_user_field());
	ut_ad(update->fields[0].new_val.ext);
	ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
	ut_ad(entry->n_fields - 1 == index->n_fields);

	const byte* ptr = static_cast<const byte*>(
		update->fields[0].new_val.data);
	ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
	      == index->table->space->id);

	ulint n_fields = update->fields[1].field_no;
	ut_ad(n_fields <= index->n_fields);
	if (n_fields != index->n_uniq) {
		ut_ad(n_fields
		      >= index->n_core_fields);
		entry->n_fields = uint16_t(n_fields);
		return;
	}

	/* This is based on dict_table_t::deserialise_columns()
	and btr_cur_instant_init_low(). */
	mtr_t mtr;
	mtr.start();
	buf_block_t* block = buf_page_get(
		page_id_t(index->table->space->id,
			  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
		0, RW_S_LATCH, &mtr);
	if (!block) {
		ut_ad("corruption" == 0);
		mtr.commit();
		return;
	}

	btr_search_drop_page_hash_index(block, index);

	ut_ad(fil_page_get_type(block->page.frame) == FIL_PAGE_TYPE_BLOB);
	ut_ad(mach_read_from_4(&block->page.frame
			       [FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO])
	      == FIL_NULL);
	ut_ad(mach_read_from_4(&block->page.frame
			       [FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN])
	      == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
	n_fields = mach_read_from_4(
		&block->page.frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
		+ index->first_user_field();
	/* Rollback should not increase the number of fields. */
	ut_ad(n_fields <= index->n_fields);
	ut_ad(n_fields + 1 <= entry->n_fields);
	/* dict_index_t::clear_instant_alter() cannot be invoked while
	rollback of an instant ALTER TABLE transaction is in progress
	for an is_alter_metadata() record. */
	ut_ad(n_fields >= index->n_core_fields);

	mtr.commit();
	entry->n_fields = uint16_t(n_fields + 1);
}

/** Trim an update tuple due to instant ADD COLUMN, if needed.
For normal records, the trailing instantly added fields that match
the initial default values are omitted.

For the special metadata record on a table on which instant
ADD COLUMN has already been executed, both ADD COLUMN and the
rollback of ADD COLUMN need to be handled specially.

@param[in,out]	entry	index entry
@param[in]	index	index
@param[in]	update	update vector
@param[in]	thr	execution thread */
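/* Example (added, illustrative): if instant ADD COLUMN appended a
column c4 with default d4, an entry (c1,c2,c3,c4=d4) for a normal
record is trimmed to (c1,c2,c3) by entry->trim(*index) below. */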
static inline
void
btr_cur_trim(
	dtuple_t*		entry,
	const dict_index_t*	index,
	const upd_t*		update,
	const que_thr_t*	thr)
{
	if (!index->is_instant()) {
	} else if (UNIV_UNLIKELY(update->is_metadata())) {
		/* We are either updating a metadata record
		(instant ALTER TABLE on a table where instant ALTER was
		already executed) or rolling back such an operation. */
		ut_ad(!upd_get_nth_field(update, 0)->orig_len);
		ut_ad(entry->is_metadata());

		if (thr->graph->trx->in_rollback) {
			/* This rollback can occur either as part of
			ha_innobase::commit_inplace_alter_table() rolling
			back after a failed innobase_add_instant_try(),
			or as part of crash recovery. Either way, the
			table will be in the data dictionary cache, with
			the instantly added columns going to be removed
			later in the rollback. */
			ut_ad(index->table->cached);
			/* The DB_TRX_ID,DB_ROLL_PTR are always last,
			and there should be some change to roll back.
			The first field in the update vector is the
			first instantly added column logged by
			innobase_add_instant_try(). */
			ut_ad(update->n_fields > 2);
			if (update->is_alter_metadata()) {
				btr_cur_trim_alter_metadata(
					entry, index, update);
				return;
			}
			ut_ad(!entry->is_alter_metadata());

			ulint n_fields = upd_get_nth_field(update, 0)
				->field_no;
			ut_ad(n_fields + 1 >= entry->n_fields);
			entry->n_fields = uint16_t(n_fields);
		}
	} else {
		entry->trim(*index);
	}
}

/*************************************************************//**
Tries to update a record on a page in an index tree. It is assumed that mtr
holds an x-latch on the page. The operation does not succeed if there is too
little space on the page or if the update would result in too empty a page,
so that tree compression is recommended. We assume here that the ordering
fields of the record do not change.
@return error code, including
@retval DB_SUCCESS on success
@retval DB_OVERFLOW if the updated record does not fit
@retval DB_UNDERFLOW if the page would become too empty
@retval DB_ZIP_OVERFLOW if there is not enough space left
on a ROW_FORMAT=COMPRESSED page */
dberr_t
btr_cur_optimistic_update(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	rec_offs**	offsets,/*!< out: offsets on cursor->page_cur.rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
	const upd_t*	update,	/*!< in: update vector; this must also
				contain trx id and roll ptr fields */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	trx_id_t	trx_id,	/*!< in: transaction id */
	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
				is a secondary index, the caller must
				mtr_commit(mtr) before latching any
				further pages */
{
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	dberr_t		err;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	ulint		max_size;
	ulint		new_rec_size;
	ulint		old_rec_size;
	dtuple_t*	new_entry;
	roll_ptr_t	roll_ptr;
	ulint		i;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	rec = btr_cur_get_rec(cursor);
	index = cursor->index();
	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
	      || index->table->is_temporary());
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
	/* This is intended only for leaf page updates */
	ut_ad(page_is_leaf(page));
	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(index));
	ut_ad(thr_get_trx(thr)->id == trx_id
	      || (flags & ulint(~BTR_KEEP_POS_FLAG))
	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
	ut_ad(fil_page_index_page_check(page));
	ut_ad(btr_page_get_index_id(page) == index->id);

	*offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
				   ULINT_UNDEFINED, heap);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	ut_a(!rec_offs_any_null_extern(rec, *offsets)
	     || thr_get_trx(thr) == trx_roll_crash_recv_trx);
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

	if (UNIV_LIKELY(!update->is_metadata())
	    && !row_upd_changes_field_size_or_external(index, *offsets,
						       update)) {

		/* The simplest and the most common case: the update does not
		change the size of any field and none of the updated fields is
		externally stored in rec or update, and there is enough space
		on the compressed page to log the update. */

		return(btr_cur_update_in_place(
			       flags, cursor, *offsets, update,
			       cmpl_info, thr, trx_id, mtr));
	}

	if (rec_offs_any_extern(*offsets)) {
any_extern:
		/* Externally stored fields are treated in pessimistic
		update */

		/* prefetch siblings of the leaf for the pessimistic
		operation. */
		btr_cur_prefetch_siblings(block, index);

		return(DB_OVERFLOW);
	}

	if (rec_is_metadata(rec, *index) && index->table->instant) {
		goto any_extern;
	}

	for (i = 0; i < upd_get_n_fields(update); i++) {
		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {

			goto any_extern;
		}
	}

	DBUG_LOG("ib_cur",
		 "update " << index->name << " (" << index->id << ") by "
		 << ib::hex(trx_id) << ": "
		 << rec_printer(rec, *offsets).str());

	page_cursor = btr_cur_get_page_cur(cursor);

	if (!*heap) {
		*heap = mem_heap_create(
			rec_offs_size(*offsets)
			+ DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
	}

	new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap);
	ut_ad(!dtuple_get_n_ext(new_entry));

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.
	Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     *heap);
	btr_cur_trim(new_entry, index, update, thr);
	old_rec_size = rec_offs_size(*offsets);
	new_rec_size = rec_get_converted_size(index, new_entry, 0);

	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	if (page_zip) {
		ut_ad(!index->table->is_temporary());

		if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
					   dict_index_get_n_fields(index),
					   block->zip_size())) {
			goto any_extern;
		}

		if (!btr_cur_update_alloc_zip(
			    page_zip, page_cursor, *offsets,
			    new_rec_size, true, mtr)) {
			return(DB_ZIP_OVERFLOW);
		}

		rec = page_cur_get_rec(page_cursor);
	}

	/* We limit max record size to 16k even for 64k page size. */
	if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
			(!dict_table_is_comp(index->table)
			 && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
		err = DB_OVERFLOW;
		goto func_exit;
	}

	if (UNIV_UNLIKELY(new_rec_size
			  >= (page_get_free_space_of_empty(page_is_comp(page))
			      / 2))) {
		err = DB_OVERFLOW;
		goto func_exit;
	}

	if (UNIV_UNLIKELY(page_get_data_size(page)
			  - old_rec_size + new_rec_size
			  < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
		/* The page would become too empty */
		err = DB_UNDERFLOW;
		goto func_exit;
	}

	/* We do not attempt to reorganize if the page is compressed.
	This is because the page may fail to compress after reorganization. */
	max_size = page_zip
		? page_get_max_insert_size(page, 1)
		: (old_rec_size
		   + page_get_max_insert_size_after_reorganize(page, 1));

	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
	       && (max_size >= new_rec_size))
	      || (page_get_n_recs(page) <= 1))) {

		/* There was not enough space, or it did not pay to
		reorganize: for simplicity, we decide what to do assuming a
		reorganization is needed, though it might not be necessary */

		err = DB_OVERFLOW;
		goto func_exit;
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
					update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {
		goto func_exit;
	}

	/* Ok, we may do the replacement. Store on the page infimum the
	explicit locks on rec, before deleting rec (see the comment in
	btr_cur_pessimistic_update). */
	if (index->has_locking()) {
		lock_rec_store_on_page_infimum(block, rec);
	}

	if (UNIV_UNLIKELY(update->is_metadata())) {
		ut_ad(new_entry->is_metadata());
		ut_ad(index->is_instant());
		/* This can be innobase_add_instant_try() performing a
		subsequent instant ADD COLUMN, or its rollback by
		row_undo_mod_clust_low(). */
		ut_ad(flags & BTR_NO_LOCKING_FLAG);
	} else {
		btr_search_update_hash_on_delete(cursor);
	}

	page_cur_delete_rec(page_cursor, *offsets, mtr);

	if (!page_cur_move_to_prev(page_cursor)) {
		return DB_CORRUPTION;
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
	}

	rec = btr_cur_insert_if_possible(cursor, new_entry, offsets, heap,
					 0/*n_ext*/, mtr);
	if (UNIV_UNLIKELY(!rec)) {
		goto corrupted;
	}

	if (UNIV_UNLIKELY(update->is_metadata())) {
		/* We must empty the PAGE_FREE list, because if this
		was a rollback, the shortened metadata record
		would have too many fields, and we would be unable to
		know the size of the freed record. */
		err = btr_page_reorganize(page_cursor, mtr);
		if (err != DB_SUCCESS) {
			goto func_exit;
		}
	} else {
		/* Restore the old explicit lock state on the record */
		lock_rec_restore_from_page_infimum(*block, rec,
						   block->page.id());
	}

	ut_ad(err == DB_SUCCESS);
	if (!page_cur_move_to_next(page_cursor)) {
corrupted:
		return DB_CORRUPTION;
	}

	if (err != DB_SUCCESS) {
func_exit:
		/* prefetch siblings of the leaf for the pessimistic
		operation. */
		btr_cur_prefetch_siblings(block, index);
	}

	return(err);
}

/*************************************************************//**
If, in a split, a new supremum record was created as the predecessor of the
updated record, the supremum record must inherit exactly the locks on the
updated record. In the split it may have inherited locks from the successor
of the updated record, which is not correct. This function restores the
right locks for the new supremum. */
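/* Example scenario (added, illustrative): when a page split moves the
updated record to the beginning of the right-hand page, the supremum of
the left-hand page becomes its predecessor and must inherit exactly the
gap locks that were attached to the updated record. */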
 | 
						|
static
 | 
						|
dberr_t
 | 
						|
btr_cur_pess_upd_restore_supremum(
 | 
						|
/*==============================*/
 | 
						|
	buf_block_t*	block,	/*!< in: buffer block of rec */
 | 
						|
	const rec_t*	rec,	/*!< in: updated record */
 | 
						|
	mtr_t*		mtr)	/*!< in: mtr */
 | 
						|
{
 | 
						|
	page_t*		page;
 | 
						|
 | 
						|
	page = buf_block_get_frame(block);
 | 
						|
 | 
						|
	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
 | 
						|
		/* Updated record is not the first user record on its page */
 | 
						|
		return DB_SUCCESS;
 | 
						|
	}
 | 
						|
 | 
						|
	const uint32_t	prev_page_no = btr_page_get_prev(page);
 | 
						|
 | 
						|
	const page_id_t block_id{block->page.id()};
 | 
						|
	const page_id_t	prev_id(block_id.space(), prev_page_no);
 | 
						|
	buf_block_t* prev_block
 | 
						|
		= mtr->get_already_latched(prev_id, MTR_MEMO_PAGE_X_FIX);
 | 
						|
	if (UNIV_UNLIKELY(!prev_block)) {
 | 
						|
		return DB_CORRUPTION;
 | 
						|
	}
 | 
						|
	ut_ad(!memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT,
 | 
						|
				 block->page.frame + FIL_PAGE_OFFSET, 4));
 | 
						|
 | 
						|
	lock_rec_reset_and_inherit_gap_locks(*prev_block, block_id,
 | 
						|
					     PAGE_HEAP_NO_SUPREMUM,
 | 
						|
					     page_is_comp(page)
 | 
						|
					     ? rec_get_heap_no_new(rec)
 | 
						|
					     : rec_get_heap_no_old(rec));
 | 
						|
	return DB_SUCCESS;
 | 
						|
}
 | 
						|
 | 
						|
/*************************************************************//**
 | 
						|
Performs an update of a record on a page of a tree. It is assumed
 | 
						|
that mtr holds an x-latch on the tree and on the cursor page. If the
 | 
						|
update is made on the leaf level, to avoid deadlocks, mtr must also
 | 
						|
own x-latches to brothers of page, if those brothers exist. We assume
 | 
						|
here that the ordering fields of the record do not change.
 | 
						|
@return DB_SUCCESS or error code */
 | 
						|
dberr_t
 | 
						|
btr_cur_pessimistic_update(
 | 
						|
/*=======================*/
 | 
						|
	ulint		flags,	/*!< in: undo logging, locking, and rollback
 | 
						|
				flags */
 | 
						|
	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
 | 
						|
				cursor may become invalid if *big_rec == NULL
 | 
						|
				|| !(flags & BTR_KEEP_POS_FLAG) */
 | 
						|
	rec_offs**	offsets,/*!< out: offsets on cursor->page_cur.rec */
 | 
						|
	mem_heap_t**	offsets_heap,
 | 
						|
				/*!< in/out: pointer to memory heap
 | 
						|
				that can be emptied */
 | 
						|
	mem_heap_t*	entry_heap,
 | 
						|
				/*!< in/out: memory heap for allocating
 | 
						|
				big_rec and the index tuple */
 | 
						|
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
 | 
						|
				be stored externally by the caller */
 | 
						|
	upd_t*		update,	/*!< in/out: update vector; this is allowed to
 | 
						|
				also contain trx id and roll ptr fields.
 | 
						|
				Non-updated columns that are moved offpage will
 | 
						|
				be appended to this. */
 | 
						|
	ulint		cmpl_info,/*!< in: compiler info on secondary index
 | 
						|
				updates */
 | 
						|
	que_thr_t*	thr,	/*!< in: query thread */
 | 
						|
	trx_id_t	trx_id,	/*!< in: transaction id */
 | 
						|
	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
 | 
						|
				committed before latching any further pages */
 | 
						|
{
 | 
						|
	big_rec_t*	big_rec_vec	= NULL;
 | 
						|
	big_rec_t*	dummy_big_rec;
 | 
						|
	dict_index_t*	index;
 | 
						|
	buf_block_t*	block;
 | 
						|
	rec_t*		rec;
 | 
						|
	page_cur_t*	page_cursor;
 | 
						|
	dberr_t		err;
 | 
						|
	dberr_t		optim_err;
 | 
						|
	roll_ptr_t	roll_ptr;
 | 
						|
	bool		was_first;
 | 
						|
	uint32_t	n_reserved	= 0;
 | 
						|
 | 
						|
	*offsets = NULL;
 | 
						|
	*big_rec = NULL;
 | 
						|
 | 
						|
	block = btr_cur_get_block(cursor);
 | 
						|
	index = cursor->index();
 | 
						|
 | 
						|
	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
					 MTR_MEMO_SX_LOCK));
	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
#if defined UNIV_ZIP_DEBUG || defined UNIV_DEBUG
	page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
#endif
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip
	     || page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
	ut_ad(!page_zip || !index->table->is_temporary());
	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
	      || index->table->is_temporary());
	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(index));
	ut_ad(thr_get_trx(thr)->id == trx_id
	      || (flags & ulint(~BTR_KEEP_POS_FLAG))
	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));

	err = optim_err = btr_cur_optimistic_update(
		flags,
		cursor, offsets, offsets_heap, update,
		cmpl_info, thr, trx_id, mtr);

	switch (err) {
	case DB_ZIP_OVERFLOW:
	case DB_UNDERFLOW:
	case DB_OVERFLOW:
		break;
	default:
	err_exit:
		if (big_rec_vec != NULL) {
			dtuple_big_rec_free(big_rec_vec);
		}

		return(err);
	}

	rec = btr_cur_get_rec(cursor);
	ut_ad(rec_offs_validate(rec, index, *offsets));

	dtuple_t* new_entry;

	const bool is_metadata = rec_is_metadata(rec, *index);

	if (UNIV_UNLIKELY(is_metadata)) {
		ut_ad(update->is_metadata());
		ut_ad(flags & BTR_NO_LOCKING_FLAG);
		ut_ad(index->is_instant());
		new_entry = row_metadata_to_tuple(
			rec, index, *offsets, entry_heap,
			update->info_bits, !thr_get_trx(thr)->in_rollback);
		ut_ad(new_entry->n_fields
		      == ulint(index->n_fields)
		      + update->is_alter_metadata());
	} else {
		new_entry = row_rec_to_index_entry(rec, index, *offsets,
						   entry_heap);
	}

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.  If the
	clustered index record is delete-marked, then its externally
	stored fields cannot have been purged yet, because then the
	purge would also have removed the clustered index record
	itself.  Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     entry_heap);
	btr_cur_trim(new_entry, index, update, thr);

	/* We have to set appropriate extern storage bits in the new
	record to be inserted: we have to remember which fields were
	externally stored */

	ut_ad(!page_is_comp(block->page.frame) || !rec_get_node_ptr_flag(rec));
	ut_ad(rec_offs_validate(rec, index, *offsets));

	if ((flags & BTR_NO_UNDO_LOG_FLAG)
	    && rec_offs_any_extern(*offsets)) {
		/* We are in a transaction rollback undoing a row
		update: we must free possible externally stored fields
		which got new values in the update, if they are not
		inherited values. They can be inherited if we have
		updated the primary key to another value, and then
		update it back again. */

		ut_ad(big_rec_vec == NULL);
		ut_ad(dict_index_is_clust(index));
		ut_ad(thr_get_trx(thr)->in_rollback);

		DEBUG_SYNC_C("blob_rollback_middle");

		btr_rec_free_updated_extern_fields(
			index, rec, block, *offsets, update, true, mtr);
	}

	ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0;

	if (page_zip_rec_needs_ext(
		    rec_get_converted_size(index, new_entry, n_ext),
		    page_is_comp(block->page.frame),
		    dict_index_get_n_fields(index),
		    block->zip_size())
	    || (UNIV_UNLIKELY(update->is_alter_metadata())
		&& !dfield_is_ext(dtuple_get_nth_field(
					  new_entry,
					  index->first_user_field())))) {
		big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
#ifdef UNIV_ZIP_DEBUG
			ut_a(!page_zip
			     || page_zip_validate(page_zip, block->page.frame,
						  index));
#endif /* UNIV_ZIP_DEBUG */
			index->table->space->release_free_extents(n_reserved);
			err = DB_TOO_BIG_RECORD;
			goto err_exit;
		}

		ut_ad(page_is_leaf(block->page.frame));
		ut_ad(dict_index_is_clust(index));
		if (UNIV_UNLIKELY(!(flags & BTR_KEEP_POS_FLAG))) {
			ut_ad(page_zip != NULL);
			dtuple_convert_back_big_rec(index, new_entry,
						    big_rec_vec);
			big_rec_vec = NULL;
			n_ext = dtuple_get_n_ext(new_entry);
		}
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
					update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {
		goto err_exit;
	}

	if (optim_err == DB_OVERFLOW) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the update will not fail because
		of lack of space */

		err = fsp_reserve_free_extents(
			&n_reserved, index->table->space,
			uint32_t(cursor->tree_height / 16 + 3),
			flags & BTR_NO_UNDO_LOG_FLAG
			? FSP_CLEANING : FSP_NORMAL,
			mtr);
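		/* Example of the arithmetic above: with
		cursor->tree_height == 5, uint32_t(5 / 16 + 3) == 3
		extents are reserved; one more extent is requested for
		every 16 levels of tree height. */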
		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			err = DB_OUT_OF_FILE_SPACE;
			goto err_exit;
		}
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
	}

	if (UNIV_UNLIKELY(is_metadata)) {
		ut_ad(new_entry->is_metadata());
		ut_ad(index->is_instant());
		/* This can be innobase_add_instant_try() performing a
		subsequent instant ALTER TABLE, or its rollback by
		row_undo_mod_clust_low(). */
		ut_ad(flags & BTR_NO_LOCKING_FLAG);
	} else {
		btr_search_update_hash_on_delete(cursor);

		/* Store the state of explicit locks on rec on the page
		infimum record, before deleting rec. The page infimum
		acts as a dummy carrier of the locks, taking care also
		of lock releases, before we can move the locks back on
		the actual record. There is a special case: if we are
		inserting on the root page and the insert causes a
		call of btr_root_raise_and_insert. Therefore we cannot
		in the lock system delete the lock structs set on the
		root page even if the root page carries just node
		pointers. */
		lock_rec_store_on_page_infimum(block, rec);
	}

#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip
	     || page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
	page_cursor = btr_cur_get_page_cur(cursor);

	page_cur_delete_rec(page_cursor, *offsets, mtr);

	if (!page_cur_move_to_prev(page_cursor)) {
		err = DB_CORRUPTION;
		goto return_after_reservations;
	}

	rec = btr_cur_insert_if_possible(cursor, new_entry,
					 offsets, offsets_heap, n_ext, mtr);

	if (rec) {
		page_cursor->rec = rec;

		if (UNIV_UNLIKELY(is_metadata)) {
			/* We must empty the PAGE_FREE list, because if this
			was a rollback, the shortened metadata record
			would have too many fields, and we would be unable to
			know the size of the freed record. */
			err = btr_page_reorganize(page_cursor, mtr);
			if (err != DB_SUCCESS) {
				goto return_after_reservations;
			}
			rec = page_cursor->rec;
			rec_offs_make_valid(rec, index, true, *offsets);
			if (page_cursor->block->page.id().page_no()
			    == index->page) {
				btr_set_instant(page_cursor->block, *index,
						mtr);
			}
		} else {
			lock_rec_restore_from_page_infimum(
				*btr_cur_get_block(cursor), rec,
				block->page.id());
		}

		if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
		    || rec_is_alter_metadata(rec, *index)) {
			/* The newly inserted record owns its possible
			externally stored fields */
			btr_cur_unmark_extern_fields(btr_cur_get_block(cursor),
						     rec, index, *offsets, mtr);
		} else {
			/* In delete-marked records, DB_TRX_ID must
			always refer to an existing undo log record. */
			ut_ad(row_get_rec_trx_id(rec, index, *offsets));
		}

		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
		ut_ad(!adjust || page_is_leaf(block->page.frame));

		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
			if (adjust) {
				rec_offs_make_valid(page_cursor->rec, index,
						    true, *offsets);
			}
		}

#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled
		if (!big_rec_vec
		    && page_is_leaf(block->page.frame)
		    && !dict_index_is_online_ddl(index)) {
			mtr->release(index->lock);
			/* NOTE: We cannot release the root block latch here,
			because it has a segment header and has already been
			modified in most cases. */
		}
#endif

		err = DB_SUCCESS;
		goto return_after_reservations;
	} else {
		/* If the page is compressed and it initially
		compresses very well, and there is a subsequent insert
		of a badly-compressing record, it is possible for
		btr_cur_optimistic_update() to return DB_UNDERFLOW and
		btr_cur_insert_if_possible() to return FALSE. */
		ut_ad(page_zip || optim_err != DB_UNDERFLOW);
	}

	if (big_rec_vec != NULL) {
		ut_ad(page_is_leaf(block->page.frame));
		ut_ad(dict_index_is_clust(index));
		ut_ad(flags & BTR_KEEP_POS_FLAG);

		/* btr_page_split_and_insert() in
		btr_cur_pessimistic_insert() invokes
		mtr->release(index->lock).
		We must keep the index->lock when we created a
		big_rec, so that row_upd_clust_rec() can store the
		big_rec in the same mini-transaction. */

		ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
						 | MTR_MEMO_SX_LOCK));
		mtr_sx_lock_index(index, mtr);
	}

	/* Was the record to be updated positioned as the first user
	record on its page? */
	was_first = page_cur_is_before_first(page_cursor);

	/* Lock checks and undo logging were already performed by
	btr_cur_upd_lock_and_undo(). We do not try
	btr_cur_optimistic_insert() because
	btr_cur_insert_if_possible() already failed above. */

	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
					 | BTR_NO_LOCKING_FLAG
					 | BTR_KEEP_SYS_FLAG,
					 cursor, offsets, offsets_heap,
					 new_entry, &rec,
					 &dummy_big_rec, n_ext, NULL, mtr);
	ut_a(err == DB_SUCCESS);
	ut_a(rec);
	ut_a(dummy_big_rec == NULL);
	ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
	page_cursor->rec = rec;

	/* Multiple transactions cannot operate on the same temp-table
	in parallel. max_trx_id is ignored for temp tables because it
	is not required for MVCC. */
	if (!index->is_primary() && !index->table->is_temporary()) {
		/* Update PAGE_MAX_TRX_ID in the index page header.
		It was not updated by btr_cur_pessimistic_insert()
		because of BTR_NO_LOCKING_FLAG. */
		page_update_max_trx_id(btr_cur_get_block(cursor),
				       btr_cur_get_page_zip(cursor),
				       trx_id, mtr);
	}

	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
		/* The newly inserted record owns its possible
		externally stored fields */
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip
		     || page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
		btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec,
					     index, *offsets, mtr);
	} else {
		/* In delete-marked records, DB_TRX_ID must
		always refer to an existing undo log record. */
		ut_ad(row_get_rec_trx_id(rec, index, *offsets));
	}

	if (UNIV_UNLIKELY(is_metadata)) {
		/* We must empty the PAGE_FREE list, because if this
		was a rollback, the shortened metadata record
		would have too many fields, and we would be unable to
		know the size of the freed record. */
		err = btr_page_reorganize(page_cursor, mtr);
		if (err != DB_SUCCESS) {
			goto return_after_reservations;
		}
		rec = page_cursor->rec;
	} else {
		lock_rec_restore_from_page_infimum(
			*btr_cur_get_block(cursor), rec, block->page.id());
	}

	/* If necessary, restore also the correct lock state for a new,
	preceding supremum record created in a page split. While the old
	record was nonexistent, the supremum might have inherited its locks
	from a wrong record. */

	if (!was_first) {
		err = btr_cur_pess_upd_restore_supremum(
			btr_cur_get_block(cursor), rec, mtr);
	}

return_after_reservations:
#ifdef UNIV_ZIP_DEBUG
	ut_a(err ||
	     !page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
					    btr_cur_get_page(cursor), index));
#endif /* UNIV_ZIP_DEBUG */

	index->table->space->release_free_extents(n_reserved);
	*big_rec = big_rec_vec;
	return(err);
}

/*==================== B-TREE DELETE MARK AND UNMARK ===============*/

/** Modify the delete-mark flag of a record.
@tparam         flag    the value of the delete-mark flag
@param[in,out]  block   buffer block
@param[in,out]  rec     record on a physical index page
@param[in,out]  mtr     mini-transaction  */
template<bool flag>
void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
{
  if (UNIV_LIKELY(page_is_comp(block->page.frame) != 0))
  {
    byte *b= &rec[-REC_NEW_INFO_BITS];
    const byte v= flag
      ? (*b | REC_INFO_DELETED_FLAG)
      : (*b & byte(~REC_INFO_DELETED_FLAG));
    if (*b == v);
    else if (UNIV_LIKELY_NULL(block->page.zip.data))
    {
      *b= v;
      page_zip_rec_set_deleted(block, rec, flag, mtr);
    }
    else
      mtr->write<1>(*block, b, v);
  }
  else
  {
    ut_ad(!block->page.zip.data);
    byte *b= &rec[-REC_OLD_INFO_BITS];
    const byte v= flag
      ? (*b | REC_INFO_DELETED_FLAG)
      : (*b & byte(~REC_INFO_DELETED_FLAG));
    mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v);
  }
}

template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
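
/* Usage sketch (illustrative): the <true> instantiation delete-marks a
record and the <false> instantiation clears the mark again; for example,
btr_cur_del_mark_set_clust_rec() below calls
btr_rec_set_deleted<true>(block, rec, mtr) after writing the undo log
record for the delete-marking. */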

/***********************************************************//**
Marks a clustered index record deleted. Writes an undo log record for
this delete-marking. Writes the id of the deleting transaction to the
trx id field, and a pointer to the created undo log record to the
roll ptr field.
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
dberr_t
btr_cur_del_mark_set_clust_rec(
/*===========================*/
	buf_block_t*	block,	/*!< in/out: buffer block of the record */
	rec_t*		rec,	/*!< in/out: record */
	dict_index_t*	index,	/*!< in: clustered index of the record */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
	que_thr_t*	thr,	/*!< in: query thread */
	const dtuple_t*	entry,	/*!< in: dtuple for the record being deleted;
				also contains the virtual cols if there
				are any */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	roll_ptr_t	roll_ptr;
	dberr_t		err;
	trx_t*		trx;

	ut_ad(dict_index_is_clust(index));
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(buf_block_get_frame(block) == page_align(rec));
	ut_ad(page_rec_is_leaf(rec));
	ut_ad(mtr->is_named_space(index->table->space));

	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
		/* We may already have delete-marked this record
		when executing an ON DELETE CASCADE operation. */
		ut_ad(row_get_rec_trx_id(rec, index, offsets)
		      == thr_get_trx(thr)->id);
		return(DB_SUCCESS);
	}

	err = trx_undo_report_row_operation(thr, index,
					    entry, NULL, 0, rec, offsets,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	/* The search latch is not needed here, because
	the adaptive hash index does not depend on the delete-mark
	and the delete-mark is being updated in place. */

	btr_rec_set_deleted<true>(block, rec, mtr);

	trx = thr_get_trx(thr);

	DBUG_LOG("ib_cur",
		 "delete-mark clust " << index->table->name
		 << " (" << index->id << ") by "
		 << ib::hex(trx->id) << ": "
		 << rec_printer(rec, offsets).str());

	return btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr,
				   mtr);
}

/*==================== B-TREE RECORD REMOVE =========================*/

/*************************************************************//**
Tries to compress a page of the tree if it seems useful. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches on the brothers of the page, if
those brothers exist. NOTE: it is assumed that the caller has reserved
enough free extents so that the compression will always succeed if done!
@return whether compression occurred */
bool
btr_cur_compress_if_useful(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
				cursor does not stay valid if !adjust and
				compression occurs */
	bool		adjust,	/*!< in: whether the cursor position should be
				adjusted even when compression occurs */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
					 MTR_MEMO_PAGE_X_FIX));

	if (cursor->index()->is_spatial()) {
		const trx_t*	trx = cursor->rtr_info->thr
			? thr_get_trx(cursor->rtr_info->thr)
			: NULL;
		const buf_block_t* block = btr_cur_get_block(cursor);

		/* Check whether a page lock prevents the compression */
		if (!lock_test_prdt_page_lock(trx, block->page.id())) {
			return(false);
		}
	}

	return btr_cur_compress_recommendation(cursor, mtr)
		&& btr_compress(cursor, adjust, mtr) == DB_SUCCESS;
}
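
/* Illustrative call (matching btr_cur_node_ptr_delete() below): after a
record has been deleted without a merge, a caller may attempt
btr_cur_compress_if_useful(cursor, FALSE, mtr) to merge an underfilled
page with a sibling. */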

/*******************************************************//**
Removes the record on which the tree cursor is positioned on a leaf page.
It is assumed that the mtr has an x-latch on the page where the cursor is
positioned, but no latch on the whole tree.
@return error code
@retval DB_FAIL if the page would become too empty */
dberr_t
btr_cur_optimistic_delete(
/*======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
				delete; cursor stays valid: if deletion
				succeeds, on function exit it points to the
				successor of the deleted record */
	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
	mtr_t*		mtr)	/*!< in: mtr; if this function returns
				DB_SUCCESS on a leaf page of a secondary
				index, the mtr must be committed
				before latching any further pages */
{
	buf_block_t*	block;
	rec_t*		rec;
	mem_heap_t*	heap		= NULL;
	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
	rec_offs*	offsets		= offsets_;
	rec_offs_init(offsets_);

	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
					 MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->is_named_space(cursor->index()->table->space));
	ut_ad(!cursor->index()->is_dummy);

	/* This is intended only for leaf page deletions */

	block = btr_cur_get_block(cursor);

	ut_ad(block->page.id().space() == cursor->index()->table->space->id);
	ut_ad(page_is_leaf(buf_block_get_frame(block)));
	ut_ad(!dict_index_is_online_ddl(cursor->index())
	      || cursor->index()->is_clust()
	      || (flags & BTR_CREATE_FLAG));

	rec = btr_cur_get_rec(cursor);

	offsets = rec_get_offsets(rec, cursor->index(), offsets,
				  cursor->index()->n_core_fields,
				  ULINT_UNDEFINED, &heap);

	dberr_t err = DB_SUCCESS;
	DBUG_EXECUTE_IF("btr_force_pessimistic_delete",
		err = DB_FAIL; goto func_exit;);

	if (rec_offs_any_extern(offsets)
	    || !btr_cur_can_delete_without_compress(cursor,
						    rec_offs_size(offsets),
						    mtr)) {
		/* Prefetch the siblings of the leaf for the pessimistic
		operation. */
		btr_cur_prefetch_siblings(block, cursor->index());
		err = DB_FAIL;
		goto func_exit;
	}

	if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index()->page
			  && page_get_n_recs(block->page.frame) == 1
			  + (cursor->index()->is_instant()
			     && !rec_is_metadata(rec, *cursor->index()))
			  && !cursor->index()
			  ->must_avoid_clear_instant_add())) {
		/* The whole index (and table) becomes logically empty.
		Empty the whole page. That is, if we are deleting the
		only user record, also delete the metadata record
		if one exists for instant ADD COLUMN (not generic ALTER TABLE).
		If we are deleting the metadata record and the
		table becomes empty, clean up the whole page. */
		dict_index_t* index = cursor->index();
		const rec_t* first_rec = page_rec_get_next_const(
			page_get_infimum_rec(block->page.frame));
		if (UNIV_UNLIKELY(!first_rec)) {
			err = DB_CORRUPTION;
			goto func_exit;
		}
		ut_ad(!index->is_instant()
		      || rec_is_metadata(first_rec, *index));
		const bool is_metadata = rec_is_metadata(rec, *index);
		/* We can remove the metadata when rolling back an
		instant ALTER TABLE operation, or when deleting the
		last user record on the page such that only metadata for
		instant ADD COLUMN (not generic ALTER TABLE) remains. */
		const bool empty_table = is_metadata
			|| !index->is_instant()
			|| (first_rec != rec
			    && rec_is_add_metadata(first_rec, *index));
		if (UNIV_LIKELY(empty_table)) {
			if (UNIV_LIKELY(!is_metadata && !flags)) {
				lock_update_delete(block, rec);
			}
			btr_page_empty(block, buf_block_get_page_zip(block),
				       index, 0, mtr);
			if (index->is_instant()) {
				/* MDEV-17383: free metadata BLOBs! */
				index->clear_instant_alter();
			}

			page_cur_set_after_last(block,
						btr_cur_get_page_cur(cursor));
			goto func_exit;
		}
	}

	{
		if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_is_comp(
							    block->page.frame))
				  & REC_INFO_MIN_REC_FLAG)) {
			/* This should be rolling back instant ADD COLUMN.
			If this is a recovered transaction, then
			index->is_instant() will hold until the
			insert into SYS_COLUMNS is rolled back. */
			ut_ad(cursor->index()->table->supports_instant());
			ut_ad(cursor->index()->is_primary());
			ut_ad(!buf_block_get_page_zip(block));
			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    offsets, mtr);
			/* We must empty the PAGE_FREE list, because
			after rollback, this deleted metadata record
			would have too many fields, and we would be
			unable to know the size of the freed record. */
			err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
						  mtr);
			goto func_exit;
		} else {
			if (!flags) {
				lock_update_delete(block, rec);
			}

			btr_search_update_hash_on_delete(cursor);
		}

		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
				    offsets, mtr);
	}

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return err;
}
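
/* Note (a convention hinted at by the DBUG injection point above, not
asserted by this code): callers typically attempt
btr_cur_optimistic_delete() first and fall back to
btr_cur_pessimistic_delete() below when it returns DB_FAIL, i.e. when the
record has externally stored fields or the page would become too empty. */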

/*************************************************************//**
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
or if it is the only page on the level. It is assumed that mtr holds
an x-latch on the tree and on the cursor page. To avoid deadlocks,
mtr must also own x-latches on the brothers of the page, if those brothers
exist.
@return TRUE if compression occurred, FALSE if not or if something
went wrong. */
ibool
btr_cur_pessimistic_delete(
/*=======================*/
	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
				the latter may occur because we may have
				to update node pointers on upper levels,
				and in the case of variable length keys
				these may actually grow in size */
	ibool		has_reserved_extents, /*!< in: TRUE if the
				caller has already reserved enough free
				extents so that the operation is known
				to succeed */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
				if compression does not occur, the cursor
				stays valid: it points to the successor of
				the deleted record on function exit */
	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	dict_index_t*	index;
	rec_t*		rec;
	uint32_t	n_reserved	= 0;
	ibool		ret		= FALSE;
	mem_heap_t*	heap;
	rec_offs*	offsets;
#ifdef UNIV_DEBUG
	bool		parent_latched	= false;
#endif /* UNIV_DEBUG */

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = btr_cur_get_index(cursor);

	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
					 | MTR_MEMO_SX_LOCK));
	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->is_named_space(index->table->space));
	ut_ad(!index->is_dummy);
	ut_ad(block->page.id().space() == index->table->space->id);

	if (!has_reserved_extents) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the node pointer updates will
		not fail because of lack of space */

		uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1);
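		/* Example: with cursor->tree_height == 3 this computes
		uint32_t(3 / 32 + 1) == 1 extent; one more extent is
		requested for every 32 levels of tree height. */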

		*err = fsp_reserve_free_extents(&n_reserved,
						index->table->space,
						n_extents,
						FSP_CLEANING, mtr);
		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
			return(FALSE);
		}
	}

	heap = mem_heap_create(1024);
	rec = btr_cur_get_rec(cursor);
	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page)
				  ? index->n_core_fields : 0,
				  ULINT_UNDEFINED, &heap);

	if (rec_offs_any_extern(offsets)) {
		btr_rec_free_externally_stored_fields(index,
						      rec, offsets, block,
						      rollback, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	}

	rec_t* next_rec = NULL;
	bool min_mark_next_rec = false;

	if (page_is_leaf(page)) {
		const bool is_metadata = rec_is_metadata(
			rec, page_is_comp(block->page.frame));
		if (UNIV_UNLIKELY(is_metadata)) {
			/* This should be rolling back instant ALTER TABLE.
			If this is a recovered transaction, then
			index->is_instant() will hold until the
			insert into SYS_COLUMNS is rolled back. */
			ut_ad(rollback);
			ut_ad(index->table->supports_instant());
			ut_ad(index->is_primary());
		} else if (flags == 0) {
			lock_update_delete(block, rec);
		}

		if (block->page.id().page_no() != index->page) {
			if (page_get_n_recs(page) < 2) {
				goto discard_page;
			}
		} else if (page_get_n_recs(page) == 1
			   + (index->is_instant() && !is_metadata)
			   && !index->must_avoid_clear_instant_add()) {
			/* The whole index (and table) becomes logically empty.
			Empty the whole page. That is, if we are deleting the
			only user record, also delete the metadata record
			if one exists for instant ADD COLUMN
			(not generic ALTER TABLE).
			If we are deleting the metadata record
			(in the rollback of instant ALTER TABLE) and the
			table becomes empty, clean up the whole page. */

			const rec_t* first_rec = page_rec_get_next_const(
				page_get_infimum_rec(page));
			if (UNIV_UNLIKELY(!first_rec)) {
				*err = DB_CORRUPTION;
				goto err_exit;
			}
			ut_ad(!index->is_instant()
			      || rec_is_metadata(first_rec, *index));
			if (is_metadata || !index->is_instant()
			    || (first_rec != rec
				&& rec_is_add_metadata(first_rec, *index))) {
				btr_page_empty(block, page_zip, index, 0, mtr);
				if (index->is_instant()) {
					/* MDEV-17383: free metadata BLOBs! */
					index->clear_instant_alter();
				}

				page_cur_set_after_last(
					block,
					btr_cur_get_page_cur(cursor));
				ret = TRUE;
				goto return_after_reservations;
			}
		}

		if (UNIV_LIKELY(!is_metadata)) {
			btr_search_update_hash_on_delete(cursor);
		} else {
			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    offsets, mtr);
			/* We must empty the PAGE_FREE list, because
			after rollback, this deleted metadata record
			would carry too many fields, and we would be
			unable to know the size of the freed record. */
			*err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
						   mtr);
			ut_ad(!ret);
			goto err_exit;
		}
	} else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
		if (page_rec_is_last(rec, page)) {
discard_page:
			ut_ad(page_get_n_recs(page) == 1);
			/* If there is only one record, drop
			the whole page. */

			btr_discard_page(cursor, mtr);

			ret = TRUE;
			goto return_after_reservations;
		}

		if (UNIV_UNLIKELY(!(next_rec = page_rec_get_next(rec)))) {
			ut_ad(!ret);
			*err = DB_CORRUPTION;
			goto err_exit;
		}

		btr_cur_t cur;
		cur.page_cur.index = index;
		cur.page_cur.block = block;

		if (!page_has_prev(page)) {
			/* If we delete the leftmost node pointer on a
			non-leaf level, we must mark the new leftmost node
			pointer as the predefined minimum record */

			min_mark_next_rec = true;
		} else if (index->is_spatial()) {
			/* For an R-tree, if we delete the leftmost node
			pointer, we need to update the parent page. */
			rtr_mbr_t	father_mbr;
			rec_t*		father_rec;
			rec_offs*	offsets;
			ulint		len;

			rtr_page_get_father_block(nullptr, heap, nullptr,
						  &cur,
						  cursor->rtr_info->thr, mtr);
			father_rec = btr_cur_get_rec(&cur);
			offsets = rec_get_offsets(father_rec, index, NULL,
						  0, ULINT_UNDEFINED, &heap);

			rtr_read_mbr(rec_get_nth_field(
				father_rec, offsets, 0, &len), &father_mbr);

			rtr_update_mbr_field(&cur, offsets, NULL,
					     page, &father_mbr, next_rec, mtr);
			ut_d(parent_latched = true);
		} else {
			/* Otherwise, if we delete the leftmost node pointer
			on a page, we have to change the parent node pointer
			so that it is equal to the new leftmost node pointer
			on the page */
			ret = btr_page_get_father(mtr, &cur);
			if (!ret) {
				*err = DB_CORRUPTION;
				goto err_exit;
			}
			*err = btr_cur_node_ptr_delete(&cur, mtr);
			if (*err != DB_SUCCESS) {
got_err:
				ret = FALSE;
				goto err_exit;
			}

			const ulint	level = btr_page_get_level(page);
			// FIXME: reuse the node_ptr from above
			dtuple_t*	node_ptr = dict_index_build_node_ptr(
				index, next_rec, block->page.id().page_no(),
				heap, level);

			*err = btr_insert_on_non_leaf_level(
				flags, index, level + 1, node_ptr, mtr);
			if (*err != DB_SUCCESS) {
				ret = FALSE;
				goto got_err;
			}

			ut_d(parent_latched = true);
		}
	}

	/* A SPATIAL INDEX never uses U locks; we can allow page merges
	while holding an X lock on the spatial index tree.
	Do not allow merges of non-leaf B-tree pages unless it is
	safe to do so. */
	{
		const bool allow_merge = page_is_leaf(page)
			|| dict_index_is_spatial(index)
			|| btr_cur_will_modify_tree(
				index, page, BTR_INTENTION_DELETE, rec,
				btr_node_ptr_max_size(index),
				block->zip_size(), mtr);
		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
				    offsets, mtr);

		if (min_mark_next_rec) {
			btr_set_min_rec_mark(next_rec, *block, mtr);
		}

#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

		ut_ad(!parent_latched
		      || btr_check_node_ptr(index, block,
					    cursor->rtr_info
					    ? cursor->rtr_info->thr
					    : nullptr, mtr));

		if (!ret && btr_cur_compress_recommendation(cursor, mtr)) {
			if (UNIV_LIKELY(allow_merge)) {
				ret = btr_cur_compress_if_useful(
					cursor, FALSE, mtr);
			} else {
				ib::warn() << "Not merging page "
					   << block->page.id()
					   << " in index " << index->name
					   << " of " << index->table->name;
				ut_ad("MDEV-14637" == 0);
			}
		}
	}

return_after_reservations:
	*err = DB_SUCCESS;
err_exit:
	mem_heap_free(heap);

#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled
	if (page_is_leaf(page)
	    && !dict_index_is_online_ddl(index)) {
		mtr->release(index->lock);
		/* NOTE: We cannot release the root block latch here,
		because it has a segment header and has already been
		modified in most cases. */
	}
#endif

	index->table->space->release_free_extents(n_reserved);
	return(ret);
}

/** Delete the node pointer in a parent page.
@param[in,out]	parent	cursor pointing to parent record
@param[in,out]	mtr	mini-transaction
@return error code */
dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
{
	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent),
					 MTR_MEMO_PAGE_X_FIX));
	dberr_t err;
	ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent,
						      BTR_CREATE_FLAG, false,
						      mtr);
	if (err == DB_SUCCESS && !compressed) {
		btr_cur_compress_if_useful(parent, FALSE, mtr);
	}

	return err;
}

/** Represents the cursor for the number-of-rows estimation. The
content is used for level-by-level diving and estimating the number of
rows on each level. */
class btr_est_cur_t
{
  /* Assume a page like:
  records:             (inf, a, b, c, d, sup)
  index of the record:    0, 1, 2, 3, 4, 5
  */

  /** Index of the record where the page cursor stopped on this level
  (index in alphabetical order). In the above example, if the search stopped on
  record 'c', then nth_rec will be 3. */
  ulint m_nth_rec;

  /** Number of the records on the page, not counting inf and sup.
  In the above example n_recs will be 4. */
  ulint m_n_recs;

  /** Search tuple */
  const dtuple_t &m_tuple;
  /** Cursor search mode */
  page_cur_mode_t m_mode;
  /** Page cursor which is used for search */
  page_cur_t m_page_cur;
  /** Page id of the page to fetch one level down; can differ from
  m_block->page.id at the moment when the child's page id has already been
  found but the child's block has not been fetched yet */
  page_id_t m_page_id;
  /** Current block */
  buf_block_t *m_block;
  /** Page search mode; can differ from m_mode for non-leaf pages, see c-tor
  comments for details */
  page_cur_mode_t m_page_mode;

  /** Matched fields and bytes which are used for on-page search, see
  btr_cur_t::(up|low)_(match|bytes) comments for details */
  uint16_t m_up_match= 0;
  uint16_t m_up_bytes= 0;
  uint16_t m_low_match= 0;
  uint16_t m_low_bytes= 0;

public:
  btr_est_cur_t(dict_index_t *index, const dtuple_t &tuple,
                page_cur_mode_t mode)
      : m_tuple(tuple), m_mode(mode),
        m_page_id(index->table->space_id, index->page), m_block(nullptr)
  {
    ut_ad(dict_index_check_search_tuple(index, &tuple));
    ut_ad(dtuple_check_typed(&tuple));

    m_page_cur.index = index;
    /* We use these modified search modes on non-leaf levels of the B-tree.
    These let us end up in the right B-tree leaf. In that leaf we use the
    original search mode. */
    switch (mode) {
    case PAGE_CUR_GE:
      m_page_mode= PAGE_CUR_L;
      break;
    case PAGE_CUR_G:
      m_page_mode= PAGE_CUR_LE;
      break;
    default:
      ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
      m_page_mode= mode;
      break;
    }
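    /* Illustration of the mapping above: for a left border "x >= 5"
    (PAGE_CUR_GE) the non-leaf levels are searched with PAGE_CUR_L, which
    positions the cursor on the last node pointer whose key is less than 5,
    i.e. on the subtree that may contain the first record of the range; the
    original mode is applied only on the leaf level (see
    set_page_mode_for_leaves() below). */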
  }

  /** Retrieve the block with m_page_id, releasing the previously fetched
  block if necessary. If this is a left border block cursor and both left and
  right border blocks have the same parent, don't unlatch the parent, as it
  must stay latched to get the right block, and will be unlatched after the
  right block is fetched.
  @param  level distance from the leaf page level; ULINT_UNDEFINED when
          fetching the root page
  @param  mtr mtr
  @param  right_parent right border block parent, nullptr if the function
          is called for the right block itself
  @return true on success or false otherwise. */
  bool fetch_child(ulint level, mtr_t &mtr, const buf_block_t *right_parent)
  {
    buf_block_t *parent_block= m_block;

    m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH,
                           &mtr, nullptr);
    if (!m_block)
      return false;

    if (parent_block && parent_block != right_parent)
    {
      ut_ad(mtr.get_savepoint() >= 2);
      mtr.rollback_to_savepoint(1, 2);
    }

    return level == ULINT_UNDEFINED ||
      btr_page_get_level(m_block->page.frame) == level;
  }

  /** Sets the page mode for leaves */
  void set_page_mode_for_leaves() { m_page_mode= m_mode; }

  /** Does a search on the current page. If there is no border in m_tuple,
  then just move the cursor to the leftmost or rightmost record.
  @param level current level on the tree.
  @param root_height root height
  @param left true if this is the left border, false otherwise.
  @return true on success, false otherwise. */
  bool search_on_page(ulint level, ulint root_height, bool left)
  {
    if (level != btr_page_get_level(m_block->page.frame))
      return false;

    m_n_recs= page_get_n_recs(m_block->page.frame);

    if (dtuple_get_n_fields(&m_tuple) > 0)
    {
      m_up_bytes= m_low_bytes= 0;
      m_page_cur.block= m_block;
      if (page_cur_search_with_match(&m_tuple, m_page_mode,
                                     &m_up_match, &m_low_match, &m_page_cur,
                                     nullptr))
        return false;
      m_nth_rec= page_rec_get_n_recs_before(page_cur_get_rec(&m_page_cur));
    }
    else if (left)
    {
      page_cur_set_before_first(m_block, &m_page_cur);
      if (level)
      {
        if (!page_cur_move_to_next(&m_page_cur))
          return false;
        m_nth_rec= 1;
      }
      else
        m_nth_rec= 0;
    }
    else
    {
      m_nth_rec= m_n_recs;
      if (!level)
      {
        page_cur_set_after_last(m_block, &m_page_cur);
        ++m_nth_rec;
      }
      else
      {
        m_page_cur.block= m_block;
        m_page_cur.rec= page_rec_get_nth(m_block->page.frame, m_nth_rec);
      }
    }

    return true;
  }

  /** Read the page id of the current record's child.
  @param offsets offsets array.
  @param heap heap for offsets array */
  void read_child_page_id(rec_offs **offsets, mem_heap_t **heap)
  {
    const rec_t *node_ptr= page_cur_get_rec(&m_page_cur);

    /* FIXME: get the child page number directly without computing offsets */
    *offsets= rec_get_offsets(node_ptr, index(), *offsets, 0, ULINT_UNDEFINED,
                              heap);

    /* Go to the child node */
    m_page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, *offsets));
  }

  /** @return true if the left border should be counted */
  bool should_count_the_left_border() const
  {
    if (dtuple_get_n_fields(&m_tuple) > 0)
    {
      ut_ad(!page_rec_is_infimum(page_cur_get_rec(&m_page_cur)));
      return !page_rec_is_supremum(page_cur_get_rec(&m_page_cur));
    }
    ut_ad(page_rec_is_infimum(page_cur_get_rec(&m_page_cur)));
    return false;
  }

  /** @return true if the right border should be counted */
  bool should_count_the_right_border() const
  {
    if (dtuple_get_n_fields(&m_tuple) > 0)
    {
      const rec_t *rec= page_cur_get_rec(&m_page_cur);
      ut_ad(!(m_mode == PAGE_CUR_L && page_rec_is_supremum(rec)));

      return (m_mode == PAGE_CUR_LE /* if the range is '<=' */
              /* and the record was found */
              && m_low_match >= dtuple_get_n_fields(&m_tuple)) ||
             (m_mode == PAGE_CUR_L /* or if the range is '<' */
              /* and there are any records to match the criteria, i.e. if the
              minimum record on the tree is 5 and x < 7 is specified then the
              cursor will be positioned at 5 and we should count the border,
              but if x < 2 is specified, then the cursor will be positioned at
              'inf' and we should not count the border */
              && !page_rec_is_infimum(rec));
      /* Notice that for "WHERE col <= 'foo'" the server passes to
      ha_innobase::records_in_range(): min_key=NULL (left-unbounded), which
      is expected, and max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G),
      which is unexpected - one would expect flag=HA_READ_KEY_OR_PREV
      (PAGE_CUR_LE). In this case the cursor will be positioned on the first
      record to the right of the requested one (can also be positioned on
      the 'sup') and we should not count the right border. */
    }
    ut_ad(page_rec_is_supremum(page_cur_get_rec(&m_page_cur)));

    /* The range specified is without a right border, just 'x > 123'
    or 'x >= 123', and search_on_page() positioned the cursor on the
    supremum record on the rightmost page, which must not be counted. */
    return false;
  }

  /** @return index */
  const dict_index_t *index() const { return m_page_cur.index; }

  /** @return current block */
  const buf_block_t *block() const { return m_block; }

  /** @return current page id */
  page_id_t page_id() const { return m_page_id; }

  /** Copies the block pointer from another btr_est_cur_t in the case
  when both left and right border cursors point to the same block.
  @param o reference to the other btr_est_cur_t object. */
  void set_block(const btr_est_cur_t &o) { m_block= o.m_block; }

  /** @return current record number. */
  ulint nth_rec() const { return m_nth_rec; }

  /** @return number of records in the current page. */
  ulint n_recs() const { return m_n_recs; }
};
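
/* Usage sketch (illustrative, matching btr_estimate_n_rows_in_range()
below): one btr_est_cur_t is created per range border and dived level by
level, e.g.
  btr_est_cur_t p1(index, *range_start->tuple, range_start->mode);
  p1.fetch_child(height, mtr, p2.block());
until the leaf level is reached, with search_on_page() positioning the
page cursor on each visited page. */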

/** Estimate the number of rows between the left record of the path and the
right one (non-inclusive) on the given level of a B-tree. This function
starts from the page next to the left page and reads a few pages to the
right, counting their records. If we reach the right page quickly then we
know exactly how many records there are between the left and right records,
and we set is_n_rows_exact to true. After some page is latched, the previous
page is unlatched. If we cannot reach the right page quickly then we
calculate the average number of records in the pages scanned so far and
assume that all pages that we did not scan up to the right page contain the
same number of records; then we multiply that average by the number of pages
between the right and left records (which is n_rows_on_prev_level). In this
case we set is_n_rows_exact to false.
@param level current level.
@param left_cur the cursor of the left page.
@param right_page_no right page number.
@param n_rows_on_prev_level number of rows on the previous level.
@param[out] is_n_rows_exact true if the exact number of rows is returned.
@param[in,out] mtr mini-transaction.
@return number of rows, not including the borders (exact or estimated). */
static ha_rows btr_estimate_n_rows_in_range_on_level(
    ulint level, btr_est_cur_t &left_cur, uint32_t right_page_no,
    ha_rows n_rows_on_prev_level, bool &is_n_rows_exact, mtr_t &mtr)
{
  ha_rows n_rows= 0;
  uint n_pages_read= 0;
  /* Do not read more than this number of pages in order not to hurt
  performance with this code, which is just an estimation. If we read this
  many pages before reaching right_page_no, then we estimate the average
  from the pages scanned so far. */
  static constexpr uint n_pages_read_limit= 9;
  buf_block_t *block= nullptr;
  const dict_index_t *index= left_cur.index();

  /* Assume by default that we will scan all pages between the left and
  right (non-inclusive) pages */
  is_n_rows_exact= true;

  /* Add records from the left page which are to the right of the record
  which serves as a left border of the range, if any (we don't include the
  record itself in this count). */
  if (left_cur.nth_rec() <= left_cur.n_recs())
  {
    n_rows+= left_cur.n_recs() - left_cur.nth_rec();
  }

  /* Count the records in the pages between the left and right
  (non-inclusive) pages */

  const fil_space_t *space= index->table->space;
  page_id_t page_id(space->id,
                    btr_page_get_next(buf_block_get_frame(left_cur.block())));

  if (page_id.page_no() == FIL_NULL)
    goto inexact;

  do
  {
    page_t *page;
    buf_block_t *prev_block= block;

    /* Fetch the page. */
    block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, &mtr, nullptr);

    if (prev_block)
    {
      ulint savepoint= mtr.get_savepoint();
      /* The latched set holds the index s-lock and the p1 and p2 latches;
      it can also include the p1/p2 parent latch if the paths have not
      diverged yet */
      ut_ad(savepoint >= 3);
      mtr.rollback_to_savepoint(savepoint - 2, savepoint - 1);
    }

    if (!block || btr_page_get_level(buf_block_get_frame(block)) != level)
      goto inexact;

    page= buf_block_get_frame(block);

    /* It is possible but highly unlikely that the page was originally
    written by an old version of InnoDB that did not initialize
    FIL_PAGE_TYPE on other than B-tree pages. For example, this could be an
    almost-empty BLOB page that happens to contain the magic values in the
    fields that we checked above. */

    n_pages_read++;

    n_rows+= page_get_n_recs(page);

    page_id.set_page_no(btr_page_get_next(page));

    if (n_pages_read == n_pages_read_limit)
    {
      /* We read too many pages or we reached the end of the level
      without passing through right_page_no. */
      goto inexact;
    }

  } while (page_id.page_no() != right_page_no);

  if (block)
  {
    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
  }

  return (n_rows);

inexact:

  if (block)
  {
    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
  }

  is_n_rows_exact= false;

  /* We stopped before reaching the right page */

  if (n_pages_read > 0)
  {
    /* The number of pages on this level is
    n_rows_on_prev_level; multiply it by the
    average number of recs per page so far */
    n_rows= n_rows_on_prev_level * n_rows / n_pages_read;
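    /* Worked example with illustrative numbers: if the previous level
    indicated n_rows_on_prev_level == 100 pages on this level, and the
    pages scanned so far (n_pages_read == 9) held 900 records in total,
    the estimate is 100 * 900 / 9 = 10000 rows. */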
 | 
						|
  }
 | 
						|
  else
 | 
						|
  {
 | 
						|
    n_rows= 10;
 | 
						|
  }
 | 
						|
 | 
						|
  return (n_rows);
 | 
						|
}

/** Estimates the number of rows in a given index range. Searches on the left
page; then, if there are pages between the left and the right one, reads a few
pages to the right. If the right page is reached, counts the exact number of
rows without fetching the right page; the right page will be fetched in the
caller of this function and the number of its rows will be added there. If the
right page is not reached, computes an estimated (see
btr_estimate_n_rows_in_range_on_level() for details) number of rows, and
fetches the right page. When the leaves are reached, unlatches non-leaf pages
except the right leaf parent. After the right leaf page is fetched, commits
the mtr.
@param[in]  index       index
@param[in]  range_start range start
@param[in]  range_end   range end
@return estimated number of rows */
ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
                                     btr_pos_t *range_start,
                                     btr_pos_t *range_end)
{
  DBUG_ENTER("btr_estimate_n_rows_in_range");

  if (UNIV_UNLIKELY(index->page == FIL_NULL || index->is_corrupted()))
    DBUG_RETURN(0);

  ut_ad(index->is_btree());

  btr_est_cur_t p1(index, *range_start->tuple, range_start->mode);
  btr_est_cur_t p2(index, *range_end->tuple, range_end->mode);
  mtr_t mtr;

  ulint height;
  ulint root_height= 0; /* remove warning */

  mem_heap_t *heap= NULL;
  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
  rec_offs *offsets= offsets_;
  rec_offs_init(offsets_);

  mtr.start();

  ut_ad(mtr.get_savepoint() == 0);
  mtr_s_lock_index(index, &mtr);

  ha_rows table_n_rows= dict_table_get_n_rows(index->table);

  height= ULINT_UNDEFINED;

  /* This becomes true when the two paths do not pass through the same pages
  anymore. */
  bool diverged= false;
  /* This is the height, i.e. the number of levels from the root, where paths
   are not the same or adjacent any more. */
  ulint divergence_height= ULINT_UNDEFINED;
  bool should_count_the_left_border= true;
  bool should_count_the_right_border= true;
  bool is_n_rows_exact= true;
  ha_rows n_rows= 0;

  /* Loop and search until we arrive at the desired level. */
search_loop:
  if (!p1.fetch_child(height, mtr, p2.block()))
    goto error;

  if (height == ULINT_UNDEFINED)
  {
    /* We are in the root node */
    height= btr_page_get_level(buf_block_get_frame(p1.block()));
    root_height= height;
  }

  if (!height)
  {
    p1.set_page_mode_for_leaves();
    p2.set_page_mode_for_leaves();
  }

  if (p1.page_id() == p2.page_id())
    p2.set_block(p1);
  else
  {
    ut_ad(diverged);
    if (divergence_height != ULINT_UNDEFINED) {
      /* We need to call p1.search_on_page() here as
      btr_estimate_n_rows_in_range_on_level() uses p1.m_n_recs and
      p1.m_nth_rec. */
      if (!p1.search_on_page(height, root_height, true))
        goto error;
      n_rows= btr_estimate_n_rows_in_range_on_level(
          height, p1, p2.page_id().page_no(), n_rows, is_n_rows_exact, mtr);
    }
    if (!p2.fetch_child(height, mtr, nullptr))
      goto error;
  }

  if (height == 0)
    /* There is no need to release non-leaf pages here as they must already be
    unlatched in btr_est_cur_t::fetch_child(). Try to search on pages after
    releasing the index latch, to decrease contention. */
    mtr.rollback_to_savepoint(0, 1);

  /* There is no need to search on the left page if
  divergence_height != ULINT_UNDEFINED, as it was already searched before the
  btr_estimate_n_rows_in_range_on_level() call */
  if (divergence_height == ULINT_UNDEFINED &&
      !p1.search_on_page(height, root_height, true))
    goto error;

  if (!p2.search_on_page(height, root_height, false))
    goto error;

  if (!diverged && (p1.nth_rec() != p2.nth_rec()))
  {
    ut_ad(p1.page_id() == p2.page_id());
    diverged= true;
    if (p1.nth_rec() < p2.nth_rec())
    {
      /* We do not count the borders (neither the left nor the right one),
      thus "- 1". */
      n_rows= p2.nth_rec() - p1.nth_rec() - 1;

      if (n_rows > 0)
      {
        /* There is at least one row between the two borders pointed to by p1
        and p2, so on the level below the slots will point to non-adjacent
        pages. */
        divergence_height= root_height - height;
      }
    }
    else
    {
      /* It is possible that p1->nth_rec > p2->nth_rec if, for example, we
      have a single-page tree which contains (inf, 5, 6, supr) and we select
      where x > 20 and x < 30; in this case p1->nth_rec will point to the supr
      record and p2->nth_rec will point to 6. */
      n_rows= 0;
      should_count_the_left_border= false;
      should_count_the_right_border= false;
    }
  }
  else if (diverged && divergence_height == ULINT_UNDEFINED)
  {

    if (p1.nth_rec() < p1.n_recs() || p2.nth_rec() > 1)
    {
      ut_ad(p1.page_id() != p2.page_id());
      divergence_height= root_height - height;

      n_rows= 0;

      if (p1.nth_rec() < p1.n_recs())
      {
        n_rows+= p1.n_recs() - p1.nth_rec();
      }

      if (p2.nth_rec() > 1)
      {
        n_rows+= p2.nth_rec() - 1;
      }
    }
  }
  else if (divergence_height != ULINT_UNDEFINED)
  {
    /* All records before the right page were already counted. Add records
    from p2->page_no which are to the left of the record which serves as a
    right border of the range, if any (we don't include the record itself in
    this count). */
    if (p2.nth_rec() > 1)
      n_rows+= p2.nth_rec() - 1;
  }

  if (height)
  {
    ut_ad(height > 0);
    height--;
    ut_ad(mtr.memo_contains(p1.index()->lock, MTR_MEMO_S_LOCK));
    ut_ad(mtr.memo_contains_flagged(p1.block(), MTR_MEMO_PAGE_S_FIX));
    p1.read_child_page_id(&offsets, &heap);
    ut_ad(mtr.memo_contains(p2.index()->lock, MTR_MEMO_S_LOCK));
    ut_ad(mtr.memo_contains_flagged(p2.block(), MTR_MEMO_PAGE_S_FIX));
    p2.read_child_page_id(&offsets, &heap);
    goto search_loop;
  }

  should_count_the_left_border=
      should_count_the_left_border && p1.should_count_the_left_border();
  should_count_the_right_border=
      should_count_the_right_border && p2.should_count_the_right_border();

  mtr.commit();
  if (UNIV_LIKELY_NULL(heap))
    mem_heap_free(heap);


  range_start->page_id= p1.page_id();
  range_end->page_id= p2.page_id();

  /* Here none of the borders were counted. For example, if on the leaf level
  we descended to:
  (inf, a, b, c, d, e, f, sup)
           ^        ^
         path1    path2
  then n_rows will be 2 (c and d). */

  if (is_n_rows_exact)
  {
    /* Only fiddle to adjust this off-by-one if the number is exact, otherwise
    we do much grosser adjustments below. */

    /* If both paths end up on the same record on the leaf level. */
    if (p1.page_id() == p2.page_id() && p1.nth_rec() == p2.nth_rec())
    {

      /* n_rows can be > 0 here if the paths were first different and then
      converged to the same record on the leaf level.
      For example:
      SELECT ... LIKE 'wait/synch/rwlock%'
      mode1=PAGE_CUR_GE,
      tuple1="wait/synch/rwlock"
      path1[0]={nth_rec=58, n_recs=58,
                page_no=3, page_level=1}
      path1[1]={nth_rec=56, n_recs=55,
                page_no=119, page_level=0}

      mode2=PAGE_CUR_G
      tuple2="wait/synch/rwlock"
      path2[0]={nth_rec=57, n_recs=57,
                page_no=3, page_level=1}
      path2[1]={nth_rec=56, n_recs=55,
                page_no=119, page_level=0} */

      /* If the range is such that we should count both borders, then avoid
      counting that record twice - once as a left border and once as a right
      border. Some of the borders should not be counted, e.g. [3,3). */
      n_rows= should_count_the_left_border && should_count_the_right_border;
    }
    else
      n_rows+= should_count_the_left_border + should_count_the_right_border;
  }

  if (root_height > divergence_height && !is_n_rows_exact)
    /* In trees whose height is > 1 our algorithm tends to underestimate:
    multiply the estimate by 2: */
    n_rows*= 2;

  DBUG_EXECUTE_IF("bug14007649", DBUG_RETURN(n_rows););

#ifdef NOT_USED
  /* Do not estimate the number of rows in the range to over 1 / 2 of the
  estimated rows in the whole table */

  if (n_rows > table_n_rows / 2 && !is_n_rows_exact)
  {

    n_rows= table_n_rows / 2;

    /* If there are just 0 or 1 rows in the table, then we estimate all rows
    are in the range */

    if (n_rows == 0)
      n_rows= table_n_rows;
  }
#else
  if (n_rows > table_n_rows)
    n_rows= table_n_rows;
#endif

  DBUG_RETURN(n_rows);

error:
  mtr.commit();
  if (UNIV_LIKELY_NULL(heap))
    mem_heap_free(heap);

  DBUG_RETURN(0);
}

/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/

/***********************************************************//**
Gets the offset of the pointer to the externally stored part of a field.
@return offset of the pointer to the externally stored part */
static
ulint
btr_rec_get_field_ref_offs(
/*=======================*/
	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		n)	/*!< in: index of the external field */
{
	ulint	field_ref_offs;
	ulint	local_len;

	ut_a(rec_offs_nth_extern(offsets, n));
	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
	ut_a(len_is_stored(local_len));
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
}

/** Gets a pointer to the externally stored part of a field.
@param rec record
@param offsets rec_get_offsets(rec)
@param n index of the externally stored field
@return pointer to the externally stored part */
#define btr_rec_get_field_ref(rec, offsets, n)			\
	((rec) + btr_rec_get_field_ref_offs(offsets, n))
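
/* For orientation (added annotation): the BLOB pointer occupies the last
BTR_EXTERN_FIELD_REF_SIZE (20) bytes of the locally stored field prefix,
laid out as BTR_EXTERN_SPACE_ID (4 bytes), BTR_EXTERN_PAGE_NO (4 bytes),
BTR_EXTERN_OFFSET (4 bytes) and BTR_EXTERN_LEN (8 bytes). The most
significant byte of BTR_EXTERN_LEN carries the BTR_EXTERN_OWNER_FLAG and
BTR_EXTERN_INHERITED_FLAG bits, while the low 4 bytes store the data
length; hence the reads of BTR_EXTERN_LEN + 4 below. */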

/** Gets the externally stored size of a record, in units of a database page.
@param[in]	rec	record
@param[in]	offsets	array returned by rec_get_offsets()
@return externally stored part, in units of a database page */
ulint
btr_rec_get_externally_stored_len(
	const rec_t*	rec,
	const rec_offs*	offsets)
{
	ulint	n_fields;
	ulint	total_extern_len = 0;
	ulint	i;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));

	if (!rec_offs_any_extern(offsets)) {
		return(0);
	}

	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			ulint	extern_len = mach_read_from_4(
				btr_rec_get_field_ref(rec, offsets, i)
				+ BTR_EXTERN_LEN + 4);

			total_extern_len += ut_calc_align(
				extern_len, ulint(srv_page_size));
		}
	}

	return total_extern_len >> srv_page_size_shift;
}
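
/* Worked example (added annotation, assuming srv_page_size = 16384): a
field whose externally stored part is 70000 bytes contributes
ut_calc_align(70000, 16384) = 81920 bytes, i.e. 81920 >> 14 = 5 pages. */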

/*******************************************************************//**
Sets the ownership bit of an externally stored field in a record. */
static
void
btr_cur_set_ownership_of_extern_field(
/*==================================*/
	buf_block_t*	block,	/*!< in/out: index page */
	rec_t*		rec,	/*!< in/out: clustered index record */
	dict_index_t*	index,	/*!< in: index of the page */
	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		i,	/*!< in: field number */
	bool		val,	/*!< in: value to set */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	byte*	data;
	ulint	local_len;
	ulint	byte_val;

	data = rec_get_nth_field(rec, offsets, i, &local_len);
	ut_ad(rec_offs_nth_extern(offsets, i));
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);

	if (val) {
		byte_val &= ~BTR_EXTERN_OWNER_FLAG;
	} else {
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		byte_val |= BTR_EXTERN_OWNER_FLAG;
	}

	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
		page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr);
	} else {
		mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len
					       + BTR_EXTERN_LEN, byte_val);
	}
}

/*******************************************************************//**
Marks non-updated off-page fields as disowned by this record. The ownership
must be transferred to the updated record which is inserted elsewhere in the
index tree. In purge, only the owner of an externally stored field is allowed
to free the field. */
void
btr_cur_disown_inherited_fields(
/*============================*/
	buf_block_t*	block,	/*!< in/out: index page */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
	const upd_t*	update,	/*!< in: update vector */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	ut_ad(rec_offs_any_extern(offsets));

	for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)
		    && !upd_get_field_by_field_no(update, i, false)) {
			btr_cur_set_ownership_of_extern_field(
				block, rec, index, offsets, i, false, mtr);
		}
	}
}

/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that is
not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	buf_block_t*	block,	/*!< in/out: index page */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	if (!rec_offs_any_extern(offsets)) {
		return;
	}

	const ulint n = rec_offs_n_fields(offsets);

	for (ulint i = 0; i < n; i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			btr_cur_set_ownership_of_extern_field(
				block, rec, index, offsets, i, true, mtr);
		}
	}
}

/*******************************************************************//**
Returns the length of a BLOB part stored on the header page.
@return part length */
static
uint32_t
btr_blob_get_part_len(
/*==================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
}

/*******************************************************************//**
Returns the page number where the next BLOB part is stored.
@return page number or FIL_NULL if no more pages */
static
uint32_t
btr_blob_get_next_page_no(
/*======================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
}
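
/* For orientation (added annotation): on an uncompressed BLOB page the
payload begins at FIL_PAGE_DATA with an 8-byte header consisting of
BTR_BLOB_HDR_PART_LEN (4 bytes) and BTR_BLOB_HDR_NEXT_PAGE_NO (4 bytes),
followed by the BLOB part itself; the next-page numbers link the parts of
one externally stored field into a singly linked chain of pages. */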

/** Deallocate a buffer block that was reserved for a BLOB part.
@param block   buffer block
@param all     flag whether to remove a ROW_FORMAT=COMPRESSED page
@param mtr     mini-transaction to commit */
static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
{
  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
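  /* Note (added annotation): the buffer-fix acquired here keeps the block
  pinned in the buffer pool while mtr->commit() releases the page latch, so
  that the block cannot be evicted before buf_pool.mutex is acquired and the
  fix is released below. */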
  block->page.fix();
#ifdef UNIV_DEBUG
  const page_id_t page_id{block->page.id()};
  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
#endif
  mtr->commit();

  mysql_mutex_lock(&buf_pool.mutex);
  block->page.unfix();
  ut_ad(block->page.id() == page_id);
  ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain));

  if (!buf_LRU_free_page(&block->page, all) && all && block->page.zip.data)
    /* Attempt to deallocate the redundant copy of the uncompressed page
    if the whole ROW_FORMAT=COMPRESSED block cannot be deallocated. */
    buf_LRU_free_page(&block->page, false);

  mysql_mutex_unlock(&buf_pool.mutex);
}

/** Helper class used while writing blob pages, during insert or update. */
struct btr_blob_log_check_t {
	/** Persistent cursor on a clustered index record with blobs. */
	btr_pcur_t*	m_pcur;
	/** Mini-transaction holding the latches for m_pcur */
	mtr_t*		m_mtr;
	/** rec_get_offsets(rec, index); offset of clust_rec */
	const rec_offs*	m_offsets;
	/** The block containing clustered record */
	buf_block_t**	m_block;
	/** The clustered record pointer */
	rec_t**		m_rec;
	/** The blob operation code */
	enum blob_op	m_op;

	/** Constructor
	@param[in]	pcur		persistent cursor on a clustered
					index record with blobs.
	@param[in]	mtr		mini-transaction holding latches for
					pcur.
	@param[in]	offsets		offsets of the clust_rec
	@param[in,out]	block		record block containing pcur record
	@param[in,out]	rec		the clustered record pointer
	@param[in]	op		the blob operation code */
	btr_blob_log_check_t(
		btr_pcur_t*	pcur,
		mtr_t*		mtr,
		const rec_offs*	offsets,
		buf_block_t**	block,
		rec_t**		rec,
		enum blob_op	op)
		: m_pcur(pcur),
		  m_mtr(mtr),
		  m_offsets(offsets),
		  m_block(block),
		  m_rec(rec),
		  m_op(op)
	{
		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
		ut_ad((*m_block)->page.frame == page_align(*m_rec));
		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
	}

	/** Check if there is enough space in the log file. Commit and
	restart the mini-transaction. */
	void check()
	{
		dict_index_t*	index = m_pcur->index();
		ulint		offs = 0;
		uint32_t	page_no = FIL_NULL;

		if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
			offs = *m_rec - (*m_block)->page.frame;
			ut_ad(offs == page_offset(*m_rec));
			page_no = (*m_block)->page.id().page_no();
			(*m_block)->page.fix();
			ut_ad(page_no != FIL_NULL);
		} else {
			btr_pcur_store_position(m_pcur, m_mtr);
		}
		m_mtr->commit();

		DEBUG_SYNC_C("blob_write_middle");

		const mtr_log_t log_mode = m_mtr->get_log_mode();
		m_mtr->start();
		m_mtr->set_log_mode(log_mode);
		index->set_modified(*m_mtr);

		log_free_check();

		DEBUG_SYNC_C("blob_write_middle_after_check");

		if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
			dberr_t err;
			if (UNIV_LIKELY(index->page != page_no)) {
				ut_a(btr_root_block_get(index, RW_SX_LATCH,
							m_mtr, &err));
			}
			m_pcur->btr_cur.page_cur.block = btr_block_get(
				*index, page_no, RW_X_LATCH, m_mtr);
			/* The page should not be evicted or corrupted while
			we are holding a buffer-fix on it. */
			m_pcur->btr_cur.page_cur.block->page.unfix();
			m_pcur->btr_cur.page_cur.rec
				= m_pcur->btr_cur.page_cur.block->page.frame
				+ offs;
		} else {
			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
			mtr_sx_lock_index(index, m_mtr);
			ut_a(m_pcur->restore_position(
			      BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED,
			      m_mtr) == btr_pcur_t::SAME_ALL);
		}

		*m_block	= btr_pcur_get_block(m_pcur);
		*m_rec		= btr_pcur_get_rec(m_pcur);

		rec_offs_make_valid(*m_rec, index, true,
				    const_cast<rec_offs*>(m_offsets));

		ut_ad(m_mtr->memo_contains_page_flagged(
		      *m_rec,
		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));

		ut_ad((m_op == BTR_STORE_INSERT_BULK)
		      == !m_mtr->memo_contains_flagged(&index->lock,
						       MTR_MEMO_SX_LOCK
						       | MTR_MEMO_X_LOCK));
	}
};
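
/* Note (added annotation): btr_store_big_rec_extern_fields() below calls
check() every few BLOB pages (see commit_freq there), so that
log_free_check() can run while no BLOB page latches are held; the cursor
position is restored afterwards. */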

/*******************************************************************//**
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec.  The extern flags in rec will have to be set beforehand.
The fields are stored on pages allocated from the leaf node
file segment of the index tree.

TODO: If the allocation extends the tablespace, it will not be redo logged, in
any mini-transaction.  Tablespace extension should be redo-logged, so that
recovery will not fail when the big_rec was written to the extended portion of
the file, in case the file was somehow truncated in the crash.

@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
dberr_t
btr_store_big_rec_extern_fields(
/*============================*/
	btr_pcur_t*	pcur,		/*!< in: a persistent cursor */
	rec_offs*	offsets,	/*!< in/out: rec_get_offsets() on
					pcur. the "external storage" flags
					in offsets will correctly correspond
					to rec when this function returns */
	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
					to be stored externally */
	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
					latches to the clustered index. can be
					committed and restarted. */
	enum blob_op	op)		/*!< in: operation code */
{
	byte*		field_ref;
	ulint		extern_len;
	ulint		store_len;
	ulint		i;
	mtr_t		mtr;
	mem_heap_t*	heap = NULL;
	page_zip_des_t*	page_zip;
	z_stream	c_stream;
	dberr_t		error		= DB_SUCCESS;
	dict_index_t*	index		= pcur->index();
	buf_block_t*	rec_block	= btr_pcur_get_block(pcur);
	rec_t*		rec		= btr_pcur_get_rec(pcur);

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(rec_offs_any_extern(offsets));
	ut_ad(op == BTR_STORE_INSERT_BULK
	      || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
						| MTR_MEMO_SX_LOCK));
	ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
	ut_a(dict_index_is_clust(index));

	if (!fil_page_index_page_check(btr_pcur_get_page(pcur))) {
		if (op != BTR_STORE_INSERT_BULK) {
			return DB_PAGE_CORRUPTED;
		}
	}

	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
				      &rec, op);
	page_zip = buf_block_get_page_zip(rec_block);

	if (page_zip) {
		int	err;

		/* Zlib deflate needs 128 kilobytes for the default
		window size, plus 512 << memLevel, plus a few
		kilobytes for small objects.  We use reduced memLevel
		to limit the memory consumption, and preallocate the
		heap, hoping to avoid memory fragmentation. */
		heap = mem_heap_create(250000);
		page_zip_set_alloc(&c_stream, heap);

		err = deflateInit2(&c_stream, int(page_zip_level),
				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
		ut_a(err == Z_OK);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must either be zero or they must be pointers to inherited
	columns, owned by this record or an earlier record version. */
	for (i = 0; i < big_rec_vec->n_fields; i++) {
		field_ref = btr_rec_get_field_ref(
			rec, offsets, big_rec_vec->fields[i].field_no);

		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
		/* Either this must be an update in place,
		or the BLOB must be inherited, or the BLOB pointer
		must be zero (will be written in this function). */
		ut_a(op == BTR_STORE_UPDATE
		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
		     || !memcmp(field_ref, field_ref_zero,
				BTR_EXTERN_FIELD_REF_SIZE));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

	/* Space available in compressed page to carry blob data */
	const ulint	payload_size_zip = rec_block->physical_size()
		- FIL_PAGE_DATA;

	/* Space available in uncompressed page to carry blob data */
	const ulint	payload_size = payload_size_zip
		- (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
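
	/* Worked example (added annotation, assuming a 16384-byte page with
	FIL_PAGE_DATA = 38 and FIL_PAGE_DATA_END = 8): payload_size_zip =
	16384 - 38 = 16346, and payload_size = 16346 - (8 + 8) = 16330, so an
	uncompressed BLOB occupies one page per 16330 bytes stored. */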

	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */

	for (i = 0; i < big_rec_vec->n_fields; i++) {
		const ulint field_no = big_rec_vec->fields[i].field_no;

		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		/* A zero BLOB pointer should have been initially inserted. */
		ut_a(!memcmp(field_ref, field_ref_zero,
			     BTR_EXTERN_FIELD_REF_SIZE));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		extern_len = big_rec_vec->fields[i].len;
		MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
		ut_a(extern_len > 0);

		uint32_t prev_page_no = FIL_NULL;

		if (page_zip) {
			int	err = deflateReset(&c_stream);
			ut_a(err == Z_OK);

			c_stream.next_in = (Bytef*)
				big_rec_vec->fields[i].data;
			c_stream.avail_in = static_cast<uInt>(extern_len);
		}

		for (ulint blob_npages = 0;; ++blob_npages) {
			buf_block_t*	block;
			const ulint	commit_freq = 4;

			ut_ad(page_align(field_ref) == page_align(rec));

			if (!(blob_npages % commit_freq)) {

				redo_log.check();

				field_ref = btr_rec_get_field_ref(
					rec, offsets, field_no);

				page_zip = buf_block_get_page_zip(rec_block);
			}

			ut_ad(btr_mtr->get_already_latched(
				      page_id_t{index->table->space_id, index->page},
				      MTR_MEMO_PAGE_SX_FIX));

			mtr.start();
			index->set_modified(mtr);
			mtr.set_log_mode_sub(*btr_mtr);

			rec_block->page.fix();
			rec_block->page.lock.x_lock();

			mtr.memo_push(rec_block, MTR_MEMO_PAGE_X_FIX);
#ifdef BTR_CUR_HASH_ADAPT
			ut_ad(!btr_search_check_marked_free_index(rec_block));
#endif

			uint32_t hint_prev = prev_page_no;
			if (hint_prev == FIL_NULL) {
				hint_prev = rec_block->page.id().page_no();
			}

			block = btr_page_alloc(index, hint_prev + 1,
					       FSP_NO_DIR, 0, &mtr, &mtr,
					       &error);

			if (!block) {
alloc_fail:
				mtr.commit();
				goto func_exit;
			}

			const uint32_t space_id = block->page.id().space();
			const uint32_t page_no = block->page.id().page_no();

			if (prev_page_no == FIL_NULL) {
			} else if (buf_block_t* prev_block =
				   buf_page_get_gen(page_id_t(space_id,
							  prev_page_no),
						    rec_block->zip_size(),
						    RW_X_LATCH, nullptr,
						    BUF_GET, &mtr, &error)) {
				if (page_zip) {
					mtr.write<4>(*prev_block,
						     prev_block->page.frame
						     + FIL_PAGE_NEXT,
						     page_no);
					memcpy_aligned<4>(
						buf_block_get_page_zip(
							prev_block)
						->data + FIL_PAGE_NEXT,
						prev_block->page.frame
						+ FIL_PAGE_NEXT, 4);
				} else {
					mtr.write<4>(*prev_block,
						     BTR_BLOB_HDR_NEXT_PAGE_NO
						     + FIL_PAGE_DATA
						     + prev_block->page.frame,
						     page_no);
				}
			} else {
				goto alloc_fail;
			}

			ut_ad(!page_has_siblings(block->page.frame));
			ut_ad(!fil_page_get_type(block->page.frame));

			if (page_zip) {
				int		err;
				page_zip_des_t*	blob_page_zip;

				mtr.write<1>(*block,
					     FIL_PAGE_TYPE + 1
					     + block->page.frame,
					     prev_page_no == FIL_NULL
					     ? FIL_PAGE_TYPE_ZBLOB
					     : FIL_PAGE_TYPE_ZBLOB2);
				block->page.zip.data[FIL_PAGE_TYPE + 1]
					= block->page.frame[FIL_PAGE_TYPE + 1];

				c_stream.next_out = block->page.frame
					+ FIL_PAGE_DATA;
				c_stream.avail_out = static_cast<uInt>(
					payload_size_zip);

				err = deflate(&c_stream, Z_FINISH);
				ut_a(err == Z_OK || err == Z_STREAM_END);
				ut_a(err == Z_STREAM_END
				     || c_stream.avail_out == 0);

				mtr.memcpy(*block,
					   FIL_PAGE_DATA,
					   page_zip_get_size(page_zip)
					   - FIL_PAGE_DATA
					   - c_stream.avail_out);
				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
				blob_page_zip = buf_block_get_page_zip(block);
				ut_ad(blob_page_zip);
				ut_ad(page_zip_get_size(blob_page_zip)
				      == page_zip_get_size(page_zip));
				memcpy(blob_page_zip->data, block->page.frame,
				       page_zip_get_size(page_zip));

				if (err == Z_OK && prev_page_no != FIL_NULL) {

					goto next_zip_page;
				}

				if (err == Z_STREAM_END) {
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN, 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN + 4,
							c_stream.total_in);
				} else {
					memset(field_ref + BTR_EXTERN_LEN,
					       0, 8);
				}

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_SPACE_ID,
							space_id);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_PAGE_NO,
							page_no);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_OFFSET,
							FIL_PAGE_NEXT);
				}

				/* In bulk insert, the page is compressed
				only when the bulk insert finishes, so the
				BLOB pointer is written to the compressed
				page only outside of bulk insert. */
				if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
					page_zip_write_blob_ptr(
						rec_block, rec, index, offsets,
						field_no, &mtr);
				}

next_zip_page:
				prev_page_no = page_no;

				/* Commit mtr and release the
				uncompressed page frame to save memory. */
				btr_blob_free(block, FALSE, &mtr);

				if (err == Z_STREAM_END) {
					break;
				}
			} else {
				mtr.write<1>(*block, FIL_PAGE_TYPE + 1
					     + block->page.frame,
					     FIL_PAGE_TYPE_BLOB);

				if (extern_len > payload_size) {
					store_len = payload_size;
				} else {
					store_len = extern_len;
				}

				mtr.memcpy<mtr_t::MAYBE_NOP>(
					*block,
					FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
					+ block->page.frame,
					static_cast<const byte*>
					(big_rec_vec->fields[i].data)
					+ big_rec_vec->fields[i].len
					- extern_len, store_len);
				mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
					     + FIL_PAGE_DATA
					     + block->page.frame,
					     store_len);
				compile_time_assert(FIL_NULL == 0xffffffff);
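				/* Writing four 0xff bytes below sets the
				next-page number to FIL_NULL, i.e. this is
				for now the last page of the chain (added
				annotation). */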
				mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
					   + FIL_PAGE_DATA, 4, 0xff);

				extern_len -= store_len;

				ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
							+ field_ref));
				mtr.write<4>(*rec_block,
					     BTR_EXTERN_LEN + 4 + field_ref,
					     big_rec_vec->fields[i].len
					     - extern_len);

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mtr.write<4,mtr_t::MAYBE_NOP>(
						*rec_block,
						field_ref + BTR_EXTERN_SPACE_ID,
						space_id);

					mtr.write<4>(*rec_block, field_ref
						     + BTR_EXTERN_PAGE_NO,
						     page_no);

					mtr.write<4>(*rec_block, field_ref
						     + BTR_EXTERN_OFFSET,
						     FIL_PAGE_DATA);
				}

				prev_page_no = page_no;

				mtr.commit();

				if (extern_len == 0) {
					break;
				}
			}
		}

		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;);

		rec_offs_make_nth_extern(offsets, field_no);
	}

func_exit:
	if (page_zip) {
		deflateEnd(&c_stream);
	}

	if (heap != NULL) {
		mem_heap_free(heap);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must be valid. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
		}

		field_ref = btr_rec_get_field_ref(rec, offsets, i);

		/* The pointer must not be zero if the operation
		succeeded. */
		ut_a(0 != memcmp(field_ref, field_ref_zero,
				 BTR_EXTERN_FIELD_REF_SIZE)
		     || error != DB_SUCCESS);
		/* The column must not be disowned by this record. */
		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	return(error);
}

/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page.
@param block   uncompressed BLOB page
@param op      operation
@return whether the type is invalid */
static bool btr_check_blob_fil_page_type(const buf_block_t& block,
                                         const char *op)
{
  uint16_t type= fil_page_get_type(block.page.frame);

  if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB));
  else if (fil_space_t *space= fil_space_t::get(block.page.id().space()))
  {
    /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB
    pages.  Do not print anything about the type mismatch when reading
    a BLOB page that may be from old versions. */
    bool fail= space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags);
    if (fail)
      sql_print_error("InnoDB: FIL_PAGE_TYPE=%u on BLOB %s file %s page %u",
                      type, op, space->chain.start->name,
                      block.page.id().page_no());
    space->release();
    return fail;
  }
  return false;
}

/*******************************************************************//**
Frees the space in an externally stored field to the file space
management if the field in data is owned by the externally stored field.
In a rollback we may have the additional condition that the field must
not be inherited. */
void
btr_free_externally_stored_field(
/*=============================*/
	dict_index_t*	index,		/*!< in: index of the data, the index
					tree MUST be X-latched; if the tree
					height is 1, then also the root page
					must be X-latched! (this is relevant
					in the case this function is called
					from purge where 'data' is located on
					an undo log page, not an index
					page) */
	byte*		field_ref,	/*!< in/out: field reference */
	const rec_t*	rec,		/*!< in: record containing field_ref, for
					page_zip_write_blob_ptr(), or NULL */
	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
					or NULL */
	buf_block_t*	block,		/*!< in/out: page of field_ref */
	ulint		i,		/*!< in: field number of field_ref;
					ignored if rec == NULL */
	bool		rollback,	/*!< in: performing rollback? */
	mtr_t*		local_mtr)	/*!< in: mtr
					containing the latch to data and an
					X-latch to the index tree */
{
	const uint32_t	space_id	= mach_read_from_4(
		field_ref + BTR_EXTERN_SPACE_ID);

	ut_ad(index->is_primary());
	ut_ad(block->page.lock.have_x());
	ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
					       | MTR_MEMO_SX_LOCK));
	ut_ad(local_mtr->memo_contains_page_flagged(field_ref,
						    MTR_MEMO_PAGE_X_FIX));
	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
	ut_ad(index->table->space_id == index->table->space->id);
	ut_ad(local_mtr->is_named_space(index->table->space));

	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
				  BTR_EXTERN_FIELD_REF_SIZE))) {
		/* In the rollback, we may encounter a clustered index
		record with some unwritten off-page columns. There is
		nothing to free then. */
		ut_a(rollback);
		return;
	}

	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
	        & ~((BTR_EXTERN_OWNER_FLAG
	             | BTR_EXTERN_INHERITED_FLAG) << 24)));
	ut_ad(space_id == index->table->space_id);

	const ulint ext_zip_size = index->table->space->zip_size();
	/* !rec holds in a call from purge when field_ref is in an undo page */
	ut_ad(rec || !block->page.zip.data);

	for (;;) {
		mtr_t mtr;

		mtr.start();
		mtr.set_spaces(*local_mtr);
		mtr.set_log_mode_sub(*local_mtr);

		ut_ad(!index->table->is_temporary()
		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);

		const uint32_t page_no = mach_read_from_4(
			field_ref + BTR_EXTERN_PAGE_NO);
		buf_block_t* ext_block;

		if (/* There is no external storage data */
		    page_no == FIL_NULL
		    /* This field does not own the externally stored field */
		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			& BTR_EXTERN_OWNER_FLAG)
		    /* Rollback and inherited field */
		    || (rollback
			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			    & BTR_EXTERN_INHERITED_FLAG))) {
skip_free:
			/* Do not free */
			mtr.commit();

			return;
		}

		ext_block = buf_page_get(page_id_t(space_id, page_no),
					 ext_zip_size, RW_X_LATCH, &mtr);

		if (!ext_block) {
			goto skip_free;
		}

		/* The buffer pool block containing the BLOB pointer is
		exclusively latched by local_mtr. To satisfy some design
		constraints, we must recursively latch it in mtr as well. */
		block->fix();
		block->page.lock.x_lock();

		mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
#ifdef BTR_CUR_HASH_ADAPT
		ut_ad(!btr_search_check_marked_free_index(block));
#endif

		const page_t* page = buf_block_get_frame(ext_block);

		if (ext_zip_size) {
			/* Note that page_zip will be NULL
			in row_purge_upd_exist_or_extern(). */
			switch (fil_page_get_type(page)) {
			case FIL_PAGE_TYPE_ZBLOB:
			case FIL_PAGE_TYPE_ZBLOB2:
				break;
			default:
				MY_ASSERT_UNREACHABLE();
			}
			const uint32_t next_page_no = mach_read_from_4(
				page + FIL_PAGE_NEXT);

			btr_page_free(index, ext_block, &mtr, true,
				      local_mtr->memo_contains(
					      *index->table->space));

			if (UNIV_LIKELY_NULL(block->page.zip.data)) {
				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
						next_page_no);
				memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4);
				page_zip_write_blob_ptr(block, rec, index,
							offsets, i, &mtr);
			} else {
				mtr.write<4>(*block,
					     BTR_EXTERN_PAGE_NO + field_ref,
					     next_page_no);
				mtr.write<4,mtr_t::MAYBE_NOP>(*block,
							      BTR_EXTERN_LEN
							      + 4 + field_ref,
							      0U);
			}
		} else {
			ut_ad(!block->page.zip.data);
			btr_check_blob_fil_page_type(*ext_block, "purge");

			const uint32_t next_page_no = mach_read_from_4(
				page + FIL_PAGE_DATA
				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
			btr_page_free(index, ext_block, &mtr, true,
				      local_mtr->memo_contains(
					      *index->table->space));

			mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref,
				     next_page_no);
			/* Zero out the BLOB length.  If the server
			crashes during the execution of this function,
			trx_rollback_all_recovered() could
			dereference the half-deleted BLOB, fetching a
			wrong prefix for the BLOB. */
			mtr.write<4,mtr_t::MAYBE_NOP>(*block,
						      BTR_EXTERN_LEN + 4
						      + field_ref, 0U);
		}

		/* Commit mtr and release the BLOB block to save memory. */
		btr_blob_free(ext_block, TRUE, &mtr);
	}
}

/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in/out: record */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
	buf_block_t*	block,	/*!< in: index page of rec */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
	ut_ad(index->is_primary());
	ut_ad(page_rec_is_leaf(rec));
	/* Free possible externally stored fields in the record */

	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			btr_free_externally_stored_field(
				index, btr_rec_get_field_ref(rec, offsets, i),
				rec, offsets, block, i, rollback, mtr);
		}
	}
}

/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in/out: record */
	buf_block_t*	block,	/*!< in: index page of rec */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));

	/* Free possible externally stored fields in the record */

	n_fields = upd_get_n_fields(update);

	for (i = 0; i < n_fields; i++) {
		const upd_field_t* ufield = upd_get_nth_field(update, i);

		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
			ulint	len;
			byte*	data = rec_get_nth_field(
				rec, offsets, ufield->field_no, &len);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);

			btr_free_externally_stored_field(
				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
				rec, offsets, block,
				ufield->field_no, rollback, mtr);
		}
	}
}

/*******************************************************************//**
Copies the prefix of an uncompressed BLOB.  The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_blob_prefix(
/*=================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	uint32_t	len,	/*!< in: length of buf, in bytes */
	page_id_t	id,	/*!< in: page identifier of the first BLOB page */
	uint32_t	offset)	/*!< in: offset on the first BLOB page */
{
	ulint	copied_len	= 0;

	for (;;) {
		mtr_t		mtr;
		buf_block_t*	block;
		const page_t*	page;
		const byte*	blob_header;
		ulint		part_len;
		ulint		copy_len;

		mtr_start(&mtr);

		block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
		if (!block || btr_check_blob_fil_page_type(*block, "read")) {
			mtr.commit();
			return copied_len;
		}
		if (!buf_page_make_young_if_needed(&block->page)) {
			buf_read_ahead_linear(id);
		}

		page = buf_block_get_frame(block);

		blob_header = page + offset;
		part_len = btr_blob_get_part_len(blob_header);
		copy_len = ut_min(part_len, len - copied_len);

		memcpy(buf + copied_len,
		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
		copied_len += copy_len;

		id.set_page_no(btr_blob_get_next_page_no(blob_header));

		mtr_commit(&mtr);

		if (id.page_no() == FIL_NULL || copy_len != part_len) {
			MEM_CHECK_DEFINED(buf, copied_len);
			return(copied_len);
		}

		/* On BLOB pages other than the first one, the BLOB header
		is always at the start of the page data: */

		offset = FIL_PAGE_DATA;

		ut_ad(copied_len <= len);
	}
}

/** Copies the prefix of a compressed BLOB.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the field,
or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
@param[in]	id		page identifier of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_zblob_prefix(
	byte*			buf,
	uint32_t		len,
	ulint			zip_size,
	page_id_t		id,
	uint32_t		offset)
{
	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
	mem_heap_t*	heap;
	int		err;
	z_stream	d_stream;

	d_stream.next_out = buf;
	d_stream.avail_out = static_cast<uInt>(len);
	d_stream.next_in = Z_NULL;
	d_stream.avail_in = 0;

	/* Zlib inflate needs 32 kilobytes for the default
	window size, plus a few kilobytes for small objects. */
	heap = mem_heap_create(40000);
	page_zip_set_alloc(&d_stream, heap);

	ut_ad(zip_size);
	ut_ad(ut_is_2pow(zip_size));
	ut_ad(id.space());

	err = inflateInit(&d_stream);
	ut_a(err == Z_OK);
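
	/* The compressed BLOB is one continuous zlib stream spanning
	the page chain (first page of type FIL_PAGE_TYPE_ZBLOB, any
	further pages FIL_PAGE_TYPE_ZBLOB2).  As a rough sketch of
	what each iteration of the loop below does: the 4-byte next
	page number is read at 'offset' (on the first page, the
	offset recorded in the BLOB reference; on later pages,
	FIL_PAGE_NEXT), the compressed payload follows, and the
	remainder of the zip_size page frame is fed to inflate(),
	which decompresses into buf. */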

	for (;;) {
		buf_page_t*	bpage;
		uint32_t	next_page_no;

		bpage = buf_page_get_zip(id);

		if (UNIV_UNLIKELY(!bpage)) {
			goto func_exit;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {

			ib::error() << "Unexpected type "
				<< fil_page_get_type(bpage->zip.data)
				<< " of compressed BLOB page " << id;

			ut_ad(0);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the next page pointer is stored at
			the page header, the compressed data payload
			does not immediately follow it, but starts
			at FIL_PAGE_DATA. */
			offset = FIL_PAGE_DATA;
		} else {
			offset += 4;
		}

		d_stream.next_in = bpage->zip.data + offset;
		d_stream.avail_in = uInt(zip_size - offset);

		err = inflate(&d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream.avail_out) {
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* fall through */
		default:
inflate_error:
			ib::error() << "inflate() of compressed BLOB page "
				<< id
				<< " returned " << err
				<< " (" << d_stream.msg << ")";

		case Z_BUF_ERROR:
			goto end_of_blob;
		}

		if (next_page_no == FIL_NULL) {
			if (!d_stream.avail_in) {
				ib::error()
					<< "Unexpected end of compressed "
					<< "BLOB page " << id;
			} else {
				err = inflate(&d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			bpage->lock.s_unlock();
			goto func_exit;
		}

		bpage->lock.s_unlock();

		/* On all BLOB pages except the first, the next page
		pointer is at FIL_PAGE_NEXT in the page header: */

		id.set_page_no(next_page_no);
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}

func_exit:
	inflateEnd(&d_stream);
	mem_heap_free(heap);
	MEM_CHECK_DEFINED(buf, d_stream.total_out);
	return(d_stream.total_out);
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the
field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	id		page identifier of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_externally_stored_field_prefix_low(
	byte*			buf,
	uint32_t		len,
	ulint			zip_size,
	page_id_t		id,
	uint32_t		offset)
{
  if (len == 0)
    return 0;

  return zip_size
    ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset)
    : btr_copy_blob_prefix(buf, len, id, offset);
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record must be protected by a lock or a page latch.
@param[out]	buf		the field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	local_len	length of data, in bytes
@return the length of the copied field, or 0 if the column is being
deleted or has been deleted */
ulint
btr_copy_externally_stored_field_prefix(
	byte*			buf,
	ulint			len,
	ulint			zip_size,
	const byte*		data,
	ulint			local_len)
{
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	if (UNIV_UNLIKELY(local_len >= len)) {
		memcpy(buf, data, len);
		return(len);
	}

	memcpy(buf, data, local_len);
	data += local_len;

	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
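
	/* data now points to the 20-byte (BTR_EXTERN_FIELD_REF_SIZE)
	reference to the externally stored part.  As a rough sketch
	(the authoritative offsets are the BTR_EXTERN_* constants
	declared in btr0cur.h):

		+ 0: BTR_EXTERN_SPACE_ID	(4 bytes)
		+ 4: BTR_EXTERN_PAGE_NO		(4 bytes)
		+ 8: BTR_EXTERN_OFFSET		(4 bytes)
		+12: BTR_EXTERN_LEN		(8 bytes; the code here
		     reads only the low 4 bytes, at BTR_EXTERN_LEN + 4,
		     since a BLOB cannot currently exceed 4 GiB) */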

	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
		/* The externally stored part of the column has been
		(partially) deleted.  Signal the half-deleted BLOB
		to the caller. */

		return(0);
	}

	uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
	uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
	uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
	len -= local_len;

	return(local_len
	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
							     uint32_t(len),
							     zip_size,
							     page_id_t(
								     space_id,
								     page_no),
							     offset));
}

/** Copies an externally stored field of a record to mem heap.
The clustered index record must be protected by a lock or a page latch.
@param[out]	len		length of the whole field
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	local_len	length of data, in bytes
@param[in,out]	heap		mem heap
@return the whole field copied to heap */
byte*
btr_copy_externally_stored_field(
	ulint*			len,
	const byte*		data,
	ulint			zip_size,
	ulint			local_len,
	mem_heap_t*		heap)
{
	byte*	buf;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
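
	/* The 20-byte field reference now begins at data + local_len;
	see the layout sketch in
	btr_copy_externally_stored_field_prefix(). */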

	uint32_t space_id = mach_read_from_4(data + local_len
					     + BTR_EXTERN_SPACE_ID);
	uint32_t page_no = mach_read_from_4(data + local_len
					    + BTR_EXTERN_PAGE_NO);
	uint32_t offset = mach_read_from_4(data + local_len
					   + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	uint32_t extern_len = mach_read_from_4(data + local_len
					       + BTR_EXTERN_LEN + 4);

	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);

	memcpy(buf, data, local_len);
	*len = local_len
		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
							      extern_len,
							      zip_size,
							      page_id_t(
								      space_id,
								      page_no),
							      offset);

	return(buf);
}

/** Copies an externally stored field of a record to mem heap.
@param[in]	rec		record in a clustered index; must be
protected by a lock or a page latch
@param[in]	offsets		array returned by rec_get_offsets()
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	no		field number
@param[out]	len		length of the field
@param[in,out]	heap		mem heap
@return the field copied to heap, or NULL if the field is incomplete */
byte*
btr_rec_copy_externally_stored_field(
	const rec_t*		rec,
	const rec_offs*		offsets,
	ulint			zip_size,
	ulint			no,
	ulint*			len,
	mem_heap_t*		heap)
{
	ulint		local_len;
	const byte*	data;

	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field can contain some initial
	data from the field, and in the last 20 bytes it has the
	space id, page number, and offset where the rest of the
	field data is stored, and the data length in addition to
	the data stored locally. We may need to store some data
	locally to get the local record length above the 128 byte
	limit so that field offsets are stored in two bytes, and
	the extern bit is available in those two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	if (UNIV_UNLIKELY
	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
		/* The externally stored field was not written yet.
		This record should only be seen by
		trx_rollback_recovered() or any
		TRX_ISO_READ_UNCOMMITTED transactions. */
		return(NULL);
	}

	return(btr_copy_externally_stored_field(len, data,
						zip_size, local_len, heap));
}
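
/* A minimal usage sketch for the function above (hypothetical caller;
the real callers live in modules such as row0sel.cc).  It assumes that
rec is a clustered index record protected by a lock or a page latch,
that offsets was built by rec_get_offsets(), that field_no refers to a
column with the extern bit set, and that zip_size is the
ROW_FORMAT=COMPRESSED page size of the tablespace, or 0:

	mem_heap_t*	heap = mem_heap_create(1024);
	ulint		len;
	byte*		field = btr_rec_copy_externally_stored_field(
		rec, offsets, zip_size, field_no, &len, heap);

	if (field) {
		... use field[0..len) ...
	} else {
		... incomplete BLOB; only expected during the rollback
		of recovered transactions or under READ UNCOMMITTED ...
	}

	mem_heap_free(heap);
*/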