mirror of
				https://github.com/MariaDB/server.git
				synced 2025-10-25 08:58:14 +02:00 
			
		
		
		
	 b7b9f3ce82
			
		
	
	
	b7b9f3ce82
	
	
	
		
			
			In a Sysbench oltp_update_index workload that involves 1 table,
a serious contention between the workload and the purge of history
was observed. This was the worst when the table contained only 1 record.
This turned out to be fixed by setting innodb_purge_batch_size=128,
which corresponds to the number of usable persistent rollback segments.
When we go above that, there would be contention between row_purge_poss_sec()
and the workload, typically on the clustered index page latch, sometimes
also on a secondary index page latch. It might be that with smaller
batches, trx_sys.history_size() will end up pausing all concurrent
transaction start/commit frequently enough so that purge will be able
to make some progress, so that there would be less contention on the
index page latches between purge and SQL execution.
In commit aa719b5010 (part of MDEV-32050)
the interpretation of the parameter innodb_purge_batch_size was slightly
changed. It would correspond to the maximum desired size of the
purge_sys.pages cache. Before that change, the parameter was referring to
a number of undo log pages, but the accounting might have been inaccurate.
To avoid a regression, we will reduce the default value to
innodb_purge_batch_size=127, which will also be compatible with
innodb_undo_tablespaces>1 (which will disable rollback segment 0).
Additionally, some logic in the purge and MVCC checks is simplified.
The purge tasks will make use of purge_sys.pages when accessing undo
log pages to find out if a secondary index record can be removed.
If an undo page needs to be looked up in buf_pool.page_hash, we will
merely buffer-fix it. This is correct, because the undo pages are
append-only in nature. Holding purge_sys.latch or purge_sys.end_latch
or the fact that the current thread is executing as a part of an
in-progress purge batch will prevent the contents of the undo page from
being freed and subsequently reused. The buffer-fix will prevent the
page from being evicted from the buffer pool. Thanks to this logic,
we can refer to the undo log record directly in the buffer pool page
and avoid copying the record.
buf_pool_t::page_fix(): Look up and buffer-fix a page. This is useful
for accessing undo log pages, which are append-only by nature.
There will be no need to deal with change buffer or ROW_FORMAT=COMPRESSED
in that case.
purge_sys_t::view_guard::view_guard(): Allow the type of guard to be
acquired: end_latch, latch, or no latch (in case we are a purge thread).
purge_sys_t::view_guard::get(): Read-only accessor to purge_sys.pages.
purge_sys_t::get_page(): Invoke buf_pool_t::page_fix().
row_vers_old_has_index_entry(): Replaced with row_purge_is_unsafe()
and row_undo_mod_sec_unsafe().
trx_undo_get_undo_rec(): Merged to trx_undo_prev_version_build().
row_purge_poss_sec(): Add the parameter mtr and remove redundant
or unused parameters sec_pcur, sec_mtr, is_tree. We will use the
caller's mtr object but release any acquired page latches before
returning.
btr_cur_get_page(), page_cur_get_page(): Do not invoke page_align().
row_purge_remove_sec_if_poss_leaf(): Return the value of PAGE_MAX_TRX_ID
to be checked against the page in row_purge_remove_sec_if_poss_tree().
If the secondary index page was not changed meanwhile, it will be
unnecessary to invoke row_purge_poss_sec() again.
trx_undo_prev_version_build(): Access any undo log pages using
the caller's mini-transaction object.
row_purge_vc_matches_cluster(): Moved to the only compilation unit that
needs it.
Reviewed by: Debarun Banerjee
		
	
			
		
			
				
	
	
		
			293 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			293 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*****************************************************************************
 | |
| 
 | |
| Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
 | |
| Copyright (c) 2017, 2022, MariaDB Corporation.
 | |
| 
 | |
| This program is free software; you can redistribute it and/or modify it under
 | |
| the terms of the GNU General Public License as published by the Free Software
 | |
| Foundation; version 2 of the License.
 | |
| 
 | |
| This program is distributed in the hope that it will be useful, but WITHOUT
 | |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 | |
| FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 | |
| 
 | |
| You should have received a copy of the GNU General Public License along with
 | |
| this program; if not, write to the Free Software Foundation, Inc.,
 | |
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
 | |
| 
 | |
| *****************************************************************************/
 | |
| 
 | |
| /**************************************************//**
 | |
| @file include/trx0rec.h
 | |
| Transaction undo log record
 | |
| 
 | |
| Created 3/26/1996 Heikki Tuuri
 | |
| *******************************************************/
 | |
| 
 | |
| #pragma once
 | |
| 
 | |
| #include "trx0types.h"
 | |
| #include "row0types.h"
 | |
| #include "page0types.h"
 | |
| #include "que0types.h"
 | |
| 
 | |
| /**********************************************************************//**
 | |
| Reads the undo log record number.
 | |
| The number is decoded by mach_u64_read_much_compressed() from the bytes
 | |
| that begin 3 bytes after the position that undo_rec points to.
 | |
| @param undo_rec  pointer to the undo log record
 | |
| @return undo no */
 | |
| inline undo_no_t trx_undo_rec_get_undo_no(const trx_undo_rec_t *undo_rec)
 | |
| {
 | |
|   return mach_u64_read_much_compressed(undo_rec + 3);
 | |
| }
 | |
| 
 | |
| /**********************************************************************//**
 | |
| Returns the start of the undo record data area. */
 | |
| #define trx_undo_rec_get_ptr(undo_rec, undo_no)		\
 | |
| 	((undo_rec) + trx_undo_rec_get_offset(undo_no))
 | |
| 
 | |
| /**********************************************************************//**
 | |
| Reads from an undo log record the general parameters.
 | |
| @return remaining part of undo log record after reading these values */
 | |
| const byte*
 | |
| trx_undo_rec_get_pars(
 | |
| /*==================*/
 | |
| 	const trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
 | |
| 	byte*		type,		/*!< out: undo record type:
 | |
| 					TRX_UNDO_INSERT_REC, ... */
 | |
| 	byte*		cmpl_info,	/*!< out: compiler info, relevant only
 | |
| 					for update type records */
 | |
| 	bool*		updated_extern,	/*!< out: true if we updated an
 | |
| 					externally stored field */
 | |
| 	undo_no_t*	undo_no,	/*!< out: undo log record number */
 | |
| 	table_id_t*	table_id)	/*!< out: table id */
 | |
| 	MY_ATTRIBUTE((nonnull));
 | |
| 
 | |
| /*******************************************************************//**
 | |
| Builds a row reference from an undo log record.
 | |
| @return pointer to remaining part of undo record */
 | |
| const byte*
 | |
| trx_undo_rec_get_row_ref(
 | |
| /*=====================*/
 | |
| 	const byte*	ptr,	/*!< in: remaining part of a copy of an undo log
 | |
| 				record, at the start of the row reference;
 | |
| 				NOTE that this copy of the undo log record must
 | |
| 				be preserved as long as the row reference is
 | |
| 				used, as we do NOT copy the data in the
 | |
| 				record! */
 | |
| 	dict_index_t*	index,	/*!< in: clustered index */
 | |
| 	const dtuple_t**ref,	/*!< out, own: row reference */
 | |
| 	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
 | |
| 				needed is allocated */
 | |
| 	MY_ATTRIBUTE((nonnull));
 | |
| /**********************************************************************//**
 | |
| Reads from an undo log update record the system field values of the old
 | |
| version.
 | |
| @return remaining part of undo log record after reading these values */
 | |
| byte*
 | |
| trx_undo_update_rec_get_sys_cols(
 | |
| /*=============================*/
 | |
| 	const byte*	ptr,		/*!< in: remaining part of undo
 | |
| 					log record after reading
 | |
| 					general parameters */
 | |
| 	trx_id_t*	trx_id,		/*!< out: trx id */
 | |
| 	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
 | |
| 	byte*		info_bits);	/*!< out: info bits state */
 | |
| /*******************************************************************//**
 | |
| Builds an update vector based on a remaining part of an undo log record.
 | |
| @return remaining part of the record, NULL if an error detected, which
 | |
| means that the record is corrupted */
 | |
| byte*
 | |
| trx_undo_update_rec_get_update(
 | |
| /*===========================*/
 | |
| 	const byte*	ptr,	/*!< in: remaining part in update undo log
 | |
| 				record, after reading the row reference
 | |
| 				NOTE that this copy of the undo log record must
 | |
| 				be preserved as long as the update vector is
 | |
| 				used, as we do NOT copy the data in the
 | |
| 				record! */
 | |
| 	dict_index_t*	index,	/*!< in: clustered index */
 | |
| 	ulint		type,	/*!< in: TRX_UNDO_UPD_EXIST_REC,
 | |
| 				TRX_UNDO_UPD_DEL_REC, or
 | |
| 				TRX_UNDO_DEL_MARK_REC; in the last case,
 | |
| 				only trx id and roll ptr fields are added to
 | |
| 				the update vector */
 | |
| 	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
 | |
| 	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
 | |
| 	byte		info_bits,/*!< in: info bits from this undo record */
 | |
| 	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
 | |
| 				needed is allocated */
 | |
| 	upd_t**		upd);	/*!< out, own: update vector */
 | |
| /** Report a RENAME TABLE operation.
 | |
| @param[in,out]	trx	transaction
 | |
| @param[in]	table	table that is being renamed
 | |
| @return	DB_SUCCESS or error code */
 | |
| dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
 | |
| 	MY_ATTRIBUTE((nonnull, warn_unused_result));
 | |
| /***********************************************************************//**
 | |
| Writes information to an undo log about an insert, update, or a delete marking
 | |
| of a clustered index record. This information is used in a rollback of the
 | |
| transaction and in consistent reads that must look to the history of this
 | |
| transaction.
 | |
| @return DB_SUCCESS or error code */
 | |
| dberr_t
 | |
| trx_undo_report_row_operation(
 | |
| /*==========================*/
 | |
| 	que_thr_t*	thr,		/*!< in: query thread */
 | |
| 	dict_index_t*	index,		/*!< in: clustered index */
 | |
| 	const dtuple_t*	clust_entry,	/*!< in: in the case of an insert,
 | |
| 					index entry to insert into the
 | |
| 					clustered index; in updates,
 | |
| 					may contain a clustered index
 | |
| 					record tuple that also contains
 | |
| 					virtual columns of the table;
 | |
| 					otherwise, NULL */
 | |
| 	const upd_t*	update,		/*!< in: in the case of an update,
 | |
| 					the update vector, otherwise NULL */
 | |
| 	ulint		cmpl_info,	/*!< in: compiler info on secondary
 | |
| 					index updates */
 | |
| 	const rec_t*	rec,		/*!< in: case of an update or delete
 | |
| 					marking, the record in the clustered
 | |
| 					index; NULL if insert */
 | |
| 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
 | |
| 	roll_ptr_t*	roll_ptr)	/*!< out: DB_ROLL_PTR to the
 | |
| 					undo log record */
 | |
| 	MY_ATTRIBUTE((nonnull(1,2,8), warn_unused_result));
 | |
| 
 | |
| /** status bit used for trx_undo_prev_version_build() */
 | |
| 
 | |
| /** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it
 | |
| is being called by purge and we would like to get the purge record
 | |
| even if it is in the purge view (in normal case, it will return without
 | |
| fetching the purge record) */
 | |
| static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1;
 | |
| 
 | |
| /** This tells trx_undo_prev_version_build() to fetch the old value in
 | |
| the undo log (which is the after image for an update) */
 | |
| static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2;
 | |
| 
 | |
| /** indicate a call from row_undo_mod_sec_is_unsafe() */
 | |
| static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4;
 | |
| 
 | |
| /** indicate a call from row_purge_is_unsafe() */
 | |
| static constexpr ulint TRX_UNDO_CHECK_PURGE_PAGES = 8;
 | |
| 
 | |
| /** Build a previous version of a clustered index record. The caller
 | |
| must hold a latch on the index page of the clustered index record.
 | |
| @param rec       version of a clustered index record
 | |
| @param index     clustered index
 | |
| @param offsets   rec_get_offsets(rec, index)
 | |
| @param heap      memory heap from which the memory needed is allocated
 | |
| @param old_vers  previous version, or NULL if rec is the first inserted
 | |
|                  version, or if history data has been deleted (an error),
 | |
|                  or if the purge could have removed the version though
 | |
|                  it has not yet done so
 | |
| @param mtr       mini-transaction
 | |
| @param v_status  TRX_UNDO_PREV_IN_PURGE, ...
 | |
| @param v_heap    memory heap used to create vrow dtuple if it is not yet
 | |
|                  created. This heap differs from "heap" above in that it could be
 | |
|                  prebuilt->old_vers_heap for selection
 | |
| @param vrow      virtual column info, if any
 | |
| @return error code
 | |
| @retval DB_SUCCESS if previous version was successfully built,
 | |
| or if it was an insert or the undo record refers to the table before rebuild
 | |
| @retval DB_MISSING_HISTORY if the history is missing */
 | |
| dberr_t trx_undo_prev_version_build(const rec_t *rec, dict_index_t *index,
 | |
|                                     rec_offs *offsets, mem_heap_t *heap,
 | |
|                                     rec_t **old_vers, mtr_t *mtr,
 | |
|                                     ulint v_status,
 | |
|                                     mem_heap_t *v_heap, dtuple_t **vrow);
 | |
| 
 | |
| /** Read from an undo log record a non-virtual column value.
 | |
| @param ptr	pointer to remaining part of the undo record
 | |
| @param field	stored field
 | |
| @param len	length of the field, or UNIV_SQL_NULL
 | |
| @param orig_len	original length of the locally stored part
 | |
| of an externally stored column, or 0
 | |
| @return remaining part of undo log record after reading these values */
 | |
| const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
 | |
|                                      uint32_t *len, uint32_t *orig_len);
 | |
| 
 | |
| /** Read virtual column value from undo log
 | |
| @param[in]	table		the table
 | |
| @param[in]	ptr		undo log pointer
 | |
| @param[in,out]	row		the dtuple to fill
 | |
| @param[in]	in_purge	whether this is called by purge */
 | |
| void
 | |
| trx_undo_read_v_cols(
 | |
| 	const dict_table_t*	table,
 | |
| 	const byte*		ptr,
 | |
| 	dtuple_t*		row,
 | |
| 	bool			in_purge);
 | |
| 
 | |
| /** Read virtual column index from undo log if the undo log contains such
 | |
| info, and verify the column is still indexed, and output its position
 | |
| @param[in]	table		the table
 | |
| @param[in]	ptr		undo log pointer
 | |
| @param[in]	first_v_col	if this is the first virtual column, which
 | |
| 				has the version marker
 | |
| @param[in,out]	is_undo_log	this function is used to parse both undo log,
 | |
| 				and online log for virtual columns. So
 | |
| 				check to see if this is undo log
 | |
| @param[out]	field_no	the column number, or FIL_NULL if not indexed
 | |
| @return remaining part of undo log record after reading these values */
 | |
| const byte*
 | |
| trx_undo_read_v_idx(
 | |
| 	const dict_table_t*	table,
 | |
| 	const byte*		ptr,
 | |
| 	bool			first_v_col,
 | |
| 	bool*			is_undo_log,
 | |
| 	uint32_t*		field_no);
 | |
| 
 | |
| /* Types of an undo log record: these have to be smaller than 16, as the
 | |
| compilation info multiplied by 16 is ORed to this value in an undo log
 | |
| record */
 | |
| 
 | |
| /** Undo log records for DDL operations
 | |
| 
 | |
| These type codes sit directly below the DML record types
 | |
| (TRX_UNDO_INSERT_REC = 11 ... TRX_UNDO_EMPTY = 15) and, like them, have to
 | |
| be smaller than 16, because the compilation info multiplied by
 | |
| TRX_UNDO_CMPL_INFO_MULT (16) is ORed to the type in an undo log record.
 | |
| 
 | |
| Note: special rollback and purge triggers exist for SYS_INDEXES records:
 | |
| @see dict_drop_index_tree() */
 | |
| enum trx_undo_ddl_type
 | |
| {
 | |
|   /** RENAME TABLE (logging the old table name).
 | |
| 
 | |
|   Because SYS_TABLES has PRIMARY KEY(NAME), the row-level undo log records
 | |
|   for SYS_TABLES cannot be distinguished from DROP TABLE, CREATE TABLE. */
 | |
|   TRX_UNDO_RENAME_TABLE= 9,
 | |
|   /** insert a metadata pseudo-record for instant ALTER TABLE */
 | |
|   TRX_UNDO_INSERT_METADATA= 10
 | |
| };
 | |
| 
 | |
| /* DML operations */
 | |
| #define	TRX_UNDO_INSERT_REC	11	/* fresh insert into clustered index */
 | |
| #define	TRX_UNDO_UPD_EXIST_REC	12	/* update of a non-delete-marked
 | |
| 					record */
 | |
| #define	TRX_UNDO_UPD_DEL_REC	13	/* update of a delete marked record to
 | |
| 					a not delete marked record; also the
 | |
| 					fields of the record can change */
 | |
| #define	TRX_UNDO_DEL_MARK_REC	14	/* delete marking of a record; fields
 | |
| 					do not change */
 | |
| /** Bulk insert operation. It is written only when the table is
 | |
| under exclusive lock and the clustered index root page latch is being held,
 | |
| and the clustered index is empty. Rollback will empty the table and
 | |
| free the leaf segment of all indexes, re-create the new
 | |
| leaf segment and re-initialize the root page alone. */
 | |
| #define	TRX_UNDO_EMPTY		15
 | |
| 
 | |
| #define	TRX_UNDO_CMPL_INFO_MULT	16U	/* compilation info is multiplied by
 | |
| 					this and ORed to the type above */
 | |
| #define	TRX_UNDO_UPD_EXTERN	128U	/* This bit can be ORed to type_cmpl
 | |
| 					to denote that we updated external
 | |
| 					storage fields: used by purge to
 | |
| 					free the external storage */
 | |
| 
 | |
| /** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */
 | |
| extern const dtuple_t trx_undo_metadata;
 | |
| 
 | |
| /** Read the table id from an undo log record.
 | |
| The first 3 bytes of the record are skipped, then one much-compressed
 | |
| integer is read and discarded (trx_undo_rec_get_undo_no() decodes the undo
 | |
| number starting at that same offset), and the much-compressed integer read
 | |
| by the second call is returned as the table id.
 | |
| @param[in]      rec        Undo log record
 | |
| @return table id stored as a part of undo log record */
 | |
| inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec)
 | |
| {
 | |
|   rec+= 3; /* only the local copy of the pointer is moved */
 | |
|   mach_read_next_much_compressed(&rec); /* value discarded; presumably advances rec past it */
 | |
|   return mach_read_next_much_compressed(&rec);
 | |
| }
 |