/* Listing metadata: mariadb/storage/xtradb/trx/trx0trx.cc
   2015-12-13 16:23:02 +01:00
   2575 lines, 65 KiB, C++ */

/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
@file trx/trx0trx.cc
The transaction
Created 3/26/1996 Heikki Tuuri
*******************************************************/
#include "btr0types.h"
#include "trx0trx.h"
#ifdef UNIV_NONINL
#include "trx0trx.ic"
#endif
#include "trx0undo.h"
#include "trx0rseg.h"
#include "log0log.h"
#include "que0que.h"
#include "lock0lock.h"
#include "trx0roll.h"
#include "usr0sess.h"
#include "read0read.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "btr0sea.h"
#include "os0proc.h"
#include "trx0xa.h"
#include "trx0rec.h"
#include "trx0purge.h"
#include "ha_prototypes.h"
#include "srv0mon.h"
#include "ut0vec.h"
#include<set>
/* Declared here with C linkage; the definition is not part of this file.
NOTE(review): presumably provided by the server layer - confirm. */
extern "C"
int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
/** Set of table_id; used by trx_resurrect_table_locks() to collect each
table referenced by an undo log only once */
typedef std::set<table_id_t> table_id_set;
/** Dummy session used currently in MySQL interface; assigned to trx->sess
by trx_allocate_for_background() */
UNIV_INTERN sess_t* trx_dummy_sess = NULL;
#ifdef UNIV_PFS_MUTEX
/* Key to register trx_t::mutex with performance schema */
UNIV_INTERN mysql_pfs_key_t trx_mutex_key;
/* Key to register trx_t::undo_mutex with performance schema */
UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key;
#endif /* UNIV_PFS_MUTEX */
/*************************************************************//**
Stores a detailed error message in the transaction object. The text is
copied into trx->detailed_error; the copy is bounded by the size of that
buffer. */
UNIV_INTERN
void
trx_set_detailed_error(
/*===================*/
	trx_t*		trx,	/*!< in/out: transaction whose
				detailed_error buffer is written */
	const char*	msg)	/*!< in: message text to copy */
{
	ut_strlcpy(trx->detailed_error, msg,
		   sizeof(trx->detailed_error));
}
/*************************************************************//**
Fills the detailed error message of the transaction with text read from a
file. The read is delegated to os_file_read_string(), which (per the
original note on this function) rewinds the file before reading from it.
At most sizeof(trx->detailed_error) bytes of buffer are made available. */
UNIV_INTERN
void
trx_set_detailed_error_from_file(
/*=============================*/
	trx_t*	trx,	/*!< in/out: transaction whose detailed_error
			buffer is written */
	FILE*	file)	/*!< in: file holding the message text */
{
	os_file_read_string(file,
			    trx->detailed_error,
			    sizeof(trx->detailed_error));
}
/*************************************************************//**
Three-way comparison of two transaction ids, usable as a callback for
trx_find_descriptor().
@return -1 if *a < *b, 1 if *a > *b, 0 if both ids are equal */
UNIV_INTERN
int
trx_descr_cmp(
/*==========*/
	const void *a,	/*!< in: pointer to the first trx id */
	const void *b)	/*!< in: pointer to the second trx id */
{
	const trx_id_t	lhs = *static_cast<const trx_id_t*>(a);
	const trx_id_t	rhs = *static_cast<const trx_id_t*>(b);

	if (lhs == rhs) {
		return 0;
	}

	return (lhs < rhs) ? -1 : 1;
}
/*************************************************************//**
Reserve a slot for a given trx in the global descriptors array.

The array trx_sys->descriptors is kept sorted by ascending trx id. When it
is full, its capacity is doubled with ut_realloc(). The new id is then
placed at the end, or, if it is smaller than the current last element,
inserted at its sorted position after shifting the tail up by one slot.

The caller must hold trx_sys->mutex, unless the server is still starting
up (debug assertion). */
UNIV_INLINE
void
trx_reserve_descriptor(
/*===================*/
	const trx_t*	trx)	/*!< in: trx pointer */
{
	ulint		n_used;
	ulint		n_max;
	ulint		pos;
	trx_id_t*	descriptors;

	ut_ad(mutex_own(&trx_sys->mutex) || srv_is_being_started);
	ut_ad(srv_is_being_started ||
	      !trx_find_descriptor(trx_sys->descriptors,
				   trx_sys->descr_n_used,
				   trx->id));

	n_used = trx_sys->descr_n_used + 1;
	n_max = trx_sys->descr_n_max;

	if (UNIV_UNLIKELY(n_used > n_max)) {
		/* Out of room: double the capacity of the array. */
		n_max = n_max * 2;
		trx_sys->descriptors = static_cast<trx_id_t*>(
			ut_realloc(trx_sys->descriptors,
				   n_max * sizeof(trx_id_t)));
		trx_sys->descr_n_max = n_max;
		srv_descriptors_memory = n_max * sizeof(trx_id_t);
	}

	/* Read the base pointer only after the possible reallocation. */
	descriptors = trx_sys->descriptors;

	/* By default the new id goes into the first free slot. */
	pos = n_used - 1;

	if (UNIV_UNLIKELY(pos > 0 && trx->id < descriptors[pos - 1])) {
		/* Find the slot where it should be inserted. We could use a
		binary search, but in reality linear search should be faster,
		because the slot we are looking for is near the array end.

		The scan is index based on purpose: walking a pointer
		backwards until it is below the first element would form a
		pointer before the start of the array, which is undefined
		behaviour. */
		do {
			pos--;
		} while (pos > 0 && descriptors[pos - 1] > trx->id);

		/* Shift [pos, n_used - 1) up by one slot to open a gap. */
		ut_memmove(descriptors + pos + 1, descriptors + pos,
			   (n_used - 1 - pos) * sizeof(trx_id_t));
	}

	descriptors[pos] = trx->id;
	trx_sys->descr_n_used = n_used;
}
/*************************************************************//**
Gives back the slot of a trx in the global descriptors array and, if the
trx is still linked into trx_sys->trx_serial_list, unlinks it from there.
Does nothing more if the trx id is not present in the array. The caller
must hold trx_sys->mutex (debug assertion). */
UNIV_INTERN
void
trx_release_descriptor(
/*===================*/
	trx_t*	trx)	/*!< in: trx pointer */
{
	ut_ad(mutex_own(&trx_sys->mutex));

	if (UNIV_LIKELY(trx->in_trx_serial_list)) {
		UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list,
			       trx);
		trx->in_trx_serial_list = false;
	}

	trx_id_t*	slot = trx_find_descriptor(trx_sys->descriptors,
						   trx_sys->descr_n_used,
						   trx->id);

	if (UNIV_UNLIKELY(slot == NULL)) {
		/* This trx never reserved a descriptor. */
		return;
	}

	/* Number of ids stored after the slot being released. */
	trx_id_t*	last = trx_sys->descriptors
		+ trx_sys->descr_n_used - 1;
	const ulint	tail_bytes = (last - slot) * sizeof(trx_id_t);

	if (UNIV_LIKELY(tail_bytes > 0)) {
		/* Close the gap by moving the tail down by one slot. */
		ut_memmove(slot, slot + 1, tail_bytes);
	}

	trx_sys->descr_n_used--;
}
/****************************************************************//**
Allocates a zero-filled transaction object and initializes its mutexes,
lock heap and lock vectors. The object is returned in state
TRX_STATE_NOT_STARTED with isolation level TRX_ISO_REPEATABLE_READ; it has
to be started explicitly with trx_start_if_not_started() before use.
@return transaction instance, should never be NULL */
static
trx_t*
trx_create(void)
/*============*/
{
	trx_t*	trx = static_cast<trx_t*>(mem_zalloc(sizeof(*trx)));

	mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX);

	/* Identity and state */
	trx->magic_n = TRX_MAGIC_N;
	trx->active_commit_ordered = 0;
	trx->state = TRX_STATE_NOT_STARTED;
	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
	trx->no = TRX_ID_MAX;
	trx->in_trx_serial_list = false;

	/* Behaviour flags */
	trx->support_xa = TRUE;
	trx->fake_changes = FALSE;
	trx->check_foreigns = TRUE;
	trx->check_unique_secondary = TRUE;
	trx->dict_operation = TRX_DICT_OP_NONE;

	trx->idle_start = 0;
	trx->last_stmt_start = 0;

	mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);

	trx->error_state = DB_SUCCESS;

	/* Lock subsystem state */
	trx->lock.que_state = TRX_QUE_RUNNING;
	trx->lock.lock_heap = mem_heap_create_typed(
		256, MEM_HEAP_FOR_LOCK_HEAP);

	trx->search_latch_timeout = BTR_SEA_TIMEOUT;

	/* Per-transaction statistics counters */
	trx->io_reads = 0;
	trx->io_read = 0;
	trx->io_reads_wait_timer = 0;
	trx->lock_que_wait_timer = 0;
	trx->innodb_que_wait_timer = 0;
	trx->distinct_page_access = 0;
	trx->distinct_page_access_hash = NULL;
	trx->take_stats = FALSE;

	trx->xid.formatID = -1;
	trx->op_info = "";

	trx->api_trx = false;
	trx->api_auto_commit = false;
	trx->read_write = true;

	/* Each vector lives in a heap of its own. Remember to free the
	vector explicitly in trx_free(). */
	mem_heap_t*	autoinc_heap = mem_heap_create(
		sizeof(ib_vector_t) + sizeof(void*) * 8);
	ib_alloc_t*	autoinc_alloc = ib_heap_allocator_create(autoinc_heap);

	trx->autoinc_locks = ib_vector_create(
		autoinc_alloc, sizeof(void**), 4);

	/* Remember to free the vector explicitly in trx_free(). */
	mem_heap_t*	table_locks_heap = mem_heap_create(
		sizeof(ib_vector_t) + sizeof(void*) * 128);
	ib_alloc_t*	table_locks_alloc = ib_heap_allocator_create(
		table_locks_heap);

	trx->lock.table_locks = ib_vector_create(
		table_locks_alloc, sizeof(void**), 32);

	return(trx);
}
/********************************************************************//**
Creates a transaction object for background operations by the master
thread. The new object is attached to the dummy session trx_dummy_sess.
@return own: transaction object */
UNIV_INTERN
trx_t*
trx_allocate_for_background(void)
/*=============================*/
{
	trx_t*	new_trx = trx_create();

	new_trx->sess = trx_dummy_sess;

	return(new_trx);
}
/********************************************************************//**
Creates a transaction object for MySQL: a background transaction object
that is additionally linked at the head of trx_sys->mysql_trx_list. If
trx->take_stats is set, a zero-filled distinct-page-access hash of
DPAH_SIZE bytes is attached as well.
@return own: transaction object */
UNIV_INTERN
trx_t*
trx_allocate_for_mysql(void)
/*========================*/
{
	trx_t*	new_trx = trx_allocate_for_background();

	/* The MySQL trx list is protected by trx_sys->mutex. */
	mutex_enter(&trx_sys->mutex);

	ut_d(new_trx->in_mysql_trx_list = TRUE);
	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, new_trx);

	mutex_exit(&trx_sys->mutex);

	if (UNIV_UNLIKELY(new_trx->take_stats)) {
		byte*	page_hash = static_cast<byte *>(mem_alloc(DPAH_SIZE));

		new_trx->distinct_page_access_hash = page_hash;
		memset(new_trx->distinct_page_access_hash, 0, DPAH_SIZE);
	}

	return(new_trx);
}
/********************************************************************//**
Frees a transaction object without releasing the corresponding descriptor.
Should be used by callers that already own trx_sys->mutex.
NOTE(review): both callers visible in this file (trx_free() and
trx_free_prepared()) call this after releasing trx_sys->mutex - confirm
whether the sentence above is still accurate.

The object must already be unlinked from the ro, rw and mysql trx lists
(debug assertions). It must not be waiting for a lock, hold the search
latch or hold the dictionary operation lock, and its list of locks and
its autoinc lock vector must be empty. */
static
void
trx_free_low(
/*=========*/
trx_t* trx) /*!< in, own: trx object */
{
/* Sanity check of the magic number stored by trx_create(). */
ut_a(trx->magic_n == TRX_MAGIC_N);
ut_ad(!trx->in_ro_trx_list);
ut_ad(!trx->in_rw_trx_list);
ut_ad(!trx->in_mysql_trx_list);
mutex_free(&trx->undo_mutex);
/* The undo number array is optional; free it only if present. */
if (trx->undo_no_arr != NULL) {
trx_undo_arr_free(trx->undo_no_arr);
}
ut_a(trx->lock.wait_lock == NULL);
ut_a(trx->lock.wait_thr == NULL);
ut_a(!trx->has_search_latch);
#ifdef UNIV_SYNC_DEBUG
ut_ad(!btr_search_own_any());
#endif
ut_a(trx->dict_operation_lock_mode == 0);
if (trx->lock.lock_heap) {
mem_heap_free(trx->lock.lock_heap);
}
ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
ut_a(ib_vector_is_empty(trx->autoinc_locks));
/* We allocated a dedicated heap for the vector. */
ib_vector_free(trx->autoinc_locks);
if (trx->lock.table_locks != NULL) {
/* We allocated a dedicated heap for the vector. */
ib_vector_free(trx->lock.table_locks);
}
mutex_free(&trx->mutex);
read_view_free(trx->prebuilt_view);
/* Finally release the trx_t itself; trx must not be used after this. */
mem_free(trx);
}
/********************************************************************//**
Frees a transaction object: first releases its slot in the global
descriptors array while holding trx_sys->mutex, then destroys the object
itself with trx_free_low() after the mutex has been released. */
static
void
trx_free(
/*=========*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	/* trx_release_descriptor() requires trx_sys->mutex. */
	mutex_enter(&trx_sys->mutex);

	trx_release_descriptor(trx);

	mutex_exit(&trx_sys->mutex);

	trx_free_low(trx);
}
/********************************************************************//**
Frees a transaction object of a background operation of the master thread.
Before the object is destroyed, two inconsistent conditions are reported
to the error log without aborting: the trx still being declared inside
InnoDB, and MySQL table use/lock counters that are not zero. The trx must
be in state TRX_STATE_NOT_STARTED and must have no undo logs and no read
view attached. */
UNIV_INTERN
void
trx_free_for_background(
/*====================*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	if (trx->distinct_page_access_hash != NULL) {
		mem_free(trx->distinct_page_access_hash);
		trx->distinct_page_access_hash = NULL;
	}

	if (trx->declared_to_be_inside_innodb) {

		ib_logf(IB_LOG_LEVEL_ERROR,
			"Freeing a trx (%p, " TRX_ID_FMT ") which is declared "
			"to be processing inside InnoDB", trx, trx->id);

		trx_print(stderr, trx, 600);
		putc('\n', stderr);

		/* This is an error but not a fatal error. We must keep
		the counters like srv_conc_n_threads accurate. */
		srv_conc_force_exit_innodb(trx);
	}

	const bool	tables_still_counted
		= trx->n_mysql_tables_in_use != 0
		|| trx->mysql_n_tables_locked != 0;

	if (tables_still_counted) {

		ib_logf(IB_LOG_LEVEL_ERROR,
			"MySQL is freeing a thd though "
			"trx->n_mysql_tables_in_use is %lu and "
			"trx->mysql_n_tables_locked is %lu.",
			(ulong) trx->n_mysql_tables_in_use,
			(ulong) trx->mysql_n_tables_locked);

		trx_print(stderr, trx, 600);
		ut_print_buf(stderr, trx, sizeof(trx_t));
		putc('\n', stderr);
	}

	/* Only an idle transaction without leftovers may be freed. */
	ut_a(trx->state == TRX_STATE_NOT_STARTED);
	ut_a(trx->insert_undo == NULL);
	ut_a(trx->update_undo == NULL);
	ut_a(trx->read_view == NULL);

	trx_free(trx);
}
/********************************************************************//**
At shutdown, frees a transaction object that is in the PREPARED state.
The transaction's locks and prepared undo logs are released, the object is
unlinked from trx_sys->rw_trx_list, its descriptor slot is given back and
finally the object itself is destroyed. */
UNIV_INTERN
void
trx_free_prepared(
/*==============*/
trx_t* trx) /*!< in, own: trx object */
{
ut_a(trx_state_eq(trx, TRX_STATE_PREPARED));
ut_a(trx->magic_n == TRX_MAGIC_N);
lock_trx_release_locks(trx);
trx_undo_free_prepared(trx);
assert_trx_in_rw_list(trx);
ut_a(!trx->read_only);
/* NOTE(review): the list removal below runs before trx_sys->mutex is
taken; presumably safe because this is shutdown-only code - confirm. */
UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
ut_d(trx->in_rw_trx_list = FALSE);
/* trx_release_descriptor() asserts ownership of trx_sys->mutex. */
mutex_enter(&trx_sys->mutex);
trx_release_descriptor(trx);
mutex_exit(&trx_sys->mutex);
/* Undo trx_resurrect_table_locks(). Re-initializing the list makes it
empty, which trx_free_low() asserts. */
UT_LIST_INIT(trx->lock.trx_locks);
trx_free_low(trx);
/* trx has been freed above; only trx_sys is inspected here. */
ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
}
/********************************************************************//**
Frees a transaction object for MySQL: drops the distinct-page-access hash
if one is attached, unlinks the object from trx_sys->mysql_trx_list under
trx_sys->mutex and hands it to trx_free_for_background() for the final
checks and destruction. */
UNIV_INTERN
void
trx_free_for_mysql(
/*===============*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	if (trx->distinct_page_access_hash != NULL) {
		mem_free(trx->distinct_page_access_hash);
		trx->distinct_page_access_hash = NULL;
	}

	/* The MySQL trx list is protected by trx_sys->mutex. */
	mutex_enter(&trx_sys->mutex);

	ut_ad(trx->in_mysql_trx_list);
	ut_d(trx->in_mysql_trx_list = FALSE);

	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);

	ut_ad(trx_sys_validate_trx_list());

	mutex_exit(&trx_sys->mutex);

	trx_free_for_background(trx);
}
/****************************************************************//**
Links a recovered read-write transaction into trx_sys->rw_trx_list at its
sorted position. The list is ordered by descending trx id (biggest id
first). Only used during database startup, for the incomplete
transactions found in the undo logs. */
static
void
trx_list_rw_insert_ordered(
/*=======================*/
	trx_t*	trx)	/*!< in: trx handle */
{
	ut_ad(!trx->read_only);

	ut_d(trx->start_file = __FILE__);
	ut_d(trx->start_line = __LINE__);

	ut_a(srv_is_being_started);
	ut_ad(!trx->in_ro_trx_list);
	ut_ad(!trx->in_rw_trx_list);
	ut_ad(trx->state != TRX_STATE_NOT_STARTED);
	ut_ad(trx->is_recovered);

	/* Walk from the head until the first element whose id is not
	bigger than the new one; the new trx belongs right before it. */
	trx_t*	successor = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);

	while (successor != NULL) {

		assert_trx_in_rw_list(successor);

		if (trx->id >= successor->id) {
			/* Ids are expected to be unique. */
			ut_ad(trx->id > successor->id);
			break;
		}

		successor = UT_LIST_GET_NEXT(trx_list, successor);
	}

	if (successor == NULL) {
		/* Every listed id is bigger (or the list is empty). */
		UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx);
	} else {
		trx_t*	predecessor = UT_LIST_GET_PREV(trx_list, successor);

		if (predecessor == NULL) {
			UT_LIST_ADD_FIRST(
				trx_list, trx_sys->rw_trx_list, trx);
		} else {
			UT_LIST_INSERT_AFTER(
				trx_list, trx_sys->rw_trx_list,
				predecessor, trx);
		}
	}

#ifdef UNIV_DEBUG
	if (trx->id > trx_sys->rw_max_trx_id) {
		trx_sys->rw_max_trx_id = trx->id;
	}
#endif /* UNIV_DEBUG */

	ut_ad(!trx->in_rw_trx_list);
	ut_d(trx->in_rw_trx_list = TRUE);
}
/****************************************************************//**
Resurrect the table locks for a resurrected transaction.
The undo log is scanned from its top record backwards and the id of every
table referenced by an undo record is collected. For each collected table
that can be opened, lock_table_ix_resurrect() is called; tables whose .ibd
file is missing and temporary tables are removed from the dictionary cache
instead. Nothing is done for a transaction in state
TRX_STATE_COMMITTED_IN_MEMORY or for an empty undo log. */
static
void
trx_resurrect_table_locks(
/*======================*/
trx_t* trx, /*!< in/out: transaction */
const trx_undo_t* undo) /*!< in: undo log */
{
mtr_t mtr;
page_t* undo_page;
trx_undo_rec_t* undo_rec;
table_id_set tables;
ut_ad(undo == trx->insert_undo || undo == trx->update_undo);
if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)
|| undo->empty) {
return;
}
mtr_start(&mtr);
/* trx_rseg_mem_create() may have acquired an X-latch on this
page, so we cannot acquire an S-latch. */
undo_page = trx_undo_page_get(
undo->space, undo->zip_size, undo->top_page_no, &mtr);
undo_rec = undo_page + undo->top_offset;
/* Phase 1: walk the undo records from newest to oldest and remember
the table id of each one. */
do {
ulint type;
ulint cmpl_info;
bool updated_extern;
undo_no_t undo_no;
table_id_t table_id;
page_t* undo_rec_page = page_align(undo_rec);
/* When the scan has moved on to another page, release the latch on
the page that was processed before. */
if (undo_rec_page != undo_page) {
if (!mtr_memo_release(&mtr,
buf_block_align(undo_page),
MTR_MEMO_PAGE_X_FIX)) {
/* The page of the previous undo_rec
should have been latched by
trx_undo_page_get() or
trx_undo_get_prev_rec(). */
ut_ad(0);
}
undo_page = undo_rec_page;
}
/* Only table_id is used below; the other outputs are ignored. */
trx_undo_rec_get_pars(
undo_rec, &type, &cmpl_info,
&updated_extern, &undo_no, &table_id);
tables.insert(table_id);
undo_rec = trx_undo_get_prev_rec(
undo_rec, undo->hdr_page_no,
undo->hdr_offset, false, &mtr);
} while (undo_rec);
mtr_commit(&mtr);
/* Phase 2: the mini-transaction is committed; take the table locks. */
for (table_id_set::const_iterator i = tables.begin();
i != tables.end(); i++) {
if (dict_table_t* table = dict_table_open_on_id(
*i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
if (table->ibd_file_missing
|| dict_table_is_temporary(table)) {
/* The table cannot be used: close it and evict it from the
dictionary cache while holding dict_sys->mutex. */
mutex_enter(&dict_sys->mutex);
dict_table_close(table, TRUE, FALSE);
dict_table_remove_from_cache(table);
mutex_exit(&dict_sys->mutex);
continue;
}
lock_table_ix_resurrect(table, trx);
DBUG_PRINT("ib_trx",
("resurrect" TRX_ID_FMT
" table '%s' IX lock from %s undo",
trx->id, table->name,
undo == trx->insert_undo
? "insert" : "update"));
dict_table_close(table, FALSE, FALSE);
}
}
}
/****************************************************************//**
Builds a transaction object for an insert undo log found at startup.
Transactions that were doing inserts at the time of the crash need to be
undone. The state of the new object is derived from the state of the undo
log: an active log gives an ACTIVE trx, a prepared log gives a PREPARED trx
(or an ACTIVE one when innodb_force_recovery > 0), anything else gives a
trx that is COMMITTED_IN_MEMORY.
@return trx_t instance */
static
trx_t*
trx_resurrect_insert(
/*=================*/
	trx_undo_t*	undo,	/*!< in: entry to UNDO */
	trx_rseg_t*	rseg)	/*!< in: rollback segment */
{
	trx_t*	trx = trx_allocate_for_background();

	trx->rseg = rseg;
	trx->xid = undo->xid;
	trx->id = undo->trx_id;
	trx->insert_undo = undo;
	trx->is_recovered = TRUE;

	/* This is single-threaded startup code, we do not need the
	protection of trx->mutex or trx_sys->mutex here. */

	if (undo->state == TRX_UNDO_ACTIVE) {

		trx->state = TRX_STATE_ACTIVE;

		/* A running transaction always has the number
		field inited to TRX_ID_MAX */
		trx->no = TRX_ID_MAX;
	} else {
		if (undo->state != TRX_UNDO_PREPARED) {

			trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
		} else {
			/* Prepared transactions are left in the prepared
			state waiting for a commit or abort decision from
			MySQL */
			fprintf(stderr,
				"InnoDB: Transaction " TRX_ID_FMT " was in the"
				" XA prepared state.\n", trx->id);

			if (srv_force_recovery != 0) {
				fprintf(stderr,
					"InnoDB: Since innodb_force_recovery"
					" > 0, we will rollback it anyway.\n");

				trx->state = TRX_STATE_ACTIVE;
			} else {
				trx->state = TRX_STATE_PREPARED;
				trx_sys->n_prepared_trx++;
				trx_sys->n_prepared_recovered_trx++;
			}
		}

		/* We give a dummy value for the trx no; this should have no
		relevance since purge is not interested in committed
		transaction numbers, unless they are in the history
		list, in which case it looks the number from the disk based
		undo log structure */
		trx->no = trx->id;
	}

	/* trx_start_low() is not called for a resurrected trx, so the
	start time has to be initialized here. */
	const bool	still_open = trx->state == TRX_STATE_ACTIVE
		|| trx->state == TRX_STATE_PREPARED;

	if (still_open) {
		trx->start_time = ut_time();
	}

	if (undo->dict_operation) {
		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
		trx->table_id = undo->table_id;
	}

	if (!undo->empty) {
		/* Continue numbering after the newest undo record. */
		trx->undo_no = undo->top_undo_no + 1;
	}

	return(trx);
}
/****************************************************************//**
Sets the state of a resurrected transaction whose update undo log is not
active. Prepared transactions are left in the prepared state waiting for a
commit or abort decision from MySQL, unless innodb_force_recovery > 0, in
which case they become ACTIVE so that they get rolled back. Any other undo
state makes the transaction COMMITTED_IN_MEMORY. */
static
void
trx_resurrect_update_in_prepared_state(
/*===================================*/
	trx_t*			trx,	/*!< in,out: transaction */
	const trx_undo_t*	undo)	/*!< in: update UNDO record */
{
	/* This is single-threaded startup code, we do not need the
	protection of trx->mutex or trx_sys->mutex here. */

	if (undo->state != TRX_UNDO_PREPARED) {
		trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
		return;
	}

	fprintf(stderr,
		"InnoDB: Transaction " TRX_ID_FMT
		" was in the XA prepared state.\n", trx->id);

	if (srv_force_recovery != 0) {
		fprintf(stderr,
			"InnoDB: Since innodb_force_recovery"
			" > 0, we will rollback it anyway.\n");

		trx->state = TRX_STATE_ACTIVE;
		return;
	}

	/* Count the transaction as prepared only once: if its state is
	no longer NOT_STARTED, it must already be PREPARED. */
	if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
		trx_sys->n_prepared_trx++;
		trx_sys->n_prepared_recovered_trx++;
	} else {
		ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
	}

	trx->state = TRX_STATE_PREPARED;
}
/****************************************************************//**
Attaches an update undo log found at startup to a transaction object.
Transactions that were doing updates at the time of the crash need to be
undone. The trx may be a freshly allocated object or one that was already
resurrected from an insert undo log. */
static
void
trx_resurrect_update(
/*=================*/
	trx_t*		trx,	/*!< in/out: transaction */
	trx_undo_t*	undo,	/*!< in/out: update UNDO record */
	trx_rseg_t*	rseg)	/*!< in/out: rollback segment */
{
	trx->rseg = rseg;
	trx->xid = undo->xid;
	trx->id = undo->trx_id;
	trx->update_undo = undo;
	trx->is_recovered = TRUE;

	/* This is single-threaded startup code, we do not need the
	protection of trx->mutex or trx_sys->mutex here. */

	if (undo->state == TRX_UNDO_ACTIVE) {

		trx->state = TRX_STATE_ACTIVE;

		/* A running transaction always has the number field inited
		to TRX_ID_MAX */
		trx->no = TRX_ID_MAX;
	} else {
		trx_resurrect_update_in_prepared_state(trx, undo);

		/* We give a dummy value for the trx number */
		trx->no = trx->id;
	}

	/* trx_start_low() is not called for a resurrected trx, so the
	start time has to be initialized here. */
	const bool	still_open = trx->state == TRX_STATE_ACTIVE
		|| trx->state == TRX_STATE_PREPARED;

	if (still_open) {
		trx->start_time = ut_time();
	}

	if (undo->dict_operation) {
		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
		trx->table_id = undo->table_id;
	}

	/* Advance the next undo number only if this log reaches further
	than what the trx already knows about. */
	if (!undo->empty && undo->top_undo_no >= trx->undo_no) {
		trx->undo_no = undo->top_undo_no + 1;
	}
}
/****************************************************************//**
Creates trx objects for transactions and initializes the trx lists of
trx_sys at database start. Rollback segment and undo log lists must
already exist when this function is called, because the lists of
transactions to be rolled back or cleaned up are built based on the
undo log lists. */
UNIV_INTERN
void
trx_lists_init_at_db_start(void)
/*============================*/
{
	ut_a(srv_is_being_started);

	UT_LIST_INIT(trx_sys->ro_trx_list);
	UT_LIST_INIT(trx_sys->rw_trx_list);
	UT_LIST_INIT(trx_sys->trx_serial_list);

	/* Look from the rollback segments if there exist undo logs for
	transactions */

	for (ulint slot = 0; slot < TRX_SYS_N_RSEGS; ++slot) {

		trx_rseg_t*	rseg = trx_sys->rseg_array[slot];

		if (rseg == NULL) {
			continue;
		}

		/* Resurrect transactions that were doing inserts. */
		for (trx_undo_t* undo
			     = UT_LIST_GET_FIRST(rseg->insert_undo_list);
		     undo != NULL;
		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {

			trx_t*	trx = trx_resurrect_insert(undo, rseg);

			/* Only open transactions occupy a descriptor. */
			if (trx->state == TRX_STATE_ACTIVE ||
			    trx->state == TRX_STATE_PREPARED) {

				trx_reserve_descriptor(trx);
			}

			trx_list_rw_insert_ordered(trx);

			trx_resurrect_table_locks(trx, undo);
		}

		/* Resurrect transactions that were doing updates. */
		for (trx_undo_t* undo
			     = UT_LIST_GET_FIRST(rseg->update_undo_list);
		     undo != NULL;
		     undo = UT_LIST_GET_NEXT(undo_list, undo)) {

			/* The trx may already be in trx_sys->rw_trx_list
			because of an insert undo log; check that first. */
			mutex_enter(&trx_sys->mutex);

			trx_t*	trx = trx_get_rw_trx_by_id(undo->trx_id);

			mutex_exit(&trx_sys->mutex);

			const bool	is_new_trx = (trx == NULL);

			if (is_new_trx) {
				trx = trx_allocate_for_background();
			}

			trx_resurrect_update(trx, undo, rseg);

			if (is_new_trx) {
				if (trx->state == TRX_STATE_ACTIVE ||
				    trx->state == TRX_STATE_PREPARED) {

					trx_reserve_descriptor(trx);
				}

				trx_list_rw_insert_ordered(trx);
			}

			trx_resurrect_table_locks(trx, undo);
		}
	}
}
/******************************************************************//**
Assigns a rollback segment to a transaction in a round-robin fashion.
The starting slot is taken from a function-local static counter modulo
max_undo_logs; from there the rollback segment array is scanned forward,
wrapping to slot 0 when an empty slot is met.
@return assigned rollback segment instance, or NULL in read-only mode or
when slot 0 of the rollback segment array is empty */
static
trx_rseg_t*
trx_assign_rseg_low(
/*================*/
ulong max_undo_logs, /*!< in: maximum number of UNDO logs to use */
ulint n_tablespaces) /*!< in: number of rollback tablespaces */
{
ulint i;
trx_rseg_t* rseg;
/* Shared by all callers; it is read and incremented below without any
synchronization visible in this function. */
static ulint latest_rseg = 0;
if (srv_read_only_mode) {
ut_a(max_undo_logs == ULONG_UNDEFINED);
return(NULL);
}
/* This breaks true round robin but that should be OK. */
ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS);
i = latest_rseg++;
i %= max_undo_logs;
/* Note: The assumption here is that there can't be any gaps in
the array. Once we implement more flexible rollback segment
management this may not hold. If even the first slot is empty,
there is no rollback segment to hand out and NULL is returned. */
if (trx_sys->rseg_array[0] == NULL) {
return(NULL);
}
/* Skip the system tablespace if we have more than one tablespace
defined for rollback segments. We want all UNDO records to be in
the non-system tablespaces. A segment in space 0 is skipped only when
n_tablespaces > 0 and slot 1 of the array is in use.
NOTE(review): i is advanced without a check against TRX_SYS_N_RSEGS;
confirm that an empty slot or a segment outside space 0 is always
reached before the end of the array. */
do {
rseg = trx_sys->rseg_array[i];
ut_a(rseg == NULL || i == rseg->id);
/* An empty slot restarts the scan at slot 0. */
i = (rseg == NULL) ? 0 : i + 1;
} while (rseg == NULL
|| (rseg->space == 0
&& n_tablespaces > 0
&& trx_sys->rseg_array[1] != NULL));
return(rseg);
}
/****************************************************************//**
Gives a rollback segment to a read-only transaction that is attempting to
write to a TEMPORARY table. The transaction must not have a rollback
segment yet, must be read-only and must not be an autocommit non-locking
transaction; the server must not run in read-only mode. */
UNIV_INTERN
void
trx_assign_rseg(
/*============*/
	trx_t*	trx)	/*!< in/out: a read-only transaction that
			needs to be assigned a RBS */
{
	/* Preconditions */
	ut_a(trx->rseg == 0);
	ut_a(trx->read_only);
	ut_a(!srv_read_only_mode);
	ut_a(!trx_is_autocommit_non_locking(trx));

	trx->rseg = trx_assign_rseg_low(srv_undo_logs,
					srv_undo_tablespaces);
}
/****************************************************************//**
Starts a transaction.
Decides whether the transaction is auto-commit and/or read-only, assigns a
rollback segment to read-write transactions, and then, under
trx_sys->mutex, moves the transaction to TRX_STATE_ACTIVE, gives it a new
trx id and links it into the ro or rw trx list. Read-write transactions
also reserve a slot in the global descriptors array. */
static
void
trx_start_low(
/*==========*/
trx_t* trx) /*!< in: transaction */
{
ut_ad(trx->rseg == NULL);
ut_ad(trx->start_file != 0);
ut_ad(trx->start_line != 0);
ut_ad(!trx->is_recovered);
ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
/* Check whether it is an AUTOCOMMIT SELECT */
trx->auto_commit = (trx->api_trx && trx->api_auto_commit)
|| thd_trx_is_auto_commit(trx->mysql_thd);
/* Read-only if the API asked for it, if the session says so (and this
is not DDL), or if the whole server runs in read-only mode. */
trx->read_only =
(trx->api_trx && !trx->read_write)
|| (!trx->ddl && thd_trx_is_read_only(trx->mysql_thd))
|| srv_read_only_mode;
/* A transaction that is not auto-commit is counted as one that will
lock; an auto-commit one with will_lock == 0 is treated as read-only. */
if (!trx->auto_commit) {
++trx->will_lock;
} else if (trx->will_lock == 0) {
trx->read_only = TRUE;
}
if (!trx->read_only) {
trx->rseg = trx_assign_rseg_low(
srv_undo_logs, srv_undo_tablespaces);
}
/* The initial value for trx->no: TRX_ID_MAX is used in
read_view_open_now: */
trx->no = TRX_ID_MAX;
ut_a(ib_vector_is_empty(trx->autoinc_locks));
ut_a(ib_vector_is_empty(trx->lock.table_locks));
mutex_enter(&trx_sys->mutex);
/* If this transaction came from trx_allocate_for_mysql(),
trx->in_mysql_trx_list would hold. In that case, the trx->state
change must be protected by the trx_sys->mutex, so that
lock_print_info_all_transactions() will have a consistent view. */
trx->state = TRX_STATE_ACTIVE;
trx->id = trx_sys_get_new_trx_id();
/* Cache the state of fake_changes that transaction will use for
lifetime. Any change in session/global fake_changes configuration during
lifetime of transaction will not be honored by already started
transaction. */
trx->fake_changes = thd_fake_changes(trx->mysql_thd);
ut_ad(!trx->in_rw_trx_list);
ut_ad(!trx->in_ro_trx_list);
if (trx->read_only) {
/* Note: The trx_sys_t::ro_trx_list doesn't really need to
be ordered, we should exploit this using a list type that
doesn't need a list wide lock to increase concurrency. */
/* Autocommit non-locking transactions are kept out of both lists. */
if (!trx_is_autocommit_non_locking(trx)) {
UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx);
ut_d(trx->in_ro_trx_list = TRUE);
}
} else {
ut_ad(trx->rseg != NULL
|| srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
ut_ad(!trx_is_autocommit_non_locking(trx));
UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
ut_d(trx->in_rw_trx_list = TRUE);
#ifdef UNIV_DEBUG
if (trx->id > trx_sys->rw_max_trx_id) {
trx_sys->rw_max_trx_id = trx->id;
}
#endif /* UNIV_DEBUG */
/* Must be called while trx_sys->mutex is still held. */
trx_reserve_descriptor(trx);
}
ut_ad(trx_sys_validate_trx_list());
mutex_exit(&trx_sys->mutex);
/* The start time is recorded after trx_sys->mutex has been released. */
trx->start_time = ut_time();
MONITOR_INC(MONITOR_TRX_ACTIVE);
}
/****************************************************************//**
Set the transaction serialisation number.
A new id is drawn under trx_sys->mutex and stored in trx->no, and the
transaction is appended to trx_sys->trx_serial_list if it is not in it
yet. If the rollback segment of the transaction is empty, the segment is
also pushed onto the purge system's binary heap together with the new
number. The caller must hold the mutex of trx->rseg (debug assertion). */
static
void
trx_serialisation_number_get(
/*=========================*/
trx_t* trx) /*!< in: transaction */
{
trx_rseg_t* rseg;
rseg = trx->rseg;
ut_ad(mutex_own(&rseg->mutex));
mutex_enter(&trx_sys->mutex);
trx->no = trx_sys_get_new_trx_id();
if (UNIV_LIKELY(!trx->in_trx_serial_list)) {
UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list,
trx);
trx->in_trx_serial_list = true;
}
/* If the rollback segment is not empty then the
new trx_t::no can't be less than any trx_t::no
already in the rollback segment. User threads only
produce events when a rollback segment is empty. */
if (rseg->last_page_no == FIL_NULL) {
void* ptr;
rseg_queue_t rseg_queue;
rseg_queue.rseg = rseg;
rseg_queue.trx_no = trx->no;
/* purge_sys->bh_mutex is acquired before trx_sys->mutex is released,
so the two critical sections overlap. */
mutex_enter(&purge_sys->bh_mutex);
/* This is to reduce the pressure on the trx_sys_t::mutex
though in reality it should make very little (read no)
difference because this code path is only taken when the
rbs is empty. */
mutex_exit(&trx_sys->mutex);
ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
/* The push is required to succeed. */
ut_a(ptr);
mutex_exit(&purge_sys->bh_mutex);
} else {
mutex_exit(&trx_sys->mutex);
}
}
/****************************************************************//**
Assign the transaction its history serialisation number and write the
update UNDO log record to the assigned rollback segment.
The serialisation number is assigned only when the transaction has an
update undo log. The state of the insert undo log, if any, is finished as
well, and the MySQL binlog name and offset are written to the trx sys
header when a binlog file name is set on the transaction. */
static __attribute__((nonnull))
void
trx_write_serialisation_history(
/*============================*/
trx_t* trx, /*!< in/out: transaction */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
trx_rseg_t* rseg;
rseg = trx->rseg;
/* Change the undo log segment states from TRX_UNDO_ACTIVE
to some other state: these modifications to the file data
structure define the transaction as committed in the file
based domain, at the serialization point of the log sequence
number lsn obtained below. */
if (trx->update_undo != NULL) {
page_t* undo_hdr_page;
trx_undo_t* undo = trx->update_undo;
/* We have to hold the rseg mutex because update
log headers have to be put to the history list in the
(serialisation) order of the UNDO trx number. This is
required for the purge in-memory data structures too. */
mutex_enter(&rseg->mutex);
/* Assign the transaction serialisation number and also
update the purge min binary heap if this is the first
UNDO log being written to the assigned rollback segment. */
trx_serialisation_number_get(trx);
/* It is not necessary to obtain trx->undo_mutex here
because only a single OS thread is allowed to do the
transaction commit for this transaction. */
undo_hdr_page = trx_undo_set_state_at_finish(undo, mtr);
trx_undo_update_cleanup(trx, undo_hdr_page, mtr);
} else {
/* No update undo log: the rseg mutex is still taken, so that both
branches reach the code below with the mutex held. */
mutex_enter(&rseg->mutex);
}
if (trx->insert_undo != NULL) {
trx_undo_set_state_at_finish(trx->insert_undo, mtr);
}
/* Releases the rseg mutex acquired in either branch above. */
mutex_exit(&rseg->mutex);
MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
/* Update the latest MySQL binlog name and offset info
in trx sys header if MySQL binlogging is on or the database
server is a MySQL replication slave */
if (trx->mysql_log_file_name
&& trx->mysql_log_file_name[0] != '\0') {
trx_sys_update_mysql_binlog_offset(
trx->mysql_log_file_name,
trx->mysql_log_offset,
TRX_SYS_MYSQL_LOG_INFO, mtr);
/* Clear the name so that the same position is not written again. */
trx->mysql_log_file_name = NULL;
}
}
/********************************************************************
Finalizes a transaction's updates for one FTS table: hands the list of
added document ids over to the table's add work queue, unless the table's
background threads have been told to stop. After a successful hand-over
the fts_trx_table_t no longer owns the list. */
static __attribute__((nonnull))
void
trx_finalize_for_fts_table(
/*=======================*/
	fts_trx_table_t*	ftt)	/* in: FTS trx table */
{
	fts_t*		fts = ftt->table->fts;
	fts_doc_ids_t*	added_ids = ftt->added_doc_ids;

	/* The status flags are read under bg_threads_mutex; the mutex is
	released again before anything is queued. */
	mutex_enter(&fts->bg_threads_mutex);

	const bool	bg_stopping
		= (fts->fts_status & BG_THREAD_STOP) != 0;

	mutex_exit(&fts->bg_threads_mutex);

	if (bg_stopping) {
		/* The table is about to be dropped, no use
		adding anything to its work queue. */
		return;
	}

	ut_a(fts->add_wq);

	mem_heap_t*	ids_heap = static_cast<mem_heap_t*>(
		added_ids->self_heap->arg);

	ib_wqueue_add(fts->add_wq, added_ids, ids_heap);

	/* fts_trx_table_t no longer owns the list. */
	ftt->added_doc_ids = NULL;
}
/******************************************************************//**
Finalizes a transaction containing updates to FTS tables. On commit, every
table recorded in the last savepoint that has added document ids is
finalized with trx_finalize_for_fts_table(). In both the commit and the
rollback case the FTS state of the transaction is freed afterwards and
trx->fts_trx is reset to NULL. */
static __attribute__((nonnull))
void
trx_finalize_for_fts(
/*=================*/
	trx_t*	trx,		/*!< in/out: transaction */
	bool	is_commit)	/*!< in: true if the transaction was
				committed, false if it was rolled back. */
{
	if (is_commit) {
		fts_savepoint_t*	last_savepoint
			= static_cast<fts_savepoint_t*>(
				ib_vector_last(trx->fts_trx->savepoints));

		ib_rbt_t*		tables = last_savepoint->tables;
		const ib_rbt_node_t*	node = rbt_first(tables);

		while (node) {
			fts_trx_table_t**	entry
				= rbt_value(fts_trx_table_t*, node);

			if ((*entry)->added_doc_ids) {
				trx_finalize_for_fts_table(*entry);
			}

			node = rbt_next(tables, node);
		}
	}

	fts_trx_free(trx->fts_trx);
	trx->fts_trx = NULL;
}
/**********************************************************************//**
Writes and/or flushes the redo log up to the given lsn, as requested by
the effective value of innodb_flush_log_at_trx_commit. The value is taken
from the global setting when srv_use_global_flush_log_at_trx_commit is
set, otherwise from the session of the transaction:
0	nothing is done;
1, 3	the log is written, and flushed to disk unless the file flush
	method is SRV_UNIX_NOSYNC;
2	the log is written but not flushed to disk.
Any other value is a fatal error. */
static
void
trx_flush_log_if_needed_low(
/*========================*/
	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
			flushed. */
	trx_t*	trx)	/*!< in: transaction */
{
	ulint	mode;

	if (srv_use_global_flush_log_at_trx_commit) {
		mode = thd_flush_log_at_trx_commit(NULL);
	} else {
		mode = thd_flush_log_at_trx_commit(trx->mysql_thd);
	}

	if (mode == 0) {
		/* Do nothing */
		return;
	}

	if (mode == 1 || mode == 3) {
		/* Write the log and optionally flush it to disk */
		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
				srv_unix_file_flush_method != SRV_UNIX_NOSYNC);
		return;
	}

	if (mode == 2) {
		/* Write the log but do not flush it to disk */
		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
		return;
	}

	/* Unknown setting */
	ut_error;
}
/**********************************************************************//**
Wrapper around trx_flush_log_if_needed_low() that sets trx->op_info to
"flushing log" for the duration of the call and clears it afterwards.
Whether the log is actually written or flushed depends on the value of
innodb_flush_log_at_trx_commit. */
static __attribute__((nonnull))
void
trx_flush_log_if_needed(
/*====================*/
	lsn_t	lsn,	/*!< in: lsn up to which logs are to be
			flushed. */
	trx_t*	trx)	/*!< in/out: transaction */
{
	/* Publish what this transaction is busy with. */
	trx->op_info = "flushing log";

	trx_flush_log_if_needed_low(lsn, trx);

	trx->op_info = "";
}
/****************************************************************//**
Commits a transaction in memory.
For an autocommit non-locking read-only transaction only the state is reset
and the read view is removed. For every other transaction
lock_trx_release_locks() is called, the transaction is unlinked from
trx_sys->ro_trx_list or trx_sys->rw_trx_list and its state is reset, all
under trx_sys->mutex. If lsn is non-zero the insert undo log is cleaned up
and the redo log is written/flushed (or the flush is deferred) as
configured. Finally the savepoints are freed and the per-transaction fields
are reset. */
static __attribute__((nonnull))
void
trx_commit_in_memory(
/*=================*/
trx_t* trx, /*!< in/out: transaction */
lsn_t lsn) /*!< in: log sequence number of the mini-transaction
commit of trx_write_serialisation_history(), or 0
if the transaction did not modify anything */
{
/* Reset the deferred-flush request; it is set again below if
trx->flush_log_later is set. */
trx->must_flush_log_later = FALSE;
if (trx_is_autocommit_non_locking(trx)) {
ut_ad(trx->read_only);
ut_a(!trx->is_recovered);
ut_ad(trx->rseg == NULL);
ut_ad(!trx->in_ro_trx_list);
ut_ad(!trx->in_rw_trx_list);
/* Note: We are asserting without holding the lock mutex. But
that is OK because this transaction is not waiting and cannot
be rolled back and no new locks can (or should not) be added
because it is flagged as a non-locking read-only transaction. */
ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
/* This state change is not protected by any mutex, therefore
there is an inherent race here around state transition during
printouts. We ignore this race for the sake of efficiency.
However, the trx_sys_t::mutex will protect the trx_t instance
and it cannot be removed from the mysql_trx_list and freed
without first acquiring the trx_sys_t::mutex. */
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
trx->state = TRX_STATE_NOT_STARTED;
/* The second argument is false here and true in the branch
below, where trx_sys->mutex is already held. */
read_view_remove(trx->global_read_view, false);
MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
} else {
lock_trx_release_locks(trx);
/* Remove the transaction from the list of active
transactions now that it no longer holds any user locks. */
ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
mutex_enter(&trx_sys->mutex);
assert_trx_in_list(trx);
if (trx->read_only) {
UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx);
ut_d(trx->in_ro_trx_list = FALSE);
MONITOR_INC(MONITOR_TRX_RO_COMMIT);
} else {
UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
ut_d(trx->in_rw_trx_list = FALSE);
ut_ad(trx_sys->descr_n_used <=
UT_LIST_GET_LEN(trx_sys->rw_trx_list));
MONITOR_INC(MONITOR_TRX_RW_COMMIT);
}
/* If this transaction came from trx_allocate_for_mysql(),
trx->in_mysql_trx_list would hold. In that case, the
trx->state change must be protected by trx_sys->mutex, so that
lock_print_info_all_transactions() will have a consistent
view. */
trx->state = TRX_STATE_NOT_STARTED;
/* We already own the trx_sys_t::mutex, by doing it here we
avoid a potential context switch later. */
read_view_remove(trx->global_read_view, true);
ut_ad(trx_sys_validate_trx_list());
mutex_exit(&trx_sys->mutex);
}
/* Forget both read view pointers. The NULL test is redundant: the
pointer ends up NULL either way. */
if (trx->global_read_view != NULL) {
trx->global_read_view = NULL;
}
trx->read_view = NULL;
/* A non-zero lsn means the transaction wrote something. */
if (lsn) {
ulint flush_log_at_trx_commit;
if (trx->insert_undo != NULL) {
trx_undo_insert_cleanup(trx);
}
/* Same global-versus-session selection as in
trx_flush_log_if_needed_low(). */
if (srv_use_global_flush_log_at_trx_commit) {
flush_log_at_trx_commit = thd_flush_log_at_trx_commit(NULL);
} else {
flush_log_at_trx_commit = thd_flush_log_at_trx_commit(trx->mysql_thd);
}
/* NOTE that we could possibly make a group commit more
efficient here: call os_thread_yield here to allow also other
trxs to come to commit! */
/*-------------------------------------*/
/* Depending on the my.cnf options, we may now write the log
buffer to the log files, making the transaction durable if
the OS does not crash. We may also flush the log files to
disk, making the transaction durable also at an OS crash or a
power outage.
The idea in InnoDB's group commit is that a group of
transactions gather behind a trx doing a physical disk write
to log files, and when that physical write has been completed,
one of those transactions does a write which commits the whole
group. Note that this group commit will only bring benefit if
there are > 2 users in the database. Then at least 2 users can
gather behind one doing the physical log write to disk.
If we are calling trx_commit() under prepare_commit_mutex, we
will delay possible log write and flush to a separate function
trx_commit_complete_for_mysql(), which is only called when the
thread has released the mutex. This is to make the
group commit algorithm to work. Otherwise, the prepare_commit
mutex would serialize all commits and prevent a group of
transactions from gathering. */
if (trx->flush_log_later) {
/* Do nothing yet; trx_commit_complete_for_mysql() checks
this flag. */
trx->must_flush_log_later = TRUE;
} else if (flush_log_at_trx_commit == 0
|| thd_requested_durability(trx->mysql_thd)
== HA_IGNORE_DURABILITY) {
/* Do nothing */
} else {
trx_flush_log_if_needed(lsn, trx);
}
/* Remembered for a deferred flush in
trx_commit_complete_for_mysql(). */
trx->commit_lsn = lsn;
/* Tell server some activity has happened, since the trx
has changed something. Background utility threads like
master thread, purge thread or page_cleaner thread might
have some work to do. */
srv_active_wake_master_thread();
}
/* undo_no is non-zero if we're doing the final commit. It must be
sampled here because trx->undo_no is reset to 0 a few lines below. */
bool not_rollback = trx->undo_no != 0;
/* Free all savepoints, starting from the first. */
trx_named_savept_t* savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
trx_roll_savepoints_free(trx, savep);
trx->rseg = NULL;
trx->undo_no = 0;
trx->last_sql_stat_start.least_undo_no = 0;
trx->ddl = false;
#ifdef UNIV_DEBUG
/* Debug builds require that the start location was recorded when the
transaction was started; it is cleared for the next start. */
ut_ad(trx->start_file != 0);
ut_ad(trx->start_line != 0);
trx->start_file = 0;
trx->start_line = 0;
#endif /* UNIV_DEBUG */
trx->will_lock = 0;
trx->read_only = FALSE;
trx->auto_commit = FALSE;
if (trx->fts_trx) {
trx_finalize_for_fts(trx, not_rollback);
}
ut_ad(trx->lock.wait_thr == NULL);
ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
ut_ad(!trx->in_ro_trx_list);
ut_ad(!trx->in_rw_trx_list);
trx->dict_operation = TRX_DICT_OP_NONE;
trx->error_state = DB_SUCCESS;
/* trx->in_mysql_trx_list would hold between
trx_allocate_for_mysql() and trx_free_for_mysql(). It does not
hold for recovered transactions or system transactions. */
}
/****************************************************************//**
Commits a transaction together with the mini-transaction that records the
commit. If the transaction carries FTS state and has a non-zero undo_no,
fts_commit() is run first. When mtr is given, the serialisation history is
written and the mini-transaction committed; its end lsn is then handed to
trx_commit_in_memory(). Without an mtr, lsn 0 is passed. */
UNIV_INTERN
void
trx_commit_low(
/*===========*/
	trx_t*	trx,	/*!< in/out: transaction */
	mtr_t*	mtr)	/*!< in/out: mini-transaction (will be committed),
			or NULL if trx made no modifications */
{
	lsn_t	commit_lsn = 0;

	assert_trx_nonlocking_or_in_list(trx);
	ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
	ut_ad(!mtr || mtr->state == MTR_ACTIVE);
	/* An mtr is supplied exactly when the transaction has undo logs. */
	ut_ad(!mtr == !(trx->insert_undo || trx->update_undo));

	/* undo_no is non-zero if we're doing the final commit. */
	if (trx->fts_trx && trx->undo_no != 0) {
		ut_a(!trx_is_autocommit_non_locking(trx));

		dberr_t	fts_err = fts_commit(trx);

		/* FTS-FIXME: DB_DUPLICATE_KEY is temporarily tolerated
		instead of dying. It is a possible scenario if there is a
		crash between the insert to the DELETED table committing
		and the transaction committing. Once this function can
		return a value, any other error should be reported to the
		caller rather than being fatal. */
		switch (fts_err) {
		case DB_SUCCESS:
		case DB_DUPLICATE_KEY:
			break;
		default:
			ut_error;
		}
	}

	if (mtr != NULL) {
		trx_write_serialisation_history(trx, mtr);

		/* Committing the mini-transaction makes the whole
		transaction committed in the file-based world, at this log
		sequence number. The transaction becomes 'durable' when the
		log is written to disk, but logically the commit in the
		file-based data structures (undo logs etc.) happens here.

		NOTE that transaction numbers, which are assigned only to
		transactions with an update undo log, do not necessarily
		come in exactly the same order as commit lsn's, if the
		transactions have different rollback segments. To get
		exactly the same order we should hold the kernel mutex up
		to this point, adding to the contention of the kernel
		mutex. However, if a transaction T2 is able to see
		modifications made by a transaction T1, T2 will always get
		a bigger transaction number and a bigger commit lsn than
		T1. */

		/*--------------*/
		mtr_commit(mtr);
		/*--------------*/

		commit_lsn = mtr->end_lsn;
	}

	trx_commit_in_memory(trx, commit_lsn);
}
/****************************************************************//**
Commits a transaction. A local mini-transaction is started and handed to
trx_commit_low() only when the transaction owns an insert or update undo
log; otherwise trx_commit_low() is called with a NULL mini-transaction. */
UNIV_INTERN
void
trx_commit(
/*=======*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	mtr_t	local_mtr;
	mtr_t*	mtr = NULL;

	if (trx->insert_undo || trx->update_undo) {
		mtr = &local_mtr;
		mtr_start(mtr);
	}

	trx_commit_low(trx, mtr);
}
/****************************************************************//**
Cleans up a recovered transaction at database startup. This is needed when
the transaction had already got to the middle of a commit at the time of
the crash, so that it cannot be rolled back. The transaction is unlinked
from trx_sys->rw_trx_list and reset to TRX_STATE_NOT_STARTED. */
UNIV_INTERN
void
trx_cleanup_at_db_startup(
/*======================*/
	trx_t*	trx)	/*!< in: transaction */
{
	ut_ad(trx->is_recovered);

	if (trx->insert_undo) {
		trx_undo_insert_cleanup(trx);
	}

	trx->rseg = NULL;
	trx->undo_no = 0;
	trx->last_sql_stat_start.least_undo_no = 0;

	/* The list removal, the state change and the descriptor release
	are all done while holding trx_sys->mutex. */
	mutex_enter(&trx_sys->mutex);

	ut_a(!trx->read_only);

	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));

	assert_trx_in_rw_list(trx);
	ut_d(trx->in_rw_trx_list = FALSE);

	trx->state = TRX_STATE_NOT_STARTED;
	trx_release_descriptor(trx);

	mutex_exit(&trx_sys->mutex);

	/* The transaction is no longer in any trx_list. Recovered
	transactions are never placed in the mysql_trx_list. */
	ut_ad(trx->is_recovered);
	ut_ad(!trx->in_ro_trx_list);
	ut_ad(!trx->in_rw_trx_list);
	ut_ad(!trx->in_mysql_trx_list);
}
/********************************************************************//**
Returns the read view used for consistent reads in this transaction. The
view is opened by the first call made after the transaction started; every
later call returns that same view. A newly opened view is also stored as
the transaction's global read view.
@return consistent read view */
UNIV_INTERN
read_view_t*
trx_assign_read_view(
/*=================*/
	trx_t*	trx)	/*!< in: active transaction */
{
	ut_ad(trx->state == TRX_STATE_ACTIVE);

	if (trx->read_view == NULL) {
		trx->read_view = read_view_open_now(trx->id,
						    trx->prebuilt_view);
		trx->global_read_view = trx->read_view;
	}

	return(trx->read_view);
}
/********************************************************************//**
Gives the receiver transaction a clone of the donor transaction's read
view, so that consistent reads in both see the same snapshot. The caller
must hold lock_sys->mutex, trx_sys->mutex and the donor's trx mutex, and
the receiver must not have a read view yet.
@return read view clone, or NULL if the donor is not active or has no
read view */
UNIV_INTERN
read_view_t*
trx_clone_read_view(
/*================*/
	trx_t*	trx,		/*!< in: receiver transaction */
	trx_t*	from_trx)	/*!< in: donor transaction */
{
	ut_ad(lock_mutex_own());
	ut_ad(mutex_own(&trx_sys->mutex));
	ut_ad(trx_mutex_own(from_trx));
	ut_ad(trx->read_view == NULL);

	const bool	donor_usable
		= !(from_trx->state != TRX_STATE_ACTIVE
		    || from_trx->read_view == NULL);

	if (!donor_usable) {
		/* Nothing to clone from. */
		return(NULL);
	}

	trx->read_view = read_view_clone(from_trx->read_view,
					 trx->prebuilt_view);

	read_view_add(trx->read_view);

	trx->global_read_view = trx->read_view;

	return(trx->read_view);
}
/****************************************************************//**
Prepares a transaction for commit/rollback. A transaction that has not been
started is started first. If the transaction is in a lock wait, the waiting
query thread is moved to the suspended state and the transaction is marked
as running again. Calling this for a transaction in
TRX_STATE_COMMITTED_IN_MEMORY (or any unknown state) is a fatal error. */
UNIV_INTERN
void
trx_commit_or_rollback_prepare(
/*===========================*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	/* We are reading trx->state without holding trx_sys->mutex
	here, because the commit or rollback should be invoked for a
	running (or recovered prepared) transaction that is associated
	with the current thread. */

	switch (trx->state) {
	case TRX_STATE_NOT_STARTED:
		/* Debug builds: record where the transaction was started,
		as trx_commit_for_mysql() does before trx_start_low().
		trx_commit_in_memory() asserts that both fields are
		non-zero, so a transaction started implicitly here would
		otherwise trip that assertion on commit. */
		ut_d(trx->start_file = __FILE__);
		ut_d(trx->start_line = __LINE__);

		trx_start_low(trx);
		/* fall through */
	case TRX_STATE_ACTIVE:
	case TRX_STATE_PREPARED:
		/* If the trx is in a lock wait state, moves the waiting
		query thread to the suspended state */

		if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {

			ulint		sec;
			ulint		ms;
			ib_uint64_t	now;

			ut_a(trx->lock.wait_thr != NULL);
			trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
			trx->lock.wait_thr = NULL;

			if (UNIV_UNLIKELY(trx->take_stats)) {
				/* Add the time spent in this lock wait to
				the per-transaction wait timer. The second
				value is combined as sec * 1000000 + ms,
				i.e. it is treated as a sub-second
				microsecond count. */
				ut_usectime(&sec, &ms);
				now = (ib_uint64_t)sec * 1000000 + ms;
				trx->lock_que_wait_timer
					+= (ulint)
					(now - trx->lock_que_wait_ustarted);
			}

			trx->lock.que_state = TRX_QUE_RUNNING;
		}

		ut_a(trx->lock.n_active_thrs == 1);
		return;
	case TRX_STATE_COMMITTED_IN_MEMORY:
		break;
	}

	ut_error;
}
/*********************************************************************//**
Allocates a commit command node from the given memory heap and initialises
its node type and state. Only those two fields are set here.
@return own: commit node struct */
UNIV_INTERN
commit_node_t*
trx_commit_node_create(
/*===================*/
	mem_heap_t*	heap)	/*!< in: mem heap where created */
{
	commit_node_t*	commit_node = static_cast<commit_node_t*>(
		mem_heap_alloc(heap, sizeof(commit_node_t)));

	commit_node->common.type = QUE_NODE_COMMIT;
	commit_node->state = COMMIT_NODE_SEND;

	return(commit_node);
}
/***********************************************************//**
Performs an execution step for a commit type node in a query graph. When
the node is in the SEND state the transaction of the query thread is
committed and NULL is returned. When it is in the WAIT state the node is
re-armed and control is passed back to the parent node.
@return query thread to run next, or NULL */
UNIV_INTERN
que_thr_t*
trx_commit_step(
/*============*/
	que_thr_t*	thr)	/*!< in: query thread */
{
	commit_node_t*	commit_node
		= static_cast<commit_node_t*>(thr->run_node);

	ut_ad(que_node_get_type(commit_node) == QUE_NODE_COMMIT);

	/* Entered from the parent: start a fresh commit. */
	if (thr->prev_node == que_node_get_parent(commit_node)) {
		commit_node->state = COMMIT_NODE_SEND;
	}

	if (commit_node->state != COMMIT_NODE_SEND) {
		ut_ad(commit_node->state == COMMIT_NODE_WAIT);

		commit_node->state = COMMIT_NODE_SEND;
		thr->run_node = que_node_get_parent(commit_node);

		return(thr);
	}

	commit_node->state = COMMIT_NODE_WAIT;

	trx_t*	trx = thr_get_trx(thr);

	ut_a(trx->lock.wait_thr == NULL);
	ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);

	trx_commit_or_rollback_prepare(trx);

	/* que_state is COMMITTING only for the duration of trx_commit(). */
	trx->lock.que_state = TRX_QUE_COMMITTING;

	trx_commit(trx);

	ut_ad(trx->lock.wait_thr == NULL);

	trx->lock.que_state = TRX_QUE_RUNNING;

	return(NULL);
}
/**********************************************************************//**
Does the transaction commit for MySQL. A transaction that has not been
started yet is started first, because the commit is not done by sending an
Innobase signal to the transaction. Committing a transaction that is
already in TRX_STATE_COMMITTED_IN_MEMORY is a fatal error.
@return DB_SUCCESS or error number */
UNIV_INTERN
dberr_t
trx_commit_for_mysql(
/*=================*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	ut_a(trx);

	switch (trx->state) {
	case TRX_STATE_COMMITTED_IN_MEMORY:
		/* Handled as an error after the switch. */
		break;

	case TRX_STATE_NOT_STARTED:
		/* Refresh the flag that tells whether the XA steps, which
		eat CPU time, may be skipped.

		trx->support_xa is not reread from thd while a transaction
		is running, so a changed value only takes effect in the
		next transaction. This avoids a scenario where some undo
		log records generated by a transaction contain XA
		information and other undo log records, generated by the
		same transaction, do not. */
		trx->support_xa = thd_supports_xa(trx->mysql_thd);

		ut_d(trx->start_file = __FILE__);
		ut_d(trx->start_line = __LINE__);

		trx_start_low(trx);
		/* fall through */
	case TRX_STATE_PREPARED:
	case TRX_STATE_ACTIVE:
		trx->op_info = "committing";

		trx_commit(trx);
		MONITOR_DEC(MONITOR_TRX_ACTIVE);

		trx->op_info = "";

		return(DB_SUCCESS);
	}

	ut_error;
	return(DB_CORRUPTION);
}
/**********************************************************************//**
Completes a commit whose log flush was deferred, i.e. one done by
trx_commit_for_mysql() with trx->flush_log_later == TRUE. Nothing is done
when no flush is pending, when the session asked to ignore durability, or
when the flush setting is 1 and trx->active_commit_ordered is set. */
UNIV_INTERN
void
trx_commit_complete_for_mysql(
/*==========================*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	ut_a(trx);

	if (!trx->must_flush_log_later) {
		return;
	}

	if (thd_requested_durability(trx->mysql_thd)
	    == HA_IGNORE_DURABILITY) {
		return;
	}

	/* Global setting (NULL thd) or the session's own. */
	const ulint	mode = srv_use_global_flush_log_at_trx_commit
		? thd_flush_log_at_trx_commit(NULL)
		: thd_flush_log_at_trx_commit(trx->mysql_thd);

	if (mode == 1 && trx->active_commit_ordered) {
		/* Returns with must_flush_log_later still set. */
		return;
	}

	trx_flush_log_if_needed(trx->commit_lsn, trx);

	trx->must_flush_log_later = FALSE;
}
/**********************************************************************//**
Marks the latest SQL statement ended: the current undo number becomes the
start point of the next statement, and the FTS statement savepoint, if
any, is refreshed. Only valid for a transaction that is not started or is
active; any other state is a fatal error. */
UNIV_INTERN
void
trx_mark_sql_stat_end(
/*==================*/
	trx_t*	trx)	/*!< in: trx handle */
{
	ut_a(trx);

	switch (trx->state) {
	case TRX_STATE_NOT_STARTED:
		trx->undo_no = 0;
		/* fall through */
	case TRX_STATE_ACTIVE:
		trx->last_sql_stat_start.least_undo_no = trx->undo_no;

		if (trx->fts_trx != NULL) {
			fts_savepoint_laststmt_refresh(trx);
		}

		return;

	case TRX_STATE_PREPARED:
	case TRX_STATE_COMMITTED_IN_MEMORY:
		break;
	}

	ut_error;
}
/**********************************************************************//**
Prints info about a transaction: id, state, current operation, table and
lock counts, undo log entries and, when a MySQL thread is attached, the
thread information.
Caller must hold trx_sys->mutex. */
UNIV_INTERN
void
trx_print_low(
/*==========*/
	FILE*		f,
			/*!< in: output stream */
	const trx_t*	trx,
			/*!< in: transaction */
	ulint		max_query_len,
			/*!< in: max query length to print,
			or 0 to use the default max length */
	ulint		n_rec_locks,
			/*!< in: lock_number_of_rows_locked(&trx->lock) */
	ulint		n_trx_locks,
			/*!< in: length of trx->lock.trx_locks */
	ulint		heap_size)
			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
{
	ut_ad(mutex_own(&trx_sys->mutex));

	fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);

	/* trx->state cannot change from or to NOT_STARTED while we
	are holding the trx_sys->mutex. It may change from ACTIVE to
	PREPARED or COMMITTED. */
	switch (trx->state) {
	case TRX_STATE_NOT_STARTED:
		fputs(", not started", f);
		break;
	case TRX_STATE_ACTIVE:
		fprintf(f, ", ACTIVE %lu sec",
			(ulong) difftime(time(NULL), trx->start_time));
		break;
	case TRX_STATE_PREPARED:
		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
			(ulong) difftime(time(NULL), trx->start_time));
		break;
	case TRX_STATE_COMMITTED_IN_MEMORY:
		fputs(", COMMITTED IN MEMORY", f);
		break;
	default:
		/* Not one of the known states. */
		fprintf(f, ", state %lu", (ulong) trx->state);
		ut_ad(0);
	}

	/* Take one copy of the pointer so that the test and the print
	below use the same string (prevents a race condition). */
	const char*	info = trx->op_info;

	if (*info) {
		putc(' ', f);
		fputs(info, f);
	}

	if (trx->is_recovered) {
		fputs(" recovered trx", f);
	}

	if (trx->declared_to_be_inside_innodb) {
		fprintf(f, ", thread declared inside InnoDB %lu",
			(ulong) trx->n_tickets_to_enter_innodb);
	}

	putc('\n', f);

	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
		fprintf(f, "mysql tables in use %lu, locked %lu\n",
			(ulong) trx->n_mysql_tables_in_use,
			(ulong) trx->mysql_n_tables_locked);
	}

	/* Tracks whether something was printed on the current line and a
	terminating newline is therefore needed. */
	ibool	need_newline = TRUE;

	/* trx->lock.que_state of an ACTIVE transaction may change
	while we are not holding trx->mutex. We perform a dirty read
	for performance reasons. */
	switch (trx->lock.que_state) {
	case TRX_QUE_RUNNING:
		need_newline = FALSE;
		break;
	case TRX_QUE_LOCK_WAIT:
		fputs("LOCK WAIT ", f);
		break;
	case TRX_QUE_ROLLING_BACK:
		fputs("ROLLING BACK ", f);
		break;
	case TRX_QUE_COMMITTING:
		fputs("COMMITTING ", f);
		break;
	default:
		fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
	}

	if (n_trx_locks > 0 || heap_size > 400) {
		need_newline = TRUE;

		fprintf(f, "%lu lock struct(s), heap size %lu,"
			" %lu row lock(s)",
			(ulong) n_trx_locks,
			(ulong) heap_size,
			(ulong) n_rec_locks);
	}

	if (trx->has_search_latch) {
		need_newline = TRUE;
		fputs(", holds adaptive hash latch", f);
	}

	if (trx->undo_no != 0) {
		need_newline = TRUE;
		fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
	}

	if (need_newline) {
		putc('\n', f);
	}

	if (trx->mysql_thd != NULL) {
		innobase_mysql_print_thd(
			f, trx->mysql_thd, static_cast<uint>(max_query_len));
	}
}
/**********************************************************************//**
Prints info about a transaction, gathering the lock statistics directly
from trx->lock.
The caller must hold lock_sys->mutex and trx_sys->mutex.
When possible, use trx_print() instead. */
UNIV_INTERN
void
trx_print_latched(
/*==============*/
	FILE*		f,		/*!< in: output stream */
	const trx_t*	trx,		/*!< in: transaction */
	ulint		max_query_len)	/*!< in: max query length to print,
					or 0 to use the default max length */
{
	ut_ad(lock_mutex_own());
	ut_ad(mutex_own(&trx_sys->mutex));

	const ulint	rec_locks = lock_number_of_rows_locked(&trx->lock);
	const ulint	trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
	const ulint	heap_bytes = mem_heap_get_size(trx->lock.lock_heap);

	trx_print_low(f, trx, max_query_len,
		      rec_locks, trx_locks, heap_bytes);
}
/**********************************************************************//**
Prints info about a transaction.
Acquires and releases lock_sys->mutex and trx_sys->mutex, one after the
other: the lock statistics are sampled under lock_sys->mutex, and the
printing is done under trx_sys->mutex. */
UNIV_INTERN
void
trx_print(
/*======*/
	FILE*		f,		/*!< in: output stream */
	const trx_t*	trx,		/*!< in: transaction */
	ulint		max_query_len)	/*!< in: max query length to print,
					or 0 to use the default max length */
{
	lock_mutex_enter();

	const ulint	rec_locks = lock_number_of_rows_locked(&trx->lock);
	const ulint	trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
	const ulint	heap_bytes = mem_heap_get_size(trx->lock.lock_heap);

	lock_mutex_exit();

	mutex_enter(&trx_sys->mutex);

	trx_print_low(f, trx, max_query_len,
		      rec_locks, trx_locks, heap_bytes);

	mutex_exit(&trx_sys->mutex);
}
#ifdef UNIV_DEBUG
/**********************************************************************//**
Asserts that a transaction has been started. Fails fatally if the state is
TRX_STATE_NOT_STARTED (or unknown).
The caller must hold trx_sys->mutex.
@return TRUE if started */
UNIV_INTERN
ibool
trx_assert_started(
/*===============*/
	const trx_t*	trx)	/*!< in: transaction */
{
	ut_ad(mutex_own(&trx_sys->mutex));

	/* Non-locking autocommits should not hold any locks and this
	function is only called from the locking code. */
	assert_trx_in_list(trx);

	/* trx->state can change from or to NOT_STARTED while we are holding
	trx_sys->mutex for non-locking autocommit selects but not for other
	types of transactions. It may change from ACTIVE to PREPARED. Unless
	we are holding lock_sys->mutex, it may also change to COMMITTED. */

	switch (trx->state) {
	case TRX_STATE_NOT_STARTED:
		break;

	case TRX_STATE_ACTIVE:
	case TRX_STATE_PREPARED:
	case TRX_STATE_COMMITTED_IN_MEMORY:
		/* Every state past NOT_STARTED counts as started. */
		return(TRUE);
	}

	ut_error;
	return(FALSE);
}
#endif /* UNIV_DEBUG */
/*******************************************************************//**
Compares the "weight" (or size) of two transactions. The heavier the weight,
the more reluctant we will be to choose the transaction as a deadlock victim.
The server layer is consulted first; a negative preference yields FALSE and
a positive one TRUE. Only when it has no preference are the TRX_WEIGHT()
values compared.
@return TRUE if weight(a) >= weight(b) */
UNIV_INTERN
ibool
trx_weight_ge(
/*==========*/
	const trx_t*	a,	/*!< in: the first transaction to be compared */
	const trx_t*	b)	/*!< in: the second transaction to be compared */
{
	/* First ask the upper server layer if it has any preference for
	which to prefer as a deadlock victim. */
	const int	server_pref
		= thd_deadlock_victim_preference(a->mysql_thd, b->mysql_thd);

	if (server_pref != 0) {
		return(server_pref > 0 ? TRUE : FALSE);
	}

	/* Upper server layer had no preference, we fall back to comparing
	the number of altered/locked rows. */

#if 0
	fprintf(stderr,
		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
		__func__,
		a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks),
		b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks));
#endif

	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
}
/****************************************************************//**
Prepares a transaction: marks its undo logs as prepared in a
mini-transaction, moves the transaction to TRX_STATE_PREPARED and, if
anything was written, writes/flushes the redo log as configured. */
static
void
trx_prepare(
/*========*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	mtr_t		mtr;
	lsn_t		prepare_lsn = 0;
	trx_rseg_t*	rseg = trx->rseg;

	/* Only fresh user transactions can be prepared.
	Recovered transactions cannot. */
	ut_a(!trx->is_recovered);

	if (trx->insert_undo != NULL || trx->update_undo != NULL) {

		mtr_start(&mtr);

		/* Change the undo log segment states from TRX_UNDO_ACTIVE
		to TRX_UNDO_PREPARED: these modifications to the file data
		structure define the transaction as prepared in the
		file-based world, at the serialization point of lsn. */

		mutex_enter(&rseg->mutex);

		/* It is not necessary to obtain trx->undo_mutex here
		because only a single OS thread is allowed to do the
		transaction prepare for this transaction. */
		if (trx->insert_undo != NULL) {
			trx_undo_set_state_at_prepare(
				trx, trx->insert_undo, &mtr);
		}

		if (trx->update_undo) {
			trx_undo_set_state_at_prepare(
				trx, trx->update_undo, &mtr);
		}

		mutex_exit(&rseg->mutex);

		/* This mtr commit makes the transaction prepared in the
		file-based world */
		/*--------------*/
		mtr_commit(&mtr);
		/*--------------*/

		prepare_lsn = mtr.end_lsn;
		ut_ad(prepare_lsn);
	}

	/*--------------------------------------*/
	ut_a(trx->state == TRX_STATE_ACTIVE);

	mutex_enter(&trx_sys->mutex);
	trx->state = TRX_STATE_PREPARED;
	trx_sys->n_prepared_trx++;
	mutex_exit(&trx_sys->mutex);
	/*--------------------------------------*/

	if (prepare_lsn) {
		/* Depending on the my.cnf options, we may now write the log
		buffer to the log files, making the prepared state of the
		transaction durable if the OS does not crash. We may also
		flush the log files to disk, making the prepared state of the
		transaction durable also at an OS crash or a power outage.

		The idea in InnoDB's group prepare is that a group of
		transactions gather behind a trx doing a physical disk write
		to log files, and when that physical write has been completed,
		one of those transactions does a write which prepares the whole
		group. Note that this group prepare will only bring benefit if
		there are > 2 users in the database. Then at least 2 users can
		gather behind one doing the physical log write to disk.

		TODO: find out if MySQL holds some mutex when calling this.
		That would spoil our group prepare algorithm. */

		trx_flush_log_if_needed(prepare_lsn, trx);
	}
}
/**********************************************************************//**
Does the transaction prepare for MySQL. The transaction is started first
if necessary, and trx->op_info reads "preparing" while trx_prepare()
runs. */
UNIV_INTERN
void
trx_prepare_for_mysql(
/*==================*/
	trx_t*	trx)	/*!< in/out: trx handle */
{
	trx_start_if_not_started_xa(trx);

	/* op_info is what trx_print_low() shows next to the state. */
	trx->op_info = "preparing";

	trx_prepare(trx);

	trx->op_info = "";
}
/**********************************************************************//**
Collects the XIDs of the transactions that are in the prepared state, for
XA recovery. trx_sys->rw_trx_list is scanned under trx_sys->mutex and at
most len XIDs are copied to xid_list; each one found is also reported on
stderr.
@return number of prepared transactions stored in xid_list */
UNIV_INTERN
int
trx_recover_for_mysql(
/*==================*/
	XID*	xid_list,	/*!< in/out: prepared transactions */
	ulint	len)		/*!< in: number of slots in xid_list */
{
	ulint	n_found = 0;

	ut_ad(xid_list);
	ut_ad(len);

	/* We should set those transactions which are in the prepared state
	to the xid_list */

	mutex_enter(&trx_sys->mutex);

	for (const trx_t* trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
	     trx != NULL;
	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {

		assert_trx_in_rw_list(trx);

		/* The state of a read-write transaction cannot change
		from or to NOT_STARTED while we are holding the
		trx_sys->mutex. It may change to PREPARED, but not if
		trx->is_recovered. It may also change to COMMITTED. */
		if (!trx_state_eq(trx, TRX_STATE_PREPARED)) {
			continue;
		}

		xid_list[n_found] = trx->xid;

		if (n_found == 0) {
			/* Banner, printed once before the first hit. */
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: Starting recovery for"
				" XA transactions...\n");
		}

		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Transaction " TRX_ID_FMT " in"
			" prepared state after recovery\n",
			trx->id);

		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Transaction contains changes"
			" to " TRX_ID_FMT " rows\n",
			trx->undo_no);

		n_found++;

		if (n_found == len) {
			/* xid_list is full. */
			break;
		}
	}

	mutex_exit(&trx_sys->mutex);

	if (n_found > 0) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: %d transactions in prepared state"
			" after recovery\n",
			static_cast<int>(n_found));
	}

	return(static_cast<int>(n_found));
}
/*******************************************************************//**
Looks up one recovered X/Open XA distributed transaction that is in the
prepared state, by its XID. trx_sys->rw_trx_list is searched; the caller
must hold trx_sys->mutex.
@return trx on match, NULL if there is no match; on match the trx->xid
will be invalidated; note that the trx may have been committed, unless the
caller is holding lock_sys->mutex */
static __attribute__((nonnull, warn_unused_result))
trx_t*
trx_get_trx_by_xid_low(
/*===================*/
	const XID*	xid)	/*!< in: X/Open XA transaction
				identifier */
{
	ut_ad(mutex_own(&trx_sys->mutex));

	for (trx_t* candidate = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
	     candidate != NULL;
	     candidate = UT_LIST_GET_NEXT(trx_list, candidate)) {

		assert_trx_in_rw_list(candidate);

		/* Two X/Open XA transaction ids are equal when their
		lengths are the same and the first
		gtrid_length+bqual_length bytes of their data compare
		equal. Only recovered, prepared transactions qualify. */
		if (!candidate->is_recovered
		    || !trx_state_eq(candidate, TRX_STATE_PREPARED)
		    || xid->gtrid_length != candidate->xid.gtrid_length
		    || xid->bqual_length != candidate->xid.bqual_length
		    || memcmp(xid->data, candidate->xid.data,
			      xid->gtrid_length + xid->bqual_length) != 0) {

			continue;
		}

		/* Invalidate the XID, so that subsequent calls
		will not find it. */
		memset(&candidate->xid, 0, sizeof(candidate->xid));
		candidate->xid.formatID = -1;

		return(candidate);
	}

	return(NULL);
}
/*******************************************************************//**
Looks up one X/Open XA distributed transaction that is in the prepared
state, by its XID. A NULL xid yields NULL without any search; otherwise
the search is done by trx_get_trx_by_xid_low() under trx_sys->mutex.
@return trx or NULL; on match, the trx->xid will be invalidated;
note that the trx may have been committed, unless the caller is
holding lock_sys->mutex */
UNIV_INTERN
trx_t*
trx_get_trx_by_xid(
/*===============*/
	const XID*	xid)	/*!< in: X/Open XA transaction identifier */
{
	trx_t*	found = NULL;

	if (xid != NULL) {
		mutex_enter(&trx_sys->mutex);

		/* Recovered/Resurrected transactions are always only on the
		trx_sys_t::rw_trx_list. */
		found = trx_get_trx_by_xid_low(xid);

		mutex_exit(&trx_sys->mutex);
	}

	return(found);
}
/*************************************************************//**
Starts the transaction if it is not yet started, refreshing
trx->support_xa from the session first. An active transaction is left
alone; a prepared or committed-in-memory one is a fatal error. */
UNIV_INTERN
void
trx_start_if_not_started_xa_low(
/*============================*/
	trx_t*	trx)	/*!< in: transaction */
{
	switch (trx->state) {
	case TRX_STATE_ACTIVE:
		/* Already running. */
		return;

	case TRX_STATE_NOT_STARTED:
		/* Refresh the flag that tells whether the XA steps, which
		eat CPU time, may be skipped.

		trx->support_xa is not reread from thd while a transaction
		is running, so a changed value only takes effect in the
		next transaction. This avoids a scenario where some undo
		generated by a transaction has XA stuff, and other undo,
		generated by the same transaction, doesn't. */
		trx->support_xa = thd_supports_xa(trx->mysql_thd);

		trx_start_low(trx);
		return;

	case TRX_STATE_PREPARED:
	case TRX_STATE_COMMITTED_IN_MEMORY:
		break;
	}

	ut_error;
}
/*************************************************************//**
Starts the transaction if it is not yet started. An active transaction is
left alone; a prepared or committed-in-memory one is a fatal error. */
UNIV_INTERN
void
trx_start_if_not_started_low(
/*=========================*/
	trx_t*	trx)	/*!< in: transaction */
{
	switch (trx->state) {
	case TRX_STATE_ACTIVE:
		/* Already running. */
		return;

	case TRX_STATE_NOT_STARTED:
		trx_start_low(trx);
		return;

	case TRX_STATE_PREPARED:
	case TRX_STATE_COMMITTED_IN_MEMORY:
		break;
	}

	ut_error;
}
/*************************************************************//**
Starts the transaction for a DDL operation. A transaction that is not yet
started is flagged as a dictionary operation of the given type and as a
locking transaction before it is started. An already active transaction
only gets its ddl flag set. Any other state is a fatal error. */
UNIV_INTERN
void
trx_start_for_ddl_low(
/*==================*/
	trx_t*		trx,	/*!< in/out: transaction */
	trx_dict_op_t	op)	/*!< in: dictionary operation type */
{
	switch (trx->state) {
	case TRX_STATE_ACTIVE:
		/* We have this start if not started idiom, therefore we
		can't add stronger checks here. */
		trx->ddl = true;

		ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
		ut_ad(trx->will_lock > 0);
		return;

	case TRX_STATE_NOT_STARTED:
		/* Flag this transaction as a dictionary operation, so that
		the data dictionary will be locked in crash recovery. */
		trx_set_dict_operation(trx, op);

		/* Ensure it is not flagged as an auto-commit-non-locking
		transaction. */
		trx->will_lock = 1;

		trx->ddl = true;

		trx_start_low(trx);
		return;

	case TRX_STATE_PREPARED:
	case TRX_STATE_COMMITTED_IN_MEMORY:
		break;
	}

	ut_error;
}