/*****************************************************************************

Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file trx/trx0trx.cc
The transaction

Created 3/26/1996 Heikki Tuuri
*******************************************************/

#include "btr0types.h"
#include "trx0trx.h"

#ifdef UNIV_NONINL
#include "trx0trx.ic"
#endif

#include "trx0undo.h"
#include "trx0rseg.h"
#include "log0log.h"
#include "que0que.h"
#include "lock0lock.h"
#include "trx0roll.h"
#include "usr0sess.h"
#include "read0read.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "btr0sea.h"
#include "os0proc.h"
#include "trx0xa.h"
#include "trx0rec.h"
#include "trx0purge.h"
#include "ha_prototypes.h"
#include "srv0mon.h"
#include "ut0vec.h"

/* std::set is the container behind table_id_set below. */
#include <set>

/** Set of table_id. Used by trx_resurrect_table_locks() to collect the
distinct table ids found in an undo log. */
typedef std::set<table_id_t>	table_id_set;

/** Dummy session used currently in MySQL interface */
UNIV_INTERN sess_t*		trx_dummy_sess = NULL;

#ifdef UNIV_PFS_MUTEX
/* Key to register the trx_t::mutex with performance schema */
UNIV_INTERN mysql_pfs_key_t	trx_mutex_key;
/* Key to register the trx_t::undo_mutex with performance schema */
UNIV_INTERN mysql_pfs_key_t	trx_undo_mutex_key;
#endif /* UNIV_PFS_MUTEX */
/*************************************************************//**
Copy a detailed error message into the transaction object. The copy is
bounded by the size of trx->detailed_error. */
UNIV_INTERN
void
trx_set_detailed_error(
/*===================*/
	trx_t*		trx,	/*!< in: transaction struct */
	const char*	msg)	/*!< in: detailed error message */
{
	const ulint	capacity = sizeof(trx->detailed_error);

	ut_strlcpy(trx->detailed_error, msg, capacity);
}

/*************************************************************//**
Fill the transaction's detailed error message with the contents of a
file. Note that the file is rewound before reading from it. */
UNIV_INTERN
void
trx_set_detailed_error_from_file(
/*=============================*/
	trx_t*	trx,	/*!< in: transaction struct */
	FILE*	file)	/*!< in: file to read message from */
{
	const ulint	capacity = sizeof(trx->detailed_error);

	os_file_read_string(file, trx->detailed_error, capacity);
}

/*************************************************************//**
Three-way comparison of two trx ids, usable as a callback for
trx_find_descriptor().
@return -1 if *a < *b, 1 if *a > *b, 0 if they are equal */
UNIV_INTERN
int
trx_descr_cmp(
/*==========*/
	const void *a,	/*!< in: pointer to first comparison argument */
	const void *b)	/*!< in: pointer to second comparison argument */
{
	const trx_id_t	lhs = *static_cast<const trx_id_t*>(a);
	const trx_id_t	rhs = *static_cast<const trx_id_t*>(b);

	if (lhs == rhs) {
		return(0);
	}

	return(lhs < rhs ? -1 : 1);
}

/*************************************************************//**
Reserve a slot for a given trx in the global descriptors array.
*/
UNIV_INLINE
void
trx_reserve_descriptor(
/*===================*/
	const trx_t* trx)	/*!< in: trx pointer */
{
	ulint		n_used;
	ulint		n_max;
	trx_id_t*	descr;

	ut_ad(mutex_own(&trx_sys->mutex) || srv_is_being_started);
	ut_ad(srv_is_being_started ||
	      !trx_find_descriptor(trx_sys->descriptors,
				   trx_sys->descr_n_used,
				   trx->id));

	n_used = trx_sys->descr_n_used + 1;
	n_max = trx_sys->descr_n_max;

	if (UNIV_UNLIKELY(n_used > n_max)) {

		/* The array is full: double its capacity. */
		n_max = n_max * 2;
		trx_sys->descriptors = static_cast<trx_id_t*>(
			ut_realloc(trx_sys->descriptors,
				   n_max * sizeof(trx_id_t)));

		trx_sys->descr_n_max = n_max;
		srv_descriptors_memory = n_max * sizeof(trx_id_t);
	}

	/* By default the new id goes into the first unused slot. */
	descr = trx_sys->descriptors + n_used - 1;

	if (UNIV_UNLIKELY(n_used > 1 && trx->id < descr[-1])) {

		/* Find the slot where it should be inserted. We could use a
		binary search, but in reality linear search should be faster,
		because the slot we are looking for is near the array end. */

		trx_id_t*	tdescr = descr;

		/* Walk back while the preceding element is larger than the
		new id. Testing tdescr[-1] instead of stepping below the
		first element keeps the pointer inside the array even when
		the new id is the smallest one. */
		while (tdescr > trx_sys->descriptors
		       && tdescr[-1] > trx->id) {
			tdescr--;
		}

		/* Shift the larger ids up by one slot to open a gap. */
		ut_memmove(tdescr + 1, tdescr,
			   (descr - tdescr) * sizeof(trx_id_t));

		descr = tdescr;
	}

	*descr = trx->id;

	trx_sys->descr_n_used = n_used;
}

/*************************************************************//**
Release a slot for a given trx in the global descriptors array.
*/ UNIV_INTERN void trx_release_descriptor( /*===================*/ trx_t* trx) /*!< in: trx pointer */ { ulint size; trx_id_t* descr; ut_ad(mutex_own(&trx_sys->mutex)); if (UNIV_LIKELY(trx->in_trx_serial_list)) { UT_LIST_REMOVE(trx_serial_list, trx_sys->trx_serial_list, trx); trx->in_trx_serial_list = false; } descr = trx_find_descriptor(trx_sys->descriptors, trx_sys->descr_n_used, trx->id); if (UNIV_UNLIKELY(descr == NULL)) { return; } size = (trx_sys->descriptors + trx_sys->descr_n_used - 1 - descr) * sizeof(trx_id_t); if (UNIV_LIKELY(size > 0)) { ut_memmove(descr, descr + 1, size); } trx_sys->descr_n_used--; } /****************************************************************//** Creates and initializes a transaction object. It must be explicitly started with trx_start_if_not_started() before using it. The default isolation level is TRX_ISO_REPEATABLE_READ. @return transaction instance, should never be NULL */ static trx_t* trx_create(void) /*============*/ { trx_t* trx; mem_heap_t* heap; ib_alloc_t* heap_alloc; trx = static_cast(mem_zalloc(sizeof(*trx))); mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX); trx->magic_n = TRX_MAGIC_N; trx->state = TRX_STATE_NOT_STARTED; trx->isolation_level = TRX_ISO_REPEATABLE_READ; trx->no = TRX_ID_MAX; trx->in_trx_serial_list = false; trx->support_xa = TRUE; trx->fake_changes = FALSE; trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; trx->dict_operation = TRX_DICT_OP_NONE; trx->idle_start = 0; trx->last_stmt_start = 0; mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO); trx->error_state = DB_SUCCESS; trx->lock.que_state = TRX_QUE_RUNNING; trx->lock.lock_heap = mem_heap_create_typed( 256, MEM_HEAP_FOR_LOCK_HEAP); trx->search_latch_timeout = BTR_SEA_TIMEOUT; trx->io_reads = 0; trx->io_read = 0; trx->io_reads_wait_timer = 0; trx->lock_que_wait_timer = 0; trx->innodb_que_wait_timer = 0; trx->distinct_page_access = 0; trx->distinct_page_access_hash = NULL; trx->take_stats = FALSE; trx->xid.formatID = 
-1; trx->op_info = ""; trx->api_trx = false; trx->api_auto_commit = false; trx->read_write = true; heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8); heap_alloc = ib_heap_allocator_create(heap); /* Remember to free the vector explicitly in trx_free(). */ trx->autoinc_locks = ib_vector_create(heap_alloc, sizeof(void**), 4); /* Remember to free the vector explicitly in trx_free(). */ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 128); heap_alloc = ib_heap_allocator_create(heap); trx->lock.table_locks = ib_vector_create( heap_alloc, sizeof(void**), 32); return(trx); } /********************************************************************//** Creates a transaction object for background operations by the master thread. @return own: transaction object */ UNIV_INTERN trx_t* trx_allocate_for_background(void) /*=============================*/ { trx_t* trx; trx = trx_create(); trx->sess = trx_dummy_sess; return(trx); } /********************************************************************//** Creates a transaction object for MySQL. @return own: transaction object */ UNIV_INTERN trx_t* trx_allocate_for_mysql(void) /*========================*/ { trx_t* trx; trx = trx_allocate_for_background(); mutex_enter(&trx_sys->mutex); ut_d(trx->in_mysql_trx_list = TRUE); UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); mutex_exit(&trx_sys->mutex); if (UNIV_UNLIKELY(trx->take_stats)) { trx->distinct_page_access_hash = static_cast(mem_alloc(DPAH_SIZE)); memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); } return(trx); } /********************************************************************//** Frees a transaction object without releasing the corresponding descriptor. Should be used by callers that already own trx_sys->mutex. 
*/
static
void
trx_free_low(
/*=========*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	/* The object must carry the trx magic number; debug builds also
	check that the in_*_trx_list flags are all clear. */
	ut_a(trx->magic_n == TRX_MAGIC_N);
	ut_ad(!trx->in_ro_trx_list);
	ut_ad(!trx->in_rw_trx_list);
	ut_ad(!trx->in_mysql_trx_list);

	mutex_free(&trx->undo_mutex);

	if (trx->undo_no_arr != NULL) {
		trx_undo_arr_free(trx->undo_no_arr);
	}

	/* The transaction must not be waiting for a lock. */
	ut_a(trx->lock.wait_lock == NULL);
	ut_a(trx->lock.wait_thr == NULL);

	ut_a(!trx->has_search_latch);
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!btr_search_own_any());
#endif

	ut_a(trx->dict_operation_lock_mode == 0);

	if (trx->lock.lock_heap) {
		mem_heap_free(trx->lock.lock_heap);
	}

	/* No locks may remain attached to the transaction. */
	ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);

	ut_a(ib_vector_is_empty(trx->autoinc_locks));
	/* We allocated a dedicated heap for the vector. */
	ib_vector_free(trx->autoinc_locks);

	if (trx->lock.table_locks != NULL) {
		/* We allocated a dedicated heap for the vector. */
		ib_vector_free(trx->lock.table_locks);
	}

	mutex_free(&trx->mutex);

	read_view_free(trx->prebuilt_view);

	/* Finally release the trx_t itself. */
	mem_free(trx);
}

/********************************************************************//**
Frees a transaction object. Releases the transaction's descriptor slot
while holding trx_sys->mutex, then frees the object with trx_free_low(). */
static
void
trx_free(
/*=====*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	mutex_enter(&trx_sys->mutex);
	trx_release_descriptor(trx);
	mutex_exit(&trx_sys->mutex);

	trx_free_low(trx);
}

/********************************************************************//**
Frees a transaction object of a background operation of the master thread.
Logs an error, without aborting, if the transaction is still declared to be
inside InnoDB or still has MySQL tables in use or locked. Asserts that the
transaction is not started and has no undo logs or read view. */
UNIV_INTERN
void
trx_free_for_background(
/*====================*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	/* Release the page access statistics hash, if one was allocated. */
	if (trx->distinct_page_access_hash)
	{
		mem_free(trx->distinct_page_access_hash);
		trx->distinct_page_access_hash= NULL;
	}

	if (trx->declared_to_be_inside_innodb) {

		ib_logf(IB_LOG_LEVEL_ERROR,
			"Freeing a trx (%p, " TRX_ID_FMT ") which is declared "
			"to be processing inside InnoDB", trx, trx->id);

		trx_print(stderr, trx, 600);
		putc('\n', stderr);

		/* This is an error but not a fatal error. We must keep
		the counters like srv_conc_n_threads accurate. */
		srv_conc_force_exit_innodb(trx);
	}

	if (trx->n_mysql_tables_in_use != 0
	    || trx->mysql_n_tables_locked != 0) {

		ib_logf(IB_LOG_LEVEL_ERROR,
			"MySQL is freeing a thd though "
			"trx->n_mysql_tables_in_use is %lu and "
			"trx->mysql_n_tables_locked is %lu.",
			(ulong) trx->n_mysql_tables_in_use,
			(ulong) trx->mysql_n_tables_locked);

		/* Dump the transaction, including the raw bytes of the
		trx_t, to stderr for diagnosis. */
		trx_print(stderr, trx, 600);
		ut_print_buf(stderr, trx, sizeof(trx_t));
		putc('\n', stderr);
	}

	ut_a(trx->state == TRX_STATE_NOT_STARTED);
	ut_a(trx->insert_undo == NULL);
	ut_a(trx->update_undo == NULL);
	ut_a(trx->read_view == NULL);

	trx_free(trx);
}

/********************************************************************//**
At shutdown, frees a transaction object that is in the PREPARED state.
Releases its locks and prepared undo logs, removes it from
trx_sys->rw_trx_list and releases its descriptor before freeing it. */
UNIV_INTERN
void
trx_free_prepared(
/*==============*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	ut_a(trx_state_eq(trx, TRX_STATE_PREPARED));
	ut_a(trx->magic_n == TRX_MAGIC_N);

	lock_trx_release_locks(trx);
	trx_undo_free_prepared(trx);

	assert_trx_in_rw_list(trx);

	ut_a(!trx->read_only);

	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
	ut_d(trx->in_rw_trx_list = FALSE);

	/* trx_release_descriptor() requires trx_sys->mutex, and
	trx_free_low() below does not release the descriptor itself. */
	mutex_enter(&trx_sys->mutex);
	trx_release_descriptor(trx);
	mutex_exit(&trx_sys->mutex);

	/* Undo trx_resurrect_table_locks(). */
	UT_LIST_INIT(trx->lock.trx_locks);

	trx_free_low(trx);

	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
}

/********************************************************************//**
Frees a transaction object for MySQL.
*/ UNIV_INTERN void trx_free_for_mysql( /*===============*/ trx_t* trx) /*!< in, own: trx object */ { if (trx->distinct_page_access_hash) { mem_free(trx->distinct_page_access_hash); trx->distinct_page_access_hash= NULL; } mutex_enter(&trx_sys->mutex); ut_ad(trx->in_mysql_trx_list); ut_d(trx->in_mysql_trx_list = FALSE); UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx); ut_ad(trx_sys_validate_trx_list()); mutex_exit(&trx_sys->mutex); trx_free_for_background(trx); } /****************************************************************//** Inserts the trx handle in the trx system trx list in the right position. The list is sorted on the trx id so that the biggest id is at the list start. This function is used at the database startup to insert incomplete transactions to the list. */ static void trx_list_rw_insert_ordered( /*=======================*/ trx_t* trx) /*!< in: trx handle */ { trx_t* trx2; ut_ad(!trx->read_only); ut_d(trx->start_file = __FILE__); ut_d(trx->start_line = __LINE__); ut_a(srv_is_being_started); ut_ad(!trx->in_ro_trx_list); ut_ad(!trx->in_rw_trx_list); ut_ad(trx->state != TRX_STATE_NOT_STARTED); ut_ad(trx->is_recovered); for (trx2 = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); trx2 != NULL; trx2 = UT_LIST_GET_NEXT(trx_list, trx2)) { assert_trx_in_rw_list(trx2); if (trx->id >= trx2->id) { ut_ad(trx->id > trx2->id); break; } } if (trx2 != NULL) { trx2 = UT_LIST_GET_PREV(trx_list, trx2); if (trx2 == NULL) { UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx); } else { UT_LIST_INSERT_AFTER( trx_list, trx_sys->rw_trx_list, trx2, trx); } } else { UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx); } #ifdef UNIV_DEBUG if (trx->id > trx_sys->rw_max_trx_id) { trx_sys->rw_max_trx_id = trx->id; } #endif /* UNIV_DEBUG */ ut_ad(!trx->in_rw_trx_list); ut_d(trx->in_rw_trx_list = TRUE); } /****************************************************************//** Resurrect the table locks for a resurrected transaction. 
*/ static void trx_resurrect_table_locks( /*======================*/ trx_t* trx, /*!< in/out: transaction */ const trx_undo_t* undo) /*!< in: undo log */ { mtr_t mtr; page_t* undo_page; trx_undo_rec_t* undo_rec; table_id_set tables; ut_ad(undo == trx->insert_undo || undo == trx->update_undo); if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) || undo->empty) { return; } mtr_start(&mtr); /* trx_rseg_mem_create() may have acquired an X-latch on this page, so we cannot acquire an S-latch. */ undo_page = trx_undo_page_get( undo->space, undo->zip_size, undo->top_page_no, &mtr); undo_rec = undo_page + undo->top_offset; do { ulint type; ulint cmpl_info; bool updated_extern; undo_no_t undo_no; table_id_t table_id; page_t* undo_rec_page = page_align(undo_rec); if (undo_rec_page != undo_page) { if (!mtr_memo_release(&mtr, buf_block_align(undo_page), MTR_MEMO_PAGE_X_FIX)) { /* The page of the previous undo_rec should have been latched by trx_undo_page_get() or trx_undo_get_prev_rec(). */ ut_ad(0); } undo_page = undo_rec_page; } trx_undo_rec_get_pars( undo_rec, &type, &cmpl_info, &updated_extern, &undo_no, &table_id); tables.insert(table_id); undo_rec = trx_undo_get_prev_rec( undo_rec, undo->hdr_page_no, undo->hdr_offset, false, &mtr); } while (undo_rec); mtr_commit(&mtr); for (table_id_set::const_iterator i = tables.begin(); i != tables.end(); i++) { if (dict_table_t* table = dict_table_open_on_id( *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) { if (table->ibd_file_missing || dict_table_is_temporary(table)) { mutex_enter(&dict_sys->mutex); dict_table_close(table, TRUE, FALSE); dict_table_remove_from_cache(table); mutex_exit(&dict_sys->mutex); continue; } lock_table_ix_resurrect(table, trx); DBUG_PRINT("ib_trx", ("resurrect" TRX_ID_FMT " table '%s' IX lock from %s undo", trx->id, table->name, undo == trx->insert_undo ? 
"insert" : "update")); dict_table_close(table, FALSE, FALSE); } } } /****************************************************************//** Resurrect the transactions that were doing inserts the time of the crash, they need to be undone. @return trx_t instance */ static trx_t* trx_resurrect_insert( /*=================*/ trx_undo_t* undo, /*!< in: entry to UNDO */ trx_rseg_t* rseg) /*!< in: rollback segment */ { trx_t* trx; trx = trx_allocate_for_background(); trx->rseg = rseg; trx->xid = undo->xid; trx->id = undo->trx_id; trx->insert_undo = undo; trx->is_recovered = TRUE; /* This is single-threaded startup code, we do not need the protection of trx->mutex or trx_sys->mutex here. */ if (undo->state != TRX_UNDO_ACTIVE) { /* Prepared transactions are left in the prepared state waiting for a commit or abort decision from MySQL */ if (undo->state == TRX_UNDO_PREPARED) { fprintf(stderr, "InnoDB: Transaction " TRX_ID_FMT " was in the" " XA prepared state.\n", trx->id); if (srv_force_recovery == 0) { trx->state = TRX_STATE_PREPARED; trx_sys->n_prepared_trx++; trx_sys->n_prepared_recovered_trx++; } else { fprintf(stderr, "InnoDB: Since innodb_force_recovery" " > 0, we will rollback it anyway.\n"); trx->state = TRX_STATE_ACTIVE; } } else { trx->state = TRX_STATE_COMMITTED_IN_MEMORY; } /* We give a dummy value for the trx no; this should have no relevance since purge is not interested in committed transaction numbers, unless they are in the history list, in which case it looks the number from the disk based undo log structure */ trx->no = trx->id; } else { trx->state = TRX_STATE_ACTIVE; /* A running transaction always has the number field inited to TRX_ID_MAX */ trx->no = TRX_ID_MAX; } /* trx_start_low() is not called with resurrect, so need to initialize start time here.*/ if (trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED) { trx->start_time = ut_time(); } if (undo->dict_operation) { trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); trx->table_id = 
undo->table_id; } if (!undo->empty) { trx->undo_no = undo->top_undo_no + 1; } return(trx); } /****************************************************************//** Prepared transactions are left in the prepared state waiting for a commit or abort decision from MySQL */ static void trx_resurrect_update_in_prepared_state( /*===================================*/ trx_t* trx, /*!< in,out: transaction */ const trx_undo_t* undo) /*!< in: update UNDO record */ { /* This is single-threaded startup code, we do not need the protection of trx->mutex or trx_sys->mutex here. */ if (undo->state == TRX_UNDO_PREPARED) { fprintf(stderr, "InnoDB: Transaction " TRX_ID_FMT " was in the XA prepared state.\n", trx->id); if (srv_force_recovery == 0) { if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) { trx_sys->n_prepared_trx++; trx_sys->n_prepared_recovered_trx++; } else { ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED)); } trx->state = TRX_STATE_PREPARED; } else { fprintf(stderr, "InnoDB: Since innodb_force_recovery" " > 0, we will rollback it anyway.\n"); trx->state = TRX_STATE_ACTIVE; } } else { trx->state = TRX_STATE_COMMITTED_IN_MEMORY; } } /****************************************************************//** Resurrect the transactions that were doing updates the time of the crash, they need to be undone. */ static void trx_resurrect_update( /*=================*/ trx_t* trx, /*!< in/out: transaction */ trx_undo_t* undo, /*!< in/out: update UNDO record */ trx_rseg_t* rseg) /*!< in/out: rollback segment */ { trx->rseg = rseg; trx->xid = undo->xid; trx->id = undo->trx_id; trx->update_undo = undo; trx->is_recovered = TRUE; /* This is single-threaded startup code, we do not need the protection of trx->mutex or trx_sys->mutex here. 
*/ if (undo->state != TRX_UNDO_ACTIVE) { trx_resurrect_update_in_prepared_state(trx, undo); /* We give a dummy value for the trx number */ trx->no = trx->id; } else { trx->state = TRX_STATE_ACTIVE; /* A running transaction always has the number field inited to TRX_ID_MAX */ trx->no = TRX_ID_MAX; } /* trx_start_low() is not called with resurrect, so need to initialize start time here.*/ if (trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED) { trx->start_time = ut_time(); } if (undo->dict_operation) { trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); trx->table_id = undo->table_id; } if (!undo->empty && undo->top_undo_no >= trx->undo_no) { trx->undo_no = undo->top_undo_no + 1; } } /****************************************************************//** Creates trx objects for transactions and initializes the trx list of trx_sys at database start. Rollback segment and undo log lists must already exist when this function is called, because the lists of transactions to be rolled back or cleaned up are built based on the undo log lists. */ UNIV_INTERN void trx_lists_init_at_db_start(void) /*============================*/ { ulint i; ut_a(srv_is_being_started); UT_LIST_INIT(trx_sys->ro_trx_list); UT_LIST_INIT(trx_sys->rw_trx_list); UT_LIST_INIT(trx_sys->trx_serial_list); /* Look from the rollback segments if there exist undo logs for transactions */ for (i = 0; i < TRX_SYS_N_RSEGS; ++i) { trx_undo_t* undo; trx_rseg_t* rseg; rseg = trx_sys->rseg_array[i]; if (rseg == NULL) { continue; } /* Resurrect transactions that were doing inserts. */ for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); undo != NULL; undo = UT_LIST_GET_NEXT(undo_list, undo)) { trx_t* trx; trx = trx_resurrect_insert(undo, rseg); if (trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED) { trx_reserve_descriptor(trx); } trx_list_rw_insert_ordered(trx); trx_resurrect_table_locks(trx, undo); } /* Ressurrect transactions that were doing updates. 
*/ for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list); undo != NULL; undo = UT_LIST_GET_NEXT(undo_list, undo)) { trx_t* trx; ibool trx_created; /* Check the trx_sys->rw_trx_list first. */ mutex_enter(&trx_sys->mutex); trx = trx_get_rw_trx_by_id(undo->trx_id); mutex_exit(&trx_sys->mutex); if (trx == NULL) { trx = trx_allocate_for_background(); trx_created = TRUE; } else { trx_created = FALSE; } trx_resurrect_update(trx, undo, rseg); if (trx_created) { if (trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED) { trx_reserve_descriptor(trx); } trx_list_rw_insert_ordered(trx); } trx_resurrect_table_locks(trx, undo); } } } /******************************************************************//** Assigns a rollback segment to a transaction in a round-robin fashion. @return assigned rollback segment instance */ static trx_rseg_t* trx_assign_rseg_low( /*================*/ ulong max_undo_logs, /*!< in: maximum number of UNDO logs to use */ ulint n_tablespaces) /*!< in: number of rollback tablespaces */ { ulint i; trx_rseg_t* rseg; static ulint latest_rseg = 0; if (srv_read_only_mode) { ut_a(max_undo_logs == ULONG_UNDEFINED); return(NULL); } /* This breaks true round robin but that should be OK. */ ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS); i = latest_rseg++; i %= max_undo_logs; /* Note: The assumption here is that there can't be any gaps in the array. Once we implement more flexible rollback segment management this may not hold. The assertion checks for that case. */ if (trx_sys->rseg_array[0] == NULL) { return(NULL); } /* Skip the system tablespace if we have more than one tablespace defined for rollback segments. We want all UNDO records to be in the non-system tablespaces. */ do { rseg = trx_sys->rseg_array[i]; ut_a(rseg == NULL || i == rseg->id); i = (rseg == NULL) ? 
0 : i + 1; } while (rseg == NULL || (rseg->space == 0 && n_tablespaces > 0 && trx_sys->rseg_array[1] != NULL)); return(rseg); } /****************************************************************//** Assign a read-only transaction a rollback-segment, if it is attempting to write to a TEMPORARY table. */ UNIV_INTERN void trx_assign_rseg( /*============*/ trx_t* trx) /*!< A read-only transaction that needs to be assigned a RBS. */ { ut_a(trx->rseg == 0); ut_a(trx->read_only); ut_a(!srv_read_only_mode); ut_a(!trx_is_autocommit_non_locking(trx)); trx->rseg = trx_assign_rseg_low(srv_undo_logs, srv_undo_tablespaces); } /****************************************************************//** Starts a transaction. */ static void trx_start_low( /*==========*/ trx_t* trx) /*!< in: transaction */ { ut_ad(trx->rseg == NULL); ut_ad(trx->start_file != 0); ut_ad(trx->start_line != 0); ut_ad(!trx->is_recovered); ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); /* Check whether it is an AUTOCOMMIT SELECT */ trx->auto_commit = (trx->api_trx && trx->api_auto_commit) || thd_trx_is_auto_commit(trx->mysql_thd); trx->read_only = (trx->api_trx && !trx->read_write) || (!trx->ddl && thd_trx_is_read_only(trx->mysql_thd)) || srv_read_only_mode; if (!trx->auto_commit) { ++trx->will_lock; } else if (trx->will_lock == 0) { trx->read_only = TRUE; } if (!trx->read_only) { trx->rseg = trx_assign_rseg_low( srv_undo_logs, srv_undo_tablespaces); } /* The initial value for trx->no: TRX_ID_MAX is used in read_view_open_now: */ trx->no = TRX_ID_MAX; ut_a(ib_vector_is_empty(trx->autoinc_locks)); ut_a(ib_vector_is_empty(trx->lock.table_locks)); mutex_enter(&trx_sys->mutex); /* If this transaction came from trx_allocate_for_mysql(), trx->in_mysql_trx_list would hold. In that case, the trx->state change must be protected by the trx_sys->mutex, so that lock_print_info_all_transactions() will have a consistent view. 
*/ trx->state = TRX_STATE_ACTIVE; trx->id = trx_sys_get_new_trx_id(); /* Cache the state of fake_changes that transaction will use for lifetime. Any change in session/global fake_changes configuration during lifetime of transaction will not be honored by already started transaction. */ trx->fake_changes = thd_fake_changes(trx->mysql_thd); ut_ad(!trx->in_rw_trx_list); ut_ad(!trx->in_ro_trx_list); if (trx->read_only) { /* Note: The trx_sys_t::ro_trx_list doesn't really need to be ordered, we should exploit this using a list type that doesn't need a list wide lock to increase concurrency. */ if (!trx_is_autocommit_non_locking(trx)) { UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx); ut_d(trx->in_ro_trx_list = TRUE); } } else { ut_ad(trx->rseg != NULL || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); ut_ad(!trx_is_autocommit_non_locking(trx)); UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx); ut_d(trx->in_rw_trx_list = TRUE); #ifdef UNIV_DEBUG if (trx->id > trx_sys->rw_max_trx_id) { trx_sys->rw_max_trx_id = trx->id; } #endif /* UNIV_DEBUG */ trx_reserve_descriptor(trx); } ut_ad(trx_sys_validate_trx_list()); mutex_exit(&trx_sys->mutex); trx->start_time = ut_time(); MONITOR_INC(MONITOR_TRX_ACTIVE); } /****************************************************************//** Set the transaction serialisation number. */ static void trx_serialisation_number_get( /*=========================*/ trx_t* trx) /*!< in: transaction */ { trx_rseg_t* rseg; rseg = trx->rseg; ut_ad(mutex_own(&rseg->mutex)); mutex_enter(&trx_sys->mutex); trx->no = trx_sys_get_new_trx_id(); if (UNIV_LIKELY(!trx->in_trx_serial_list)) { UT_LIST_ADD_LAST(trx_serial_list, trx_sys->trx_serial_list, trx); trx->in_trx_serial_list = true; } /* If the rollack segment is not empty then the new trx_t::no can't be less than any trx_t::no already in the rollback segment. User threads only produce events when a rollback segment is empty. 
*/ if (rseg->last_page_no == FIL_NULL) { void* ptr; rseg_queue_t rseg_queue; rseg_queue.rseg = rseg; rseg_queue.trx_no = trx->no; mutex_enter(&purge_sys->bh_mutex); /* This is to reduce the pressure on the trx_sys_t::mutex though in reality it should make very little (read no) difference because this code path is only taken when the rbs is empty. */ mutex_exit(&trx_sys->mutex); ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue); ut_a(ptr); mutex_exit(&purge_sys->bh_mutex); } else { mutex_exit(&trx_sys->mutex); } } /****************************************************************//** Assign the transaction its history serialisation number and write the update UNDO log record to the assigned rollback segment. */ static MY_ATTRIBUTE((nonnull)) void trx_write_serialisation_history( /*============================*/ trx_t* trx, /*!< in/out: transaction */ mtr_t* mtr) /*!< in/out: mini-transaction */ { trx_rseg_t* rseg; rseg = trx->rseg; /* Change the undo log segment states from TRX_UNDO_ACTIVE to some other state: these modifications to the file data structure define the transaction as committed in the file based domain, at the serialization point of the log sequence number lsn obtained below. */ if (trx->update_undo != NULL) { page_t* undo_hdr_page; trx_undo_t* undo = trx->update_undo; /* We have to hold the rseg mutex because update log headers have to be put to the history list in the (serialisation) order of the UNDO trx number. This is required for the purge in-memory data structures too. */ mutex_enter(&rseg->mutex); /* Assign the transaction serialisation number and also update the purge min binary heap if this is the first UNDO log being written to the assigned rollback segment. */ trx_serialisation_number_get(trx); /* It is not necessary to obtain trx->undo_mutex here because only a single OS thread is allowed to do the transaction commit for this transaction. 
*/ undo_hdr_page = trx_undo_set_state_at_finish(undo, mtr); trx_undo_update_cleanup(trx, undo_hdr_page, mtr); } else { mutex_enter(&rseg->mutex); } if (trx->insert_undo != NULL) { trx_undo_set_state_at_finish(trx->insert_undo, mtr); } mutex_exit(&rseg->mutex); MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); /* Update the latest MySQL binlog name and offset info in trx sys header if MySQL binlogging is on or the database server is a MySQL replication slave */ if (trx->mysql_log_file_name && trx->mysql_log_file_name[0] != '\0') { trx_sys_update_mysql_binlog_offset( trx->mysql_log_file_name, trx->mysql_log_offset, TRX_SYS_MYSQL_LOG_INFO, mtr); trx->mysql_log_file_name = NULL; } } /******************************************************************** Finalize a transaction containing updates for a FTS table. */ static MY_ATTRIBUTE((nonnull)) void trx_finalize_for_fts_table( /*=======================*/ fts_trx_table_t* ftt) /* in: FTS trx table */ { fts_t* fts = ftt->table->fts; fts_doc_ids_t* doc_ids = ftt->added_doc_ids; mutex_enter(&fts->bg_threads_mutex); if (fts->fts_status & BG_THREAD_STOP) { /* The table is about to be dropped, no use adding anything to its work queue. */ mutex_exit(&fts->bg_threads_mutex); } else { mem_heap_t* heap; mutex_exit(&fts->bg_threads_mutex); ut_a(fts->add_wq); heap = static_cast(doc_ids->self_heap->arg); ib_wqueue_add(fts->add_wq, doc_ids, heap); /* fts_trx_table_t no longer owns the list. */ ftt->added_doc_ids = NULL; } } /******************************************************************//** Finalize a transaction containing updates to FTS tables. */ static MY_ATTRIBUTE((nonnull)) void trx_finalize_for_fts( /*=================*/ trx_t* trx, /*!< in/out: transaction */ bool is_commit) /*!< in: true if the transaction was committed, false if it was rolled back. 
*/ { if (is_commit) { const ib_rbt_node_t* node; ib_rbt_t* tables; fts_savepoint_t* savepoint; savepoint = static_cast( ib_vector_last(trx->fts_trx->savepoints)); tables = savepoint->tables; for (node = rbt_first(tables); node; node = rbt_next(tables, node)) { fts_trx_table_t** ftt; ftt = rbt_value(fts_trx_table_t*, node); if ((*ftt)->added_doc_ids) { trx_finalize_for_fts_table(*ftt); } } } fts_trx_free(trx->fts_trx); trx->fts_trx = NULL; } /**********************************************************************//** If required, flushes the log to disk based on the value of innodb_flush_log_at_trx_commit. */ static void trx_flush_log_if_needed_low( /*========================*/ lsn_t lsn, /*!< in: lsn up to which logs are to be flushed. */ trx_t* trx) /*!< in: transaction */ { ulint flush_log_at_trx_commit; flush_log_at_trx_commit = srv_use_global_flush_log_at_trx_commit ? thd_flush_log_at_trx_commit(NULL) : thd_flush_log_at_trx_commit(trx->mysql_thd); switch (flush_log_at_trx_commit) { case 0: /* Do nothing */ break; case 1: /* Write the log and optionally flush it to disk */ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, srv_unix_file_flush_method != SRV_UNIX_NOSYNC); break; case 2: /* Write the log but do not flush it to disk */ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); break; default: ut_error; } } /**********************************************************************//** If required, flushes the log to disk based on the value of innodb_flush_log_at_trx_commit. */ static MY_ATTRIBUTE((nonnull)) void trx_flush_log_if_needed( /*====================*/ lsn_t lsn, /*!< in: lsn up to which logs are to be flushed. */ trx_t* trx) /*!< in/out: transaction */ { trx->op_info = "flushing log"; trx_flush_log_if_needed_low(lsn, trx); trx->op_info = ""; } /****************************************************************//** Commits a transaction in memory. 
*/
static MY_ATTRIBUTE((nonnull))
void
trx_commit_in_memory(
/*=================*/
	trx_t*	trx,	/*!< in/out: transaction */
	lsn_t	lsn)	/*!< in: log sequence number of the mini-transaction
			commit of trx_write_serialisation_history(), or 0
			if the transaction did not modify anything */
{
	/* Clear any deferred-flush request; it is set again below only
	when trx->flush_log_later is set and lsn is non-zero. */
	trx->must_flush_log_later = FALSE;

	if (trx_is_autocommit_non_locking(trx)) {
		ut_ad(trx->read_only);
		ut_a(!trx->is_recovered);
		ut_ad(trx->rseg == NULL);
		ut_ad(!trx->in_ro_trx_list);
		ut_ad(!trx->in_rw_trx_list);

		/* Note: We are asserting without holding the lock mutex. But
		that is OK because this transaction is not waiting and cannot
		be rolled back and no new locks can (or should not) be added
		because it is flagged as a non-locking read-only
		transaction. */

		ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);

		/* This state change is not protected by any mutex, therefore
		there is an inherent race here around state transition during
		printouts. We ignore this race for the sake of efficiency.
		However, the trx_sys_t::mutex will protect the trx_t instance
		and it cannot be removed from the mysql_trx_list and freed
		without first acquiring the trx_sys_t::mutex. */

		ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));

		trx->state = TRX_STATE_NOT_STARTED;

		read_view_remove(trx->global_read_view, false);

		MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
	} else {
		lock_trx_release_locks(trx);

		/* Remove the transaction from the list of active
		transactions now that it no longer holds any user locks. */

		ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));

		mutex_enter(&trx_sys->mutex);

		assert_trx_in_list(trx);

		/* The transaction is on exactly one of the two lists,
		selected by trx->read_only. */
		if (trx->read_only) {
			UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx);
			ut_d(trx->in_ro_trx_list = FALSE);
			MONITOR_INC(MONITOR_TRX_RO_COMMIT);
		} else {
			UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
			ut_d(trx->in_rw_trx_list = FALSE);
			ut_ad(trx_sys->descr_n_used
			      <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));
			MONITOR_INC(MONITOR_TRX_RW_COMMIT);
		}

		/* If this transaction came from trx_allocate_for_mysql(),
		trx->in_mysql_trx_list would hold. In that case, the
		trx->state change must be protected by trx_sys->mutex, so that
		lock_print_info_all_transactions() will have a consistent
		view. */

		trx->state = TRX_STATE_NOT_STARTED;

		/* We already own the trx_sys_t::mutex, by doing it here we
		avoid a potential context switch later. */
		read_view_remove(trx->global_read_view, true);

		ut_ad(trx_sys_validate_trx_list());

		mutex_exit(&trx_sys->mutex);
	}

	/* Both view pointers are cleared unconditionally; the NULL test
	on global_read_view does not change the outcome. */
	if (trx->global_read_view != NULL) {

		trx->global_read_view = NULL;
	}

	trx->read_view = NULL;

	/* A non-zero lsn means a mini-transaction was committed for this
	transaction, so there may be log to write and undo to clean up. */
	if (lsn) {
		ulint	flush_log_at_trx_commit;

		if (trx->insert_undo != NULL) {

			trx_undo_insert_cleanup(trx);
		}

		/* This local copy of the setting is used only for the
		"== 0" test below; trx_flush_log_if_needed() performs its
		own lookup. */
		if (srv_use_global_flush_log_at_trx_commit) {
			flush_log_at_trx_commit
				= thd_flush_log_at_trx_commit(NULL);
		} else {
			flush_log_at_trx_commit
				= thd_flush_log_at_trx_commit(trx->mysql_thd);
		}

		/* NOTE that we could possibly make a group commit more
		efficient here: call os_thread_yield here to allow also other
		trxs to come to commit! */

		/*-------------------------------------*/

		/* Depending on the my.cnf options, we may now write the log
		buffer to the log files, making the transaction durable if
		the OS does not crash. We may also flush the log files to
		disk, making the transaction durable also at an OS crash or a
		power outage.

		The idea in InnoDB's group commit is that a group of
		transactions gather behind a trx doing a physical disk write
		to log files, and when that physical write has been completed,
		one of those transactions does a write which commits the whole
		group. Note that this group commit will only bring benefit if
		there are > 2 users in the database. Then at least 2 users can
		gather behind one doing the physical log write to disk.

		If we are calling trx_commit() under prepare_commit_mutex, we
		will delay possible log write and flush to a separate function
		trx_commit_complete_for_mysql(), which is only called when the
		thread has released the mutex. This is to make the
		group commit algorithm work. Otherwise, the prepare_commit
		mutex would serialize all commits and prevent a group of
		transactions from gathering. */

		if (trx->flush_log_later) {
			/* Do nothing yet */
			trx->must_flush_log_later = TRUE;
		} else if (flush_log_at_trx_commit == 0
			   || thd_requested_durability(trx->mysql_thd)
			   == HA_IGNORE_DURABILITY) {
			/* Do nothing */
		} else {
			trx_flush_log_if_needed(lsn, trx);
		}

		/* Remembered so that trx_commit_complete_for_mysql() can
		flush up to this lsn later. */
		trx->commit_lsn = lsn;

		/* Tell server some activity has happened, since the trx
		changed something. Background utility threads like the
		master thread, purge thread or page_cleaner thread might
		have some work to do. */
		srv_active_wake_master_thread();
	}

	/* undo_no is non-zero if we're doing the final commit.
	It must be sampled here, before undo_no is reset below. */
	bool			not_rollback = trx->undo_no != 0;
	/* Free all savepoints, starting from the first. */
	trx_named_savept_t*	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
	trx_roll_savepoints_free(trx, savep);

	/* Reset the per-transaction fields so the trx_t object can be
	reused for a new transaction. */
	trx->rseg = NULL;
	trx->undo_no = 0;
	trx->last_sql_stat_start.least_undo_no = 0;

	trx->ddl = false;
#ifdef UNIV_DEBUG
	ut_ad(trx->start_file != 0);
	ut_ad(trx->start_line != 0);
	trx->start_file = 0;
	trx->start_line = 0;
#endif /* UNIV_DEBUG */

	trx->will_lock = 0;
	trx->read_only = FALSE;
	trx->auto_commit = FALSE;

	if (trx->fts_trx) {
		trx_finalize_for_fts(trx, not_rollback);
	}

	ut_ad(trx->lock.wait_thr == NULL);
	ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
	ut_ad(!trx->in_ro_trx_list);
	ut_ad(!trx->in_rw_trx_list);

	trx->dict_operation = TRX_DICT_OP_NONE;

	trx->error_state = DB_SUCCESS;

	/* trx->in_mysql_trx_list would hold between
	trx_allocate_for_mysql() and trx_free_for_mysql(). It does not
	hold for recovered transactions or system transactions. */
}

/****************************************************************//**
Commits a transaction and a mini-transaction.
If the transaction has FTS changes and a non-zero undo_no, fts_commit()
is run first. When mtr is non-NULL the serialisation history is written
and the mini-transaction is committed; its end lsn (or 0 when mtr is NULL)
is then passed to trx_commit_in_memory(). */
UNIV_INTERN
void
trx_commit_low(
/*===========*/
	trx_t*	trx,	/*!< in/out: transaction */
	mtr_t*	mtr)	/*!< in/out: mini-transaction (will be committed),
			or NULL if trx made no modifications */
{
	lsn_t	lsn;

	assert_trx_nonlocking_or_in_list(trx);
	ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
	ut_ad(!mtr || mtr->state == MTR_ACTIVE);
	/* A mini-transaction is supplied exactly when the transaction
	has an insert or update undo log. */
	ut_ad(!mtr == !(trx->insert_undo || trx->update_undo));

	/* undo_no is non-zero if we're doing the final commit. */
	if (trx->fts_trx && trx->undo_no != 0) {
		dberr_t	error;

		ut_a(!trx_is_autocommit_non_locking(trx));

		error = fts_commit(trx);

		/* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY
		instead of dying. This is a possible scenario if there
		is a crash between insert to DELETED table committing
		and transaction committing. The fix would be able to
		return error from this function */
		if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) {
			/* FTS-FIXME: once we can return values from this
			function, we should do so and signal an error
			instead of just dying. */

			ut_error;
		}
	}

	if (mtr) {
		trx_write_serialisation_history(trx, mtr);

		/* The following call commits the mini-transaction, making the
		whole transaction committed in the file-based world, at this
		log sequence number. The transaction becomes 'durable' when
		we write the log to disk, but in the logical sense the commit
		in the file-based data structures (undo logs etc.) happens
		here.

		NOTE that transaction numbers, which are assigned only to
		transactions with an update undo log, do not necessarily come
		in exactly the same order as commit lsn's, if the transactions
		have different rollback segments. To get exactly the same
		order we should hold the kernel mutex up to this point,
		adding to the contention of the kernel mutex. However, if
		a transaction T2 is able to see modifications made by
		a transaction T1, T2 will always get a bigger transaction
		number and a bigger commit lsn than T1. */

		/*--------------*/
		mtr_commit(mtr);
		/*--------------*/
		lsn = mtr->end_lsn;
	} else {
		lsn = 0;
	}

	trx_commit_in_memory(trx, lsn);
}

/****************************************************************//**
Commits a transaction.
A local mini-transaction is started only when the transaction has an
insert or update undo log; otherwise trx_commit_low() is called with a
NULL mini-transaction. */
UNIV_INTERN
void
trx_commit(
/*=======*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	mtr_t	local_mtr;
	mtr_t*	mtr;

	if (trx->insert_undo || trx->update_undo) {
		mtr = &local_mtr;
		mtr_start(mtr);
	} else {
		mtr = NULL;
	}

	trx_commit_low(trx, mtr);
}

/****************************************************************//**
Cleans up a transaction at database startup. The cleanup is needed if
the transaction already got to the middle of a commit when the database
crashed, and we cannot roll it back.
*/
UNIV_INTERN
void
trx_cleanup_at_db_startup(
/*======================*/
	trx_t*	trx)	/*!< in: transaction */
{
	ut_ad(trx->is_recovered);

	/* Discard the insert undo log, if the transaction has one. */
	if (trx->insert_undo) {
		trx_undo_insert_cleanup(trx);
	}

	/* Reset the undo bookkeeping of the transaction. */
	trx->last_sql_stat_start.least_undo_no = 0;
	trx->undo_no = 0;
	trx->rseg = NULL;

	/* Everything from here to mutex_exit() happens while holding
	trx_sys->mutex: removal from the read-write list, the state reset
	and the release of the descriptor. */
	mutex_enter(&trx_sys->mutex);

	ut_a(!trx->read_only);

	UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);

	ut_ad(trx_sys->descr_n_used <= UT_LIST_GET_LEN(trx_sys->rw_trx_list));

	assert_trx_in_rw_list(trx);
	ut_d(trx->in_rw_trx_list = FALSE);

	trx->state = TRX_STATE_NOT_STARTED;

	trx_release_descriptor(trx);

	mutex_exit(&trx_sys->mutex);

	/* The transaction is no longer on the rw list. Recovered
	transactions are never placed in the mysql_trx_list. */
	ut_ad(trx->is_recovered);
	ut_ad(!trx->in_ro_trx_list);
	ut_ad(!trx->in_rw_trx_list);
	ut_ad(!trx->in_mysql_trx_list);
}

/********************************************************************//**
Assigns a read view for a consistent read query. The view is opened the
first time this function is called for a newly started transaction; every
later call within the same transaction returns that same view.
@return consistent read view */
UNIV_INTERN
read_view_t*
trx_assign_read_view(
/*=================*/
	trx_t*	trx)	/*!< in: active transaction */
{
	ut_ad(trx->state == TRX_STATE_ACTIVE);

	if (trx->read_view == NULL) {
		/* First consistent read: open the view and record it
		as the global read view of the transaction as well. */
		trx->read_view = read_view_open_now(trx->id,
						    trx->prebuilt_view);

		trx->global_read_view = trx->read_view;
	}

	return(trx->read_view);
}

/********************************************************************//**
Clones the read view from another transaction.
All consistent reads within the receiver transaction will get the same read view as the donor transaction @return read view clone */ UNIV_INTERN read_view_t* trx_clone_read_view( /*================*/ trx_t* trx, /*!< in: receiver transaction */ trx_t* from_trx) /*!< in: donor transaction */ { ut_ad(lock_mutex_own()); ut_ad(mutex_own(&trx_sys->mutex)); ut_ad(trx_mutex_own(from_trx)); ut_ad(trx->read_view == NULL); if (from_trx->state != TRX_STATE_ACTIVE || from_trx->read_view == NULL) { return(NULL); } trx->read_view = read_view_clone(from_trx->read_view, trx->prebuilt_view); read_view_add(trx->read_view); trx->global_read_view = trx->read_view; return(trx->read_view); } /****************************************************************//** Prepares a transaction for commit/rollback. */ UNIV_INTERN void trx_commit_or_rollback_prepare( /*===========================*/ trx_t* trx) /*!< in/out: transaction */ { /* We are reading trx->state without holding trx_sys->mutex here, because the commit or rollback should be invoked for a running (or recovered prepared) transaction that is associated with the current thread. */ switch (trx->state) { case TRX_STATE_NOT_STARTED: trx_start_low(trx); /* fall through */ case TRX_STATE_ACTIVE: case TRX_STATE_PREPARED: /* If the trx is in a lock wait state, moves the waiting query thread to the suspended state */ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { ulint sec; ulint ms; ib_uint64_t now; ut_a(trx->lock.wait_thr != NULL); trx->lock.wait_thr->state = QUE_THR_SUSPENDED; trx->lock.wait_thr = NULL; if (UNIV_UNLIKELY(trx->take_stats)) { ut_usectime(&sec, &ms); now = (ib_uint64_t)sec * 1000000 + ms; trx->lock_que_wait_timer += (ulint) (now - trx->lock_que_wait_ustarted); } trx->lock.que_state = TRX_QUE_RUNNING; } ut_a(trx->lock.n_active_thrs == 1); return; case TRX_STATE_COMMITTED_IN_MEMORY: break; } ut_error; } /*********************************************************************//** Creates a commit command node struct. 
@return own: commit node struct */
UNIV_INTERN
commit_node_t*
trx_commit_node_create(
/*===================*/
	mem_heap_t*	heap)	/*!< in: mem heap where created */
{
	commit_node_t*	node;

	/* static_cast requires an explicit target type; the destination
	is the commit_node_t* declared above. */
	node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
	node->common.type = QUE_NODE_COMMIT;
	node->state = COMMIT_NODE_SEND;

	return(node);
}

/***********************************************************//**
Performs an execution step for a commit type node in a query graph.
In the COMMIT_NODE_SEND state the transaction is committed and NULL is
returned; in the COMMIT_NODE_WAIT state the node is reset to
COMMIT_NODE_SEND and control moves to the parent node.
@return query thread to run next, or NULL */
UNIV_INTERN
que_thr_t*
trx_commit_step(
/*============*/
	que_thr_t*	thr)	/*!< in: query thread */
{
	commit_node_t*	node;

	/* static_cast requires an explicit target type; the destination
	is the commit_node_t* declared above. */
	node = static_cast<commit_node_t*>(thr->run_node);

	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);

	/* Entered from the parent node: start a fresh commit. */
	if (thr->prev_node == que_node_get_parent(node)) {
		node->state = COMMIT_NODE_SEND;
	}

	if (node->state == COMMIT_NODE_SEND) {
		trx_t*	trx;

		node->state = COMMIT_NODE_WAIT;

		trx = thr_get_trx(thr);

		ut_a(trx->lock.wait_thr == NULL);
		ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);

		trx_commit_or_rollback_prepare(trx);

		/* que_state is TRX_QUE_COMMITTING only for the duration
		of the trx_commit() call. */
		trx->lock.que_state = TRX_QUE_COMMITTING;

		trx_commit(trx);

		ut_ad(trx->lock.wait_thr == NULL);

		trx->lock.que_state = TRX_QUE_RUNNING;

		thr = NULL;
	} else {
		ut_ad(node->state == COMMIT_NODE_WAIT);

		node->state = COMMIT_NODE_SEND;

		thr->run_node = que_node_get_parent(node);
	}

	return(thr);
}

/**********************************************************************//**
Does the transaction commit for MySQL. A transaction that has not been
started is started first. Calling this for a transaction that is already
committed in memory is a fatal error.
@return DB_SUCCESS or error number */
UNIV_INTERN
dberr_t
trx_commit_for_mysql(
/*=================*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	/* Because we do not do the commit by sending an Innobase
	sig to the transaction, we must here make sure that trx has been
	started. */

	ut_a(trx);

	switch (trx->state) {
	case TRX_STATE_NOT_STARTED:
		/* Update the info whether we should skip XA steps that eat
		CPU time.

		For the duration of the transaction trx->support_xa is
		not reread from thd so any changes in the value take
		effect in the next transaction. This is to avoid a
		scenario where some undo log records generated by a
		transaction contain XA information and other undo log
		records, generated by the same transaction do not. */
		trx->support_xa = thd_supports_xa(trx->mysql_thd);

		/* Debug builds record where the transaction was started. */
		ut_d(trx->start_file = __FILE__);
		ut_d(trx->start_line = __LINE__);

		trx_start_low(trx);
		/* fall through */
	case TRX_STATE_ACTIVE:
	case TRX_STATE_PREPARED:
		trx->op_info = "committing";

		trx_commit(trx);

		MONITOR_DEC(MONITOR_TRX_ACTIVE);
		trx->op_info = "";
		return(DB_SUCCESS);
	case TRX_STATE_COMMITTED_IN_MEMORY:
		break;
	}
	ut_error;
	return(DB_CORRUPTION);
}

/**********************************************************************//**
If required, flushes the log to disk if we called trx_commit_for_mysql()
with trx->flush_log_later == TRUE. Nothing is done when no flush is
pending (trx->must_flush_log_later is not set) or when the session
requested HA_IGNORE_DURABILITY; in the latter case the pending flag is
left as it is. */
UNIV_INTERN
void
trx_commit_complete_for_mysql(
/*==========================*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	ut_a(trx);

	if (!trx->must_flush_log_later
	    || thd_requested_durability(trx->mysql_thd)
	    == HA_IGNORE_DURABILITY) {
		return;
	}

	/* Flush up to the lsn recorded at commit time. */
	trx_flush_log_if_needed(trx->commit_lsn, trx);

	trx->must_flush_log_later = FALSE;
}

/**********************************************************************//**
Marks the latest SQL statement ended: records the current undo_no as the
start of the next statement and refreshes the FTS statement savepoint if
the transaction has FTS state. Nothing is done for prepared or
committed-in-memory transactions. */
UNIV_INTERN
void
trx_mark_sql_stat_end(
/*==================*/
	trx_t*	trx)	/*!< in: trx handle */
{
	ut_a(trx);

	switch (trx->state) {
	case TRX_STATE_PREPARED:
	case TRX_STATE_COMMITTED_IN_MEMORY:
		break;
	case TRX_STATE_NOT_STARTED:
		trx->undo_no = 0;
		/* fall through */
	case TRX_STATE_ACTIVE:
		trx->last_sql_stat_start.least_undo_no = trx->undo_no;

		if (trx->fts_trx) {
			fts_savepoint_laststmt_refresh(trx);
		}

		return;
	}

	/* Reached for the PREPARED and COMMITTED_IN_MEMORY states (via
	break) and for any unknown state value. */
	ut_error;
}

/**********************************************************************//**
Prints info about a transaction.
Caller must hold trx_sys->mutex.
*/
UNIV_INTERN
void
trx_print_low(
/*==========*/
	FILE*		f,
			/*!< in: output stream */
	const trx_t*	trx,
			/*!< in: transaction */
	ulint		max_query_len,
			/*!< in: max query length to print,
			or 0 to use the default max length */
	ulint		n_rec_locks,
			/*!< in: lock_number_of_rows_locked(&trx->lock) */
	ulint		n_trx_locks,
			/*!< in: length of trx->lock.trx_locks */
	ulint		heap_size)
			/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
{
	ibool		newline;
	const char*	op_info;

	ut_ad(mutex_own(&trx_sys->mutex));

	fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);

	/* trx->state cannot change from or to NOT_STARTED
	while we are holding the trx_sys->mutex. It may change
	from ACTIVE to PREPARED or COMMITTED. */
	switch (trx->state) {
	case TRX_STATE_NOT_STARTED:
		fputs(", not started", f);
		goto state_ok;
	case TRX_STATE_ACTIVE:
		fprintf(f, ", ACTIVE %lu sec",
			(ulong) difftime(time(NULL), trx->start_time));
		goto state_ok;
	case TRX_STATE_PREPARED:
		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
			(ulong) difftime(time(NULL), trx->start_time));
		goto state_ok;
	case TRX_STATE_COMMITTED_IN_MEMORY:
		fputs(", COMMITTED IN MEMORY", f);
		goto state_ok;
	}
	/* Only reached for a state value not listed above: print the raw
	value and fail in debug builds. */
	fprintf(f, ", state %lu", (ulong) trx->state);
	ut_ad(0);
state_ok:

	/* prevent a race condition: read the pointer once so that the
	test and the print below use the same string */
	op_info = trx->op_info;

	if (*op_info) {
		putc(' ', f);
		fputs(op_info, f);
	}

	if (trx->is_recovered) {
		fputs(" recovered trx", f);
	}

	if (trx->declared_to_be_inside_innodb) {
		fprintf(f, ", thread declared inside InnoDB %lu",
			(ulong) trx->n_tickets_to_enter_innodb);
	}

	putc('\n', f);

	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
		fprintf(f, "mysql tables in use %lu, locked %lu\n",
			(ulong) trx->n_mysql_tables_in_use,
			(ulong) trx->mysql_n_tables_locked);
	}

	/* newline tracks whether the second output line has any content
	and therefore needs to be terminated. */
	newline = TRUE;

	/* trx->lock.que_state of an ACTIVE transaction may change
	while we are not holding trx->mutex. We perform a dirty read
	for performance reasons. */

	switch (trx->lock.que_state) {
	case TRX_QUE_RUNNING:
		newline = FALSE; break;
	case TRX_QUE_LOCK_WAIT:
		fputs("LOCK WAIT ", f); break;
	case TRX_QUE_ROLLING_BACK:
		fputs("ROLLING BACK ", f); break;
	case TRX_QUE_COMMITTING:
		fputs("COMMITTING ", f); break;
	default:
		fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
	}

	/* NOTE(review): the 400-byte threshold is presumably the size of
	an otherwise empty lock heap - confirm before changing it. */
	if (n_trx_locks > 0 || heap_size > 400) {
		newline = TRUE;

		fprintf(f, "%lu lock struct(s), heap size %lu,"
			" %lu row lock(s)",
			(ulong) n_trx_locks,
			(ulong) heap_size,
			(ulong) n_rec_locks);
	}

	if (trx->has_search_latch) {
		newline = TRUE;
		fputs(", holds adaptive hash latch", f);
	}

	if (trx->undo_no != 0) {
		newline = TRUE;
		fprintf(f, ", undo log entries " TRX_ID_FMT,
			trx->undo_no);
	}

	if (newline) {
		putc('\n', f);
	}

	if (trx->mysql_thd != NULL) {
		/* static_cast requires an explicit target type.
		NOTE(review): uint is the type innobase_mysql_print_thd()
		is expected to take for its max query length - confirm
		against its declaration. */
		innobase_mysql_print_thd(
			f, trx->mysql_thd, static_cast<uint>(max_query_len));
	}
}

/**********************************************************************//**
Prints info about a transaction.
The caller must hold lock_sys->mutex and trx_sys->mutex.
When possible, use trx_print() instead. The lock statistics passed to
trx_print_low() are computed here from trx->lock. */
UNIV_INTERN
void
trx_print_latched(
/*==============*/
	FILE*		f,		/*!< in: output stream */
	const trx_t*	trx,		/*!< in: transaction */
	ulint		max_query_len)	/*!< in: max query length to print,
					or 0 to use the default max length */
{
	ut_ad(lock_mutex_own());
	ut_ad(mutex_own(&trx_sys->mutex));

	trx_print_low(f, trx, max_query_len,
		      lock_number_of_rows_locked(&trx->lock),
		      UT_LIST_GET_LEN(trx->lock.trx_locks),
		      mem_heap_get_size(trx->lock.lock_heap));
}

/**********************************************************************//**
Prints info about a transaction.
Acquires and releases lock_sys->mutex and trx_sys->mutex.
*/ UNIV_INTERN void trx_print( /*======*/ FILE* f, /*!< in: output stream */ const trx_t* trx, /*!< in: transaction */ ulint max_query_len) /*!< in: max query length to print, or 0 to use the default max length */ { ulint n_rec_locks; ulint n_trx_locks; ulint heap_size; lock_mutex_enter(); n_rec_locks = lock_number_of_rows_locked(&trx->lock); n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); heap_size = mem_heap_get_size(trx->lock.lock_heap); lock_mutex_exit(); mutex_enter(&trx_sys->mutex); trx_print_low(f, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size); mutex_exit(&trx_sys->mutex); } #ifdef UNIV_DEBUG /**********************************************************************//** Asserts that a transaction has been started. The caller must hold trx_sys->mutex. @return TRUE if started */ UNIV_INTERN ibool trx_assert_started( /*===============*/ const trx_t* trx) /*!< in: transaction */ { ut_ad(mutex_own(&trx_sys->mutex)); /* Non-locking autocommits should not hold any locks and this function is only called from the locking code. */ assert_trx_in_list(trx); /* trx->state can change from or to NOT_STARTED while we are holding trx_sys->mutex for non-locking autocommit selects but not for other types of transactions. It may change from ACTIVE to PREPARED. Unless we are holding lock_sys->mutex, it may also change to COMMITTED. */ switch (trx->state) { case TRX_STATE_PREPARED: return(TRUE); case TRX_STATE_ACTIVE: case TRX_STATE_COMMITTED_IN_MEMORY: return(TRUE); case TRX_STATE_NOT_STARTED: break; } ut_error; return(FALSE); } #endif /* UNIV_DEBUG */ /*******************************************************************//** Compares the "weight" (or size) of two transactions. Transactions that have edited non-transactional tables are considered heavier than ones that have not. 
@return TRUE if weight(a) >= weight(b) */
UNIV_INTERN
ibool
trx_weight_ge(
/*==========*/
	const trx_t*	a,	/*!< in: the first transaction to be compared */
	const trx_t*	b)	/*!< in: the second transaction to be compared */
{
	ibool	a_notrans_edit;
	ibool	b_notrans_edit;

	/* If mysql_thd is NULL for a transaction we assume that it has
	not edited non-transactional tables. */

	a_notrans_edit = a->mysql_thd != NULL
		&& thd_has_edited_nontrans_tables(a->mysql_thd);

	b_notrans_edit = b->mysql_thd != NULL
		&& thd_has_edited_nontrans_tables(b->mysql_thd);

	/* When only one of the two has edited non-transactional tables,
	that one is the heavier: the result is TRUE exactly when it is a. */
	if (a_notrans_edit != b_notrans_edit) {

		return(a_notrans_edit);
	}

	/* Either both had edited non-transactional tables or both had
	not, we fall back to comparing the number of altered/locked
	rows. */

	/* Disabled debug trace of the two weights. */
#if 0
	fprintf(stderr,
		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
		__func__,
		a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks),
		b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks));
#endif

	return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
}

/****************************************************************//**
Prepares a transaction: marks its undo logs (if any) as prepared inside a
mini-transaction, switches the in-memory state from TRX_STATE_ACTIVE to
TRX_STATE_PREPARED, and finally writes/flushes the log if any undo log
was touched. */
static
void
trx_prepare(
/*========*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	trx_rseg_t*	rseg;
	lsn_t		lsn;
	mtr_t		mtr;

	rseg = trx->rseg;
	/* Only fresh user transactions can be prepared.
	Recovered transactions cannot. */
	ut_a(!trx->is_recovered);

	if (trx->insert_undo != NULL || trx->update_undo != NULL) {

		mtr_start(&mtr);

		/* Change the undo log segment states from TRX_UNDO_ACTIVE
		to TRX_UNDO_PREPARED: these modifications to the file data
		structure define the transaction as prepared in the
		file-based world, at the serialization point of lsn. */

		mutex_enter(&rseg->mutex);

		if (trx->insert_undo != NULL) {

			/* It is not necessary to obtain trx->undo_mutex here
			because only a single OS thread is allowed to do the
			transaction prepare for this transaction. */

			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
						      &mtr);
		}

		if (trx->update_undo) {
			trx_undo_set_state_at_prepare(
				trx, trx->update_undo, &mtr);
		}

		mutex_exit(&rseg->mutex);

		/*--------------*/
		mtr_commit(&mtr);	/* This mtr commit makes the
					transaction prepared in the file-based
					world */
		/*--------------*/
		lsn = mtr.end_lsn;
		ut_ad(lsn);
	} else {
		/* No undo logs: nothing was written, so there is no
		log to flush below. */
		lsn = 0;
	}

	/*--------------------------------------*/
	ut_a(trx->state == TRX_STATE_ACTIVE);
	/* The state change and the prepared-transaction counter are
	updated together under trx_sys->mutex. */
	mutex_enter(&trx_sys->mutex);
	trx->state = TRX_STATE_PREPARED;
	trx_sys->n_prepared_trx++;
	mutex_exit(&trx_sys->mutex);
	/*--------------------------------------*/

	if (lsn) {
		/* Depending on the my.cnf options, we may now write the log
		buffer to the log files, making the prepared state of the
		transaction durable if the OS does not crash. We may also
		flush the log files to disk, making the prepared state of the
		transaction durable also at an OS crash or a power outage.

		The idea in InnoDB's group prepare is that a group of
		transactions gather behind a trx doing a physical disk write
		to log files, and when that physical write has been completed,
		one of those transactions does a write which prepares the whole
		group. Note that this group prepare will only bring benefit if
		there are > 2 users in the database. Then at least 2 users can
		gather behind one doing the physical log write to disk.

		TODO: find out if MySQL holds some mutex when calling this.
		That would spoil our group prepare algorithm. */

		trx_flush_log_if_needed(lsn, trx);
	}
}

/**********************************************************************//**
Does the transaction prepare for MySQL. The transaction is started first
if necessary, and trx->op_info is set to "preparing" for the duration of
the prepare. */
UNIV_INTERN
void
trx_prepare_for_mysql(
/*==================*/
	trx_t*	trx)	/*!< in/out: trx handle */
{
	trx_start_if_not_started_xa(trx);

	trx->op_info = "preparing";

	trx_prepare(trx);

	trx->op_info = "";
}

/**********************************************************************//**
This function is used to find number of prepared transactions and
their transaction objects for a recovery.
@return number of prepared transactions stored in xid_list */ UNIV_INTERN int trx_recover_for_mysql( /*==================*/ XID* xid_list, /*!< in/out: prepared transactions */ ulint len) /*!< in: number of slots in xid_list */ { const trx_t* trx; ulint count = 0; ut_ad(xid_list); ut_ad(len); /* We should set those transactions which are in the prepared state to the xid_list */ mutex_enter(&trx_sys->mutex); for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list); trx != NULL; trx = UT_LIST_GET_NEXT(trx_list, trx)) { assert_trx_in_rw_list(trx); /* The state of a read-write transaction cannot change from or to NOT_STARTED while we are holding the trx_sys->mutex. It may change to PREPARED, but not if trx->is_recovered. It may also change to COMMITTED. */ if (trx_state_eq(trx, TRX_STATE_PREPARED)) { xid_list[count] = trx->xid; if (count == 0) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Starting recovery for" " XA transactions...\n"); } ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Transaction " TRX_ID_FMT " in" " prepared state after recovery\n", trx->id); ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Transaction contains changes" " to " TRX_ID_FMT " rows\n", trx->undo_no); count++; if (count == len) { break; } } } mutex_exit(&trx_sys->mutex); if (count > 0){ ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: %d transactions in prepared state" " after recovery\n", int (count)); } return(int (count)); } /*******************************************************************//** This function is used to find one X/Open XA distributed transaction which is in the prepared state @return trx on match, the trx->xid will be invalidated; note that the trx may have been committed, unless the caller is holding lock_sys->mutex */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) trx_t* trx_get_trx_by_xid_low( /*===================*/ const XID* xid) /*!< in: X/Open XA transaction identifier */ { trx_t* trx; ut_ad(mutex_own(&trx_sys->mutex)); for (trx = 
UT_LIST_GET_FIRST(trx_sys->rw_trx_list); trx != NULL; trx = UT_LIST_GET_NEXT(trx_list, trx)) { assert_trx_in_rw_list(trx); /* Compare two X/Open XA transaction id's: their length should be the same and binary comparison of gtrid_length+bqual_length bytes should be the same */ if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_PREPARED) && xid->gtrid_length == trx->xid.gtrid_length && xid->bqual_length == trx->xid.bqual_length && memcmp(xid->data, trx->xid.data, xid->gtrid_length + xid->bqual_length) == 0) { /* Invalidate the XID, so that subsequent calls will not find it. */ memset(&trx->xid, 0, sizeof(trx->xid)); trx->xid.formatID = -1; break; } } return(trx); } /*******************************************************************//** This function is used to find one X/Open XA distributed transaction which is in the prepared state @return trx or NULL; on match, the trx->xid will be invalidated; note that the trx may have been committed, unless the caller is holding lock_sys->mutex */ UNIV_INTERN trx_t* trx_get_trx_by_xid( /*===============*/ const XID* xid) /*!< in: X/Open XA transaction identifier */ { trx_t* trx; if (xid == NULL) { return(NULL); } mutex_enter(&trx_sys->mutex); /* Recovered/Resurrected transactions are always only on the trx_sys_t::rw_trx_list. */ trx = trx_get_trx_by_xid_low(xid); mutex_exit(&trx_sys->mutex); return(trx); } /*************************************************************//** Starts the transaction if it is not yet started. */ UNIV_INTERN void trx_start_if_not_started_xa_low( /*============================*/ trx_t* trx) /*!< in: transaction */ { switch (trx->state) { case TRX_STATE_NOT_STARTED: /* Update the info whether we should skip XA steps that eat CPU time. For the duration of the transaction trx->support_xa is not reread from thd so any changes in the value take effect in the next transaction. 
This is to avoid a scenario where some undo generated by a transaction, has XA stuff, and other undo, generated by the same transaction, doesn't. */ trx->support_xa = thd_supports_xa(trx->mysql_thd); trx_start_low(trx); /* fall through */ case TRX_STATE_ACTIVE: return; case TRX_STATE_PREPARED: case TRX_STATE_COMMITTED_IN_MEMORY: break; } ut_error; } /*************************************************************//** Starts the transaction if it is not yet started. */ UNIV_INTERN void trx_start_if_not_started_low( /*=========================*/ trx_t* trx) /*!< in: transaction */ { switch (trx->state) { case TRX_STATE_NOT_STARTED: trx_start_low(trx); /* fall through */ case TRX_STATE_ACTIVE: return; case TRX_STATE_PREPARED: case TRX_STATE_COMMITTED_IN_MEMORY: break; } ut_error; } /*************************************************************//** Starts the transaction for a DDL operation. */ UNIV_INTERN void trx_start_for_ddl_low( /*==================*/ trx_t* trx, /*!< in/out: transaction */ trx_dict_op_t op) /*!< in: dictionary operation type */ { switch (trx->state) { case TRX_STATE_NOT_STARTED: /* Flag this transaction as a dictionary operation, so that the data dictionary will be locked in crash recovery. */ trx_set_dict_operation(trx, op); /* Ensure it is not flagged as an auto-commit-non-locking transation. */ trx->will_lock = 1; trx->ddl = true; trx_start_low(trx); return; case TRX_STATE_ACTIVE: /* We have this start if not started idiom, therefore we can't add stronger checks here. */ trx->ddl = true; ut_ad(trx->dict_operation != TRX_DICT_OP_NONE); ut_ad(trx->will_lock > 0); return; case TRX_STATE_PREPARED: case TRX_STATE_COMMITTED_IN_MEMORY: break; } ut_error; }