MWL#116: Efficient group commit

Tweak the commit_ordered() semantics. Now it is only called for transactions
that go through 2-phase commit. This avoids forcing engines to make commits
visible before they are durable.

Also take the LOCK_commit_ordered mutex around START TRANSACTION WITH
CONSISTENT SNAPSHOT, to get a truly consistent snapshot.
This commit is contained in:
unknown 2010-10-04 20:40:31 +02:00
parent 498f10a2be
commit 8bc445360e
4 changed files with 117 additions and 98 deletions

View file

@ -1251,32 +1251,7 @@ int ha_commit_one_phase(THD *thd, bool all)
enclosing 'all' transaction is rolled back.
*/
bool is_real_trans=all || thd->transaction.all.ha_list == 0;
Ha_trx_info *ha_info= trans->ha_list;
DBUG_ENTER("ha_commit_one_phase");
#ifdef USING_TRANSACTIONS
if (ha_info)
{
if (is_real_trans)
{
bool locked= false;
for (; ha_info; ha_info= ha_info->next())
{
handlerton *ht= ha_info->ht();
if (ht->commit_ordered)
{
if (ha_info->is_trx_read_write() && !locked)
{
pthread_mutex_lock(&LOCK_commit_ordered);
locked= 1;
}
ht->commit_ordered(ht, thd, all);
}
}
if (locked)
pthread_mutex_unlock(&LOCK_commit_ordered);
}
}
#endif /* USING_TRANSACTIONS */
DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans));
}
@ -1901,7 +1876,13 @@ int ha_start_consistent_snapshot(THD *thd)
{
bool warn= true;
/*
Holding the LOCK_commit_ordered mutex ensures that for any transaction
we either see it committed in all engines, or in none.
*/
pthread_mutex_lock(&LOCK_commit_ordered);
plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
pthread_mutex_unlock(&LOCK_commit_ordered);
/*
Same idea as when one wants to CREATE TABLE in one engine which does not

View file

@ -667,6 +667,11 @@ struct handlerton
full transaction is committed, not for each commit of statement
transaction in a multi-statement transaction.
Note that, like prepare(), commit_ordered() is only called when 2-phase
commit takes place. I.e. when there is no binary log and only a single
engine participates in a transaction, only commit() is called, not
commit_ordered(). So engines must be prepared for this.
The calls to commit_ordered() in multiple parallel transactions are
guaranteed to happen in the same order in every participating
handler. This can be used to ensure the same commit order among multiple
@ -684,11 +689,9 @@ struct handlerton
doing any time-consuming or blocking operations in commit_ordered() will
limit scalability.
Handlers can rely on commit_ordered() calls for transactions that updated
data to be serialised (no two calls can run in parallel, so no extra
locking on the handler part is required to ensure this). However, calls
for SELECT-only transactions are not serialised, so can occur in parallel
with each other and with at most one write-transaction.
Handlers can rely on commit_ordered() calls to be serialised (no two
calls can run in parallel, so no extra locking on the handler part is
required to ensure this).
Note that commit_ordered() can be called from a different thread than the
one handling the transaction! So it can not do anything that depends on
@ -700,7 +703,8 @@ struct handlerton
must be saved and returned from the commit() method instead.
The commit_ordered method is optional, and can be left unset if not
needed in a particular handler.
needed in a particular handler (then there will be no ordering guarantees
wrt. other engines and binary log).
*/
void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
int (*rollback)(handlerton *hton, THD *thd, bool all);

View file

@ -1700,10 +1700,10 @@ innobase_query_caching_of_table_permitted(
/* The call of row_search_.. will start a new transaction if it is
not yet started */
if (trx->active_trans == 0) {
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) {
innobase_register_trx_and_stmt(innodb_hton_ptr, thd);
trx->active_trans = 1;
trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
}
if (row_search_check_if_query_cache_permitted(trx, norm_name)) {
@ -1973,11 +1973,11 @@ ha_innobase::init_table_handle_for_HANDLER(void)
/* Set the MySQL flag to mark that there is an active transaction */
if (prebuilt->trx->active_trans == 0) {
if ((prebuilt->trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) {
innobase_register_trx_and_stmt(ht, user_thd);
prebuilt->trx->active_trans = 1;
prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
}
/* We did the necessary inits in this function, no need to repeat them
@ -2704,58 +2704,21 @@ innobase_start_trx_and_assign_read_view(
/* Set the MySQL flag to mark that there is an active transaction */
if (trx->active_trans == 0) {
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) {
innobase_register_trx_and_stmt(hton, thd);
trx->active_trans = 1;
trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
}
DBUG_RETURN(0);
}
/*****************************************************************//**
Perform the first, fast part of InnoDB commit.
Doing it in this call ensures that we get the same commit order here
as in binlog and any other participating transactional storage engines.
Note that we want to do as little as really needed here, as we run
under a global mutex. The expensive fsync() is done later, in
innobase_commit(), without a lock so group commit can take place.
Note also that this method can be called from a different thread than
the one handling the rest of the transaction. */
static
void
innobase_commit_ordered(
innobase_commit_ordered_2(
/*============*/
handlerton *hton, /*!< in: Innodb handlerton */
THD* thd, /*!< in: MySQL thread handle of the user for whom
the transaction should be committed */
bool all) /*!< in: TRUE - commit transaction
FALSE - the current SQL statement ended */
trx_t* trx) /*!< in: Innodb transaction */
{
trx_t* trx;
DBUG_ENTER("innobase_commit_ordered");
DBUG_ASSERT(hton == innodb_hton_ptr);
trx = check_trx_exists(thd);
if (trx->active_trans == 0
&& trx->conc_state != TRX_NOT_STARTED) {
/* We cannot throw error here; instead we will catch this error
again in innobase_commit() and report it from there. */
DBUG_VOID_RETURN;
}
/* Since we will reserve the kernel mutex, we have to release
the search system latch first to obey the latching order. */
if (trx->has_search_latch) {
trx_search_latch_release_if_reserved(trx);
}
/* commit_ordered is only called when committing the whole transaction
(or an SQL statement when autocommit is on). */
DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
/* We need current binlog position for ibbackup to work.
Note, the position is current because commit_ordered is guaranteed
@ -2807,6 +2770,60 @@ retry:
DBUG_VOID_RETURN;
}
/*****************************************************************//**
Perform the first, fast part of InnoDB commit.
Doing it in this call ensures that we get the same commit order here
as in binlog and any other participating transactional storage engines.
Note that we want to do as little as really needed here, as we run
under a global mutex. The expensive fsync() is done later, in
innobase_commit(), without a lock so group commit can take place.
Note also that this method can be called from a different thread than
the one handling the rest of the transaction.
When the fast part has been run, TRX_ACTIVE_COMMIT_ORDERED is set in
trx->active_trans; innobase_commit() tests that bit and does not run
innobase_commit_ordered_2() a second time. */
static
void
innobase_commit_ordered(
/*============*/
handlerton *hton, /*!< in: Innodb handlerton */
THD* thd, /*!< in: MySQL thread handle of the user for whom
the transaction should be committed */
bool all) /*!< in: TRUE - commit transaction
FALSE - the current SQL statement ended */
{
trx_t* trx;
DBUG_ENTER("innobase_commit_ordered");
DBUG_ASSERT(hton == innodb_hton_ptr);
/* Obtain the trx_t for this thd through check_trx_exists(). */
trx = check_trx_exists(thd);
/* Since we will reserve the kernel mutex, we have to release
the search system latch first to obey the latching order. */
if (trx->has_search_latch) {
trx_search_latch_release_if_reserved(trx);
}
/* Inconsistent state: the TRX_ACTIVE_IN_MYSQL bit is not set, yet
conc_state says the transaction is not in TRX_NOT_STARTED. */
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0
&& trx->conc_state != TRX_NOT_STARTED) {
/* We cannot throw error here; instead we will catch this error
again in innobase_commit() and report it from there.
TRX_ACTIVE_COMMIT_ORDERED is left unset on this path. */
DBUG_VOID_RETURN;
}
/* commit_ordered is only called when committing the whole transaction
(or an SQL statement when autocommit is on). */
DBUG_ASSERT(all ||
(!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
/* Do the actual fast commit work, shared with innobase_commit(). */
innobase_commit_ordered_2(trx);
/* Record that the fast part has already run for this transaction. */
trx->active_trans |= TRX_ACTIVE_COMMIT_ORDERED;
DBUG_VOID_RETURN;
}
/*****************************************************************//**
Commits a transaction in an InnoDB database or marks an SQL statement
ended.
@ -2829,7 +2846,15 @@ innobase_commit(
trx = check_trx_exists(thd);
/* The flag trx->active_trans is set to 1 in
/* Since we will reserve the kernel mutex, we have to release
the search system latch first to obey the latching order. */
if (trx->has_search_latch &&
(trx->active_trans & TRX_ACTIVE_COMMIT_ORDERED) == 0) {
trx_search_latch_release_if_reserved(trx);
}
/* The flag TRX_ACTIVE_IN_MYSQL in trx->active_trans is set in
1. ::external_lock(),
2. ::start_stmt(),
@ -2839,20 +2864,26 @@ innobase_commit(
6. innobase_start_trx_and_assign_read_view(),
7. ::transactional_table_lock()
and it is only set to 0 in a commit or a rollback. If it is 0 we know
and it is only cleared in a commit or a rollback. If it is unset we know
there cannot be resources to be freed and we could return immediately.
For the time being, we play safe and do the cleanup though there should
be nothing to clean up. */
if (trx->active_trans == 0
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0
&& trx->conc_state != TRX_NOT_STARTED) {
sql_print_error("trx->active_trans == 0, but"
" trx->conc_state != TRX_NOT_STARTED");
}
if (all
|| (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
/* Run the fast part of commit if we did not already. */
if ((trx->active_trans & TRX_ACTIVE_COMMIT_ORDERED) == 0) {
innobase_commit_ordered_2(trx);
}
/* We were instructed to commit the whole transaction, or
this is an SQL statement end and autocommit is on */
@ -3076,7 +3107,7 @@ innobase_savepoint(
innobase_release_stat_resources(trx);
/* cannot happen outside of transaction */
DBUG_ASSERT(trx->active_trans);
DBUG_ASSERT(trx->active_trans & TRX_ACTIVE_IN_MYSQL);
/* TODO: use provided savepoint data area to store savepoint data */
char name[64];
@ -3106,7 +3137,7 @@ innobase_close_connection(
ut_a(trx);
if (trx->active_trans == 0
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0
&& trx->conc_state != TRX_NOT_STARTED) {
sql_print_error("trx->active_trans == 0, but"
@ -5021,10 +5052,9 @@ no_commit:
no need to re-acquire locks on it. */
/* Altering to InnoDB format */
innobase_commit_ordered(ht, user_thd, 1);
innobase_commit(ht, user_thd, 1);
/* Note that this transaction is still active. */
prebuilt->trx->active_trans = 1;
prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
/* We will need an IX lock on the destination table. */
prebuilt->sql_stat_start = TRUE;
} else {
@ -5038,10 +5068,9 @@ no_commit:
/* Commit the transaction. This will release the table
locks, so they have to be acquired again. */
innobase_commit_ordered(ht, user_thd, 1);
innobase_commit(ht, user_thd, 1);
/* Note that this transaction is still active. */
prebuilt->trx->active_trans = 1;
prebuilt->trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
/* Re-acquire the table lock on the source table. */
row_lock_table_for_mysql(prebuilt, src_table, mode);
/* We will need an IX lock on the destination table. */
@ -8929,10 +8958,10 @@ ha_innobase::start_stmt(
trx->detailed_error[0] = '\0';
/* Set the MySQL flag to mark that there is an active transaction */
if (trx->active_trans == 0) {
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) {
innobase_register_trx_and_stmt(ht, thd);
trx->active_trans = 1;
trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
} else {
innobase_register_stmt(ht, thd);
}
@ -9030,10 +9059,10 @@ ha_innobase::external_lock(
/* Set the MySQL flag to mark that there is an active
transaction */
if (trx->active_trans == 0) {
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) {
innobase_register_trx_and_stmt(ht, thd);
trx->active_trans = 1;
trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
} else if (trx->n_mysql_tables_in_use == 0) {
innobase_register_stmt(ht, thd);
}
@ -9131,8 +9160,7 @@ ha_innobase::external_lock(
prebuilt->used_in_HANDLER = FALSE;
if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
if (trx->active_trans != 0) {
innobase_commit_ordered(ht, thd, TRUE);
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) != 0) {
innobase_commit(ht, thd, TRUE);
}
} else {
@ -9217,10 +9245,10 @@ ha_innobase::transactional_table_lock(
/* MySQL is setting a new transactional table lock */
/* Set the MySQL flag to mark that there is an active transaction */
if (trx->active_trans == 0) {
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0) {
innobase_register_trx_and_stmt(ht, thd);
trx->active_trans = 1;
trx->active_trans |= TRX_ACTIVE_IN_MYSQL;
}
if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) {
@ -10272,7 +10300,8 @@ innobase_xa_prepare(
innobase_release_stat_resources(trx);
if (trx->active_trans == 0 && trx->conc_state != TRX_NOT_STARTED) {
if ((trx->active_trans & TRX_ACTIVE_IN_MYSQL) == 0 &&
trx->conc_state != TRX_NOT_STARTED) {
sql_print_error("trx->active_trans == 0, but trx->conc_state != "
"TRX_NOT_STARTED");
@ -10284,7 +10313,7 @@ innobase_xa_prepare(
/* We were instructed to prepare the whole transaction, or
this is an SQL statement end and autocommit is on */
ut_ad(trx->active_trans);
ut_ad(trx->active_trans & TRX_ACTIVE_IN_MYSQL);
error = (int) trx_prepare_for_mysql(trx);
} else {

View file

@ -511,9 +511,10 @@ struct trx_struct{
in that case we must flush the log
in trx_commit_complete_for_mysql() */
ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
ulint active_trans; /*!< 1 - if a transaction in MySQL
is active. 2 - if prepare_commit_mutex
was taken */
ulint active_trans; /*!< TRX_ACTIVE_IN_MYSQL - set if a
transaction in MySQL is active.
TRX_ACTIVE_COMMIT_ORDERED - set if
innobase_commit_ordered has run */
ulint has_search_latch;
/* TRUE if this trx has latched the
search system latch in S-mode */
@ -824,6 +825,10 @@ Multiple flags can be combined with bitwise OR. */
#define TRX_SIG_OTHER_SESS 1 /* sent by another session (which
must hold rights to this) */
/* Flag bits for trx_struct.active_trans. The field is a bit mask: set and
test individual bits with | and & instead of comparing it to 0 or 1. */
/* Set if a transaction in MySQL is active. */
#define TRX_ACTIVE_IN_MYSQL (1<<0)
/* Set if innobase_commit_ordered has run for the transaction. */
#define TRX_ACTIVE_COMMIT_ORDERED (1<<1)
/** Commit node states */
enum commit_node_state {
COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to