branches/zip

rb://130

Enable Group Commit functionality that was broken in 5.0 when
distributed transactions were introduced.

Reviewed by: Heikki
This commit is contained in:
inaam 2009-06-09 16:46:29 +00:00
parent b667060360
commit bfa7cf72aa
3 changed files with 43 additions and 34 deletions

View file

@ -2449,7 +2449,12 @@ retry:
trx->mysql_log_file_name = mysql_bin_log_file_name(); trx->mysql_log_file_name = mysql_bin_log_file_name();
trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos(); trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos();
/* Don't do write + flush right now. For group commit
to work we want to do the flush after releasing the
prepare_commit_mutex. */
trx->flush_log_later = TRUE;
innobase_commit_low(trx); innobase_commit_low(trx);
trx->flush_log_later = FALSE;
if (innobase_commit_concurrency > 0) { if (innobase_commit_concurrency > 0) {
pthread_mutex_lock(&commit_cond_m); pthread_mutex_lock(&commit_cond_m);
@ -2463,6 +2468,8 @@ retry:
pthread_mutex_unlock(&prepare_commit_mutex); pthread_mutex_unlock(&prepare_commit_mutex);
} }
/* Now do a write + flush of logs. */
trx_commit_complete_for_mysql(trx);
trx->active_trans = 0; trx->active_trans = 0;
} else { } else {
@ -8934,33 +8941,6 @@ innobase_xa_prepare(
DBUG_ASSERT(hton == innodb_hton_ptr); DBUG_ASSERT(hton == innodb_hton_ptr);
if (thd_sql_command(thd) != SQLCOM_XA_PREPARE &&
(all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
{
/* For ibbackup to work the order of transactions in binlog
and InnoDB must be the same. Consider the situation
thread1> prepare; write to binlog; ...
<context switch>
thread2> prepare; write to binlog; commit
thread1> ... commit
To ensure this will not happen we're taking the mutex on
prepare, and releasing it on commit.
Note: only do it for normal commits, done via ha_commit_trans.
If 2pc protocol is executed by external transaction
coordinator, it will be just a regular MySQL client
executing XA PREPARE and XA COMMIT commands.
In this case we cannot know how many minutes or hours
will be between XA PREPARE and XA COMMIT, and we don't want
to block for undefined period of time.
*/
pthread_mutex_lock(&prepare_commit_mutex);
trx->active_trans = 2;
}
/* we use support_xa value as it was seen at transaction start /* we use support_xa value as it was seen at transaction start
time, not the current session variable value. Any possible changes time, not the current session variable value. Any possible changes
to the session variable take effect only in the next transaction */ to the session variable take effect only in the next transaction */
@ -9013,6 +8993,33 @@ innobase_xa_prepare(
srv_active_wake_master_thread(); srv_active_wake_master_thread();
if (thd_sql_command(thd) != SQLCOM_XA_PREPARE &&
(all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
{
/* For ibbackup to work the order of transactions in binlog
and InnoDB must be the same. Consider the situation
thread1> prepare; write to binlog; ...
<context switch>
thread2> prepare; write to binlog; commit
thread1> ... commit
To ensure this will not happen we're taking the mutex on
prepare, and releasing it on commit.
Note: only do it for normal commits, done via ha_commit_trans.
If the 2PC protocol is executed by an external transaction
coordinator, it will be just a regular MySQL client
executing XA PREPARE and XA COMMIT commands.
In this case we cannot know how many minutes or hours
will pass between XA PREPARE and XA COMMIT, and we don't want
to block for an undefined period of time.
*/
pthread_mutex_lock(&prepare_commit_mutex);
trx->active_trans = 2;
}
return(error); return(error);
} }

View file

@ -497,10 +497,12 @@ struct trx_struct{
FALSE, one can save CPU time and about FALSE, one can save CPU time and about
150 bytes in the undo log size as then 150 bytes in the undo log size as then
we skip XA steps */ we skip XA steps */
unsigned flush_log_later:1;/* when we commit the transaction unsigned flush_log_later:1;/* In 2PC, we hold the
in MySQL's binlog write, we will prepare_commit_mutex across
flush the log to disk later in both phases. In that case, we
a separate call */ defer flush of the logs to disk
until after we release the
mutex. */
unsigned must_flush_log_later:1;/* this flag is set to TRUE in unsigned must_flush_log_later:1;/* this flag is set to TRUE in
trx_commit_off_kernel() if trx_commit_off_kernel() if
flush_log_later was TRUE, and there flush_log_later was TRUE, and there

View file

@ -891,11 +891,11 @@ trx_commit_off_kernel(
there are > 2 users in the database. Then at least 2 users can there are > 2 users in the database. Then at least 2 users can
gather behind one doing the physical log write to disk. gather behind one doing the physical log write to disk.
If we are calling trx_commit() under MySQL's binlog mutex, we If we are calling trx_commit() under prepare_commit_mutex, we
will delay possible log write and flush to a separate function will delay possible log write and flush to a separate function
trx_commit_complete_for_mysql(), which is only called when the trx_commit_complete_for_mysql(), which is only called when the
thread has released the binlog mutex. This is to make the thread has released the mutex. This is to make the
group commit algorithm to work. Otherwise, the MySQL binlog group commit algorithm work. Otherwise, the prepare_commit
mutex would serialize all commits and prevent a group of mutex would serialize all commits and prevent a group of
transactions from gathering. */ transactions from gathering. */