2013-06-24 10:50:25 +02:00
|
|
|
#include "my_global.h"
|
|
|
|
#include "rpl_parallel.h"
|
|
|
|
#include "slave.h"
|
|
|
|
#include "rpl_mi.h"
|
2013-12-18 16:26:22 +01:00
|
|
|
#include "debug_sync.h"
|
2013-06-24 10:50:25 +02:00
|
|
|
|
2013-07-03 13:46:33 +02:00
|
|
|
/*
|
|
|
|
Code for optional parallel execution of replicated events on the slave.
|
|
|
|
*/
|
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
Maximum number of queued events to accumulate in a local free list, before
|
|
|
|
moving them to the global free list. There is additional a limit of how much
|
|
|
|
to accumulate based on opt_slave_parallel_max_queued.
|
|
|
|
*/
|
|
|
|
#define QEV_BATCH_FREE 200
|
|
|
|
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
struct rpl_parallel_thread_pool global_rpl_thread_pool;
|
|
|
|
|
2014-03-21 10:11:28 +01:00
|
|
|
static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi,
|
|
|
|
int err);
|
2013-06-24 10:50:25 +02:00
|
|
|
|
2013-10-14 15:28:16 +02:00
|
|
|
static int
|
2013-06-24 10:50:25 +02:00
|
|
|
rpt_handle_event(rpl_parallel_thread::queued_event *qev,
|
|
|
|
struct rpl_parallel_thread *rpt)
|
|
|
|
{
|
2013-12-13 14:26:51 +01:00
|
|
|
int err;
|
2013-09-13 15:09:57 +02:00
|
|
|
rpl_group_info *rgi= qev->rgi;
|
2013-06-28 15:19:30 +02:00
|
|
|
Relay_log_info *rli= rgi->rli;
|
2013-07-08 16:47:07 +02:00
|
|
|
THD *thd= rgi->thd;
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
Log_event *ev;
|
|
|
|
|
|
|
|
DBUG_ASSERT(qev->typ == rpl_parallel_thread::queued_event::QUEUED_EVENT);
|
|
|
|
ev= qev->ev;
|
2013-06-24 10:50:25 +02:00
|
|
|
|
2014-04-25 12:58:31 +02:00
|
|
|
thd->system_thread_info.rpl_sql_info->rpl_filter = rli->mi->rpl_filter;
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
ev->thd= thd;
|
2013-07-03 13:46:33 +02:00
|
|
|
|
2013-10-23 15:03:03 +02:00
|
|
|
strcpy(rgi->event_relay_log_name_buf, qev->event_relay_log_name);
|
|
|
|
rgi->event_relay_log_name= rgi->event_relay_log_name_buf;
|
|
|
|
rgi->event_relay_log_pos= qev->event_relay_log_pos;
|
2013-10-17 14:11:19 +02:00
|
|
|
rgi->future_event_relay_log_pos= qev->future_event_relay_log_pos;
|
2013-10-23 15:03:03 +02:00
|
|
|
strcpy(rgi->future_event_master_log_name, qev->future_event_master_log_name);
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
mysql_mutex_lock(&rli->data_lock);
|
|
|
|
/* Mutex will be released in apply_event_and_update_pos(). */
|
|
|
|
err= apply_event_and_update_pos(ev, thd, rgi, rpt);
|
2013-10-14 23:17:16 +02:00
|
|
|
|
|
|
|
thread_safe_increment64(&rli->executed_entries,
|
|
|
|
&slave_executed_entries_lock);
|
2013-06-24 10:50:25 +02:00
|
|
|
/* ToDo: error handling. */
|
2013-10-14 15:28:16 +02:00
|
|
|
return err;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-10-31 14:11:41 +01:00
|
|
|
static void
|
|
|
|
handle_queued_pos_update(THD *thd, rpl_parallel_thread::queued_event *qev)
|
|
|
|
{
|
|
|
|
int cmp;
|
|
|
|
Relay_log_info *rli;
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
rpl_parallel_entry *e;
|
|
|
|
|
2013-10-31 14:11:41 +01:00
|
|
|
/*
|
|
|
|
Events that are not part of an event group, such as Format Description,
|
|
|
|
Stop, GTID List and such, are executed directly in the driver SQL thread,
|
|
|
|
to keep the relay log state up-to-date. But the associated position update
|
|
|
|
is done here, in sync with other normal events as they are queued to
|
|
|
|
worker threads.
|
|
|
|
*/
|
|
|
|
if ((thd->variables.option_bits & OPTION_BEGIN) &&
|
|
|
|
opt_using_transactions)
|
|
|
|
return;
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
|
|
|
|
/* Do not update position if an earlier event group caused an error abort. */
|
|
|
|
DBUG_ASSERT(qev->typ == rpl_parallel_thread::queued_event::QUEUED_POS_UPDATE);
|
|
|
|
e= qev->entry_for_queued;
|
|
|
|
if (e->stop_on_error_sub_id < (uint64)ULONGLONG_MAX || e->force_abort)
|
|
|
|
return;
|
|
|
|
|
2013-10-31 14:11:41 +01:00
|
|
|
rli= qev->rgi->rli;
|
|
|
|
mysql_mutex_lock(&rli->data_lock);
|
|
|
|
cmp= strcmp(rli->group_relay_log_name, qev->event_relay_log_name);
|
|
|
|
if (cmp < 0)
|
|
|
|
{
|
|
|
|
rli->group_relay_log_pos= qev->future_event_relay_log_pos;
|
|
|
|
strmake_buf(rli->group_relay_log_name, qev->event_relay_log_name);
|
|
|
|
rli->notify_group_relay_log_name_update();
|
|
|
|
} else if (cmp == 0 &&
|
|
|
|
rli->group_relay_log_pos < qev->future_event_relay_log_pos)
|
|
|
|
rli->group_relay_log_pos= qev->future_event_relay_log_pos;
|
|
|
|
|
|
|
|
cmp= strcmp(rli->group_master_log_name, qev->future_event_master_log_name);
|
|
|
|
if (cmp < 0)
|
|
|
|
{
|
|
|
|
strcpy(rli->group_master_log_name, qev->future_event_master_log_name);
|
|
|
|
rli->notify_group_master_log_name_update();
|
|
|
|
rli->group_master_log_pos= qev->future_event_master_log_pos;
|
|
|
|
}
|
|
|
|
else if (cmp == 0
|
|
|
|
&& rli->group_master_log_pos < qev->future_event_master_log_pos)
|
|
|
|
rli->group_master_log_pos= qev->future_event_master_log_pos;
|
|
|
|
mysql_mutex_unlock(&rli->data_lock);
|
|
|
|
mysql_cond_broadcast(&rli->data_cond);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-11-05 12:01:26 +01:00
|
|
|
static void
|
2014-03-21 10:11:28 +01:00
|
|
|
finish_event_group(THD *thd, uint64 sub_id, rpl_parallel_entry *entry,
|
|
|
|
rpl_group_info *rgi)
|
2013-11-05 12:01:26 +01:00
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
wait_for_commit *wfc= &rgi->commit_orderer;
|
2014-03-21 10:11:28 +01:00
|
|
|
int err;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
2013-11-05 12:01:26 +01:00
|
|
|
/*
|
|
|
|
Remove any left-over registration to wait for a prior commit to
|
|
|
|
complete. Normally, such wait would already have been removed at
|
MDEV-4506: Parallel replication
MDEV-5217: Incorrect MyISAM event execution order causing incorrect parallel replication
In parallel replication, if transactions A,B group-commit together on the
master, we can execute them in parallel on a replication slave. But then, if
transaction C follows on the master, on the slave, we need to be sure that
both A and B have completed before starting on C to be sure to avoid
conflicts.
The necessary wait is implemented such that B waits for A to commit before it
commits itself (thus preserving commit order). And C waits for B to commit
before it itself can start executing. This way C does not start until both A
and B have completed.
The wait for B's commit on A happens inside the commit processing. However, in
the case of MyISAM with no binlog enabled on the slave, it appears that no
commit processing takes place (since MyISAM is non-transactional), and thus
the wait of B for A was not done. This allowed C to start before A, which can
lead to conflicts and incorrect replication.
Fixed by doing an extra wait for A at the end of B before signalling C.
2013-11-06 14:51:06 +01:00
|
|
|
this point by wait_for_prior_commit() called from within COMMIT
|
|
|
|
processing. However, in case of MyISAM and no binlog, we might not
|
|
|
|
have any commit processing, and so we need to do the wait here,
|
|
|
|
before waking up any subsequent commits, to preserve correct
|
|
|
|
order of event execution. Also, in the error case we might have
|
|
|
|
skipped waiting and thus need to remove it explicitly.
|
|
|
|
|
|
|
|
It is important in the non-error case to do a wait, not just an
|
|
|
|
unregister. Because we might be last in a group-commit that is
|
|
|
|
replicated in parallel, and the following event will then wait
|
|
|
|
for us to complete and rely on this also ensuring that any other
|
|
|
|
event in the group has completed.
|
|
|
|
|
|
|
|
But in the error case, we have to abort anyway, and it seems best
|
|
|
|
to just complete as quickly as possible with unregister. Anyone
|
|
|
|
waiting for us will in any case receive the error back from their
|
|
|
|
wait_for_prior_commit() call.
|
2013-11-05 12:01:26 +01:00
|
|
|
*/
|
2014-03-21 10:11:28 +01:00
|
|
|
if (rgi->worker_error)
|
MDEV-4506: Parallel replication
MDEV-5217: Incorrect MyISAM event execution order causing incorrect parallel replication
In parallel replication, if transactions A,B group-commit together on the
master, we can execute them in parallel on a replication slave. But then, if
transaction C follows on the master, on the slave, we need to be sure that
both A and B have completed before starting on C to be sure to avoid
conflicts.
The necessary wait is implemented such that B waits for A to commit before it
commits itself (thus preserving commit order). And C waits for B to commit
before it itself can start executing. This way C does not start until both A
and B have completed.
The wait for B's commit on A happens inside the commit processing. However, in
the case of MyISAM with no binlog enabled on the slave, it appears that no
commit processing takes place (since MyISAM is non-transactional), and thus
the wait of B for A was not done. This allowed C to start before A, which can
lead to conflicts and incorrect replication.
Fixed by doing an extra wait for A at the end of B before signalling C.
2013-11-06 14:51:06 +01:00
|
|
|
wfc->unregister_wait_for_prior_commit();
|
2014-03-21 10:11:28 +01:00
|
|
|
else if ((err= wfc->wait_for_prior_commit(thd)))
|
|
|
|
signal_error_to_sql_driver_thread(thd, rgi, err);
|
2013-11-05 12:01:26 +01:00
|
|
|
thd->wait_for_commit_ptr= NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
Record that this event group has finished (eg. transaction is
|
|
|
|
committed, if transactional), so other event groups will no longer
|
|
|
|
attempt to wait for us to commit. Once we have increased
|
|
|
|
entry->last_committed_sub_id, no other threads will execute
|
|
|
|
register_wait_for_prior_commit() against us. Thus, by doing one
|
|
|
|
extra (usually redundant) wakeup_subsequent_commits() we can ensure
|
|
|
|
that no register_wait_for_prior_commit() can ever happen without a
|
|
|
|
subsequent wakeup_subsequent_commits() to wake it up.
|
|
|
|
|
|
|
|
We can race here with the next transactions, but that is fine, as
|
|
|
|
long as we check that we do not decrease last_committed_sub_id. If
|
|
|
|
this commit is done, then any prior commits will also have been
|
|
|
|
done and also no longer need waiting for.
|
|
|
|
*/
|
|
|
|
mysql_mutex_lock(&entry->LOCK_parallel_entry);
|
|
|
|
if (entry->last_committed_sub_id < sub_id)
|
|
|
|
entry->last_committed_sub_id= sub_id;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
If this event group got error, then any following event groups that have
|
|
|
|
not yet started should just skip their group, preparing for stop of the
|
|
|
|
SQL driver thread.
|
|
|
|
*/
|
2014-03-21 10:11:28 +01:00
|
|
|
if (unlikely(rgi->worker_error) &&
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
entry->stop_on_error_sub_id == (uint64)ULONGLONG_MAX)
|
|
|
|
entry->stop_on_error_sub_id= sub_id;
|
|
|
|
/*
|
|
|
|
We need to mark that this event group started its commit phase, in case we
|
|
|
|
missed it before (otherwise we would deadlock the next event group that is
|
|
|
|
waiting for this). In most cases (normal DML), it will be a no-op.
|
|
|
|
*/
|
|
|
|
rgi->mark_start_commit_no_lock();
|
2013-11-05 12:01:26 +01:00
|
|
|
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
thd->clear_error();
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
thd->reset_killed();
|
2014-02-26 16:38:42 +01:00
|
|
|
thd->get_stmt_da()->reset_diagnostics_area();
|
2014-03-21 10:11:28 +01:00
|
|
|
wfc->wakeup_subsequent_commits(rgi->worker_error);
|
2013-11-05 12:01:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-12-13 14:26:51 +01:00
|
|
|
static void
|
2014-03-21 10:11:28 +01:00
|
|
|
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
|
2013-12-13 14:26:51 +01:00
|
|
|
{
|
2014-03-21 10:11:28 +01:00
|
|
|
rgi->worker_error= err;
|
2013-12-13 14:26:51 +01:00
|
|
|
rgi->cleanup_context(thd, true);
|
|
|
|
rgi->rli->abort_slave= true;
|
2014-03-03 12:13:55 +01:00
|
|
|
rgi->rli->stop_for_until= false;
|
2013-12-13 14:26:51 +01:00
|
|
|
mysql_mutex_lock(rgi->rli->relay_log.get_log_lock());
|
|
|
|
mysql_mutex_unlock(rgi->rli->relay_log.get_log_lock());
|
|
|
|
rgi->rli->relay_log.signal_update();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
static void
|
|
|
|
unlock_or_exit_cond(THD *thd, mysql_mutex_t *lock, bool *did_enter_cond,
|
2014-02-26 16:38:42 +01:00
|
|
|
PSI_stage_info *old_stage)
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
{
|
|
|
|
if (*did_enter_cond)
|
|
|
|
{
|
2014-02-26 16:38:42 +01:00
|
|
|
thd->EXIT_COND(old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
*did_enter_cond= false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
mysql_mutex_unlock(lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
static void
|
|
|
|
register_wait_for_prior_event_group_commit(rpl_group_info *rgi,
|
|
|
|
rpl_parallel_entry *entry)
|
|
|
|
{
|
|
|
|
mysql_mutex_assert_owner(&entry->LOCK_parallel_entry);
|
|
|
|
if (rgi->wait_commit_sub_id > entry->last_committed_sub_id)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
Register that the commit of this event group must wait for the
|
|
|
|
commit of the previous event group to complete before it may
|
|
|
|
complete itself, so that we preserve commit order.
|
|
|
|
*/
|
|
|
|
wait_for_commit *waitee=
|
|
|
|
&rgi->wait_commit_group_info->commit_orderer;
|
|
|
|
rgi->commit_orderer.register_wait_for_prior_commit(waitee);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-05-13 13:42:06 +02:00
|
|
|
#ifndef DBUG_OFF
|
|
|
|
static int
|
|
|
|
dbug_simulate_tmp_error(rpl_group_info *rgi, THD *thd)
|
|
|
|
{
|
|
|
|
if (rgi->current_gtid.domain_id == 0 && rgi->current_gtid.seq_no == 100 &&
|
|
|
|
rgi->retry_event_count == 4)
|
|
|
|
{
|
|
|
|
thd->clear_error();
|
|
|
|
thd->get_stmt_da()->reset_diagnostics_area();
|
|
|
|
my_error(ER_LOCK_DEADLOCK, MYF(0));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2014-05-08 14:20:18 +02:00
|
|
|
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
/*
|
|
|
|
If we detect a deadlock due to eg. storage engine locks that conflict with
|
|
|
|
the fixed commit order, then the later transaction will be killed
|
|
|
|
asynchroneously to allow the former to complete its commit.
|
|
|
|
|
|
|
|
In this case, we convert the 'killed' error into a deadlock error, and retry
|
|
|
|
the later transaction. */
|
|
|
|
static void
|
|
|
|
convert_kill_to_deadlock_error(rpl_group_info *rgi)
|
|
|
|
{
|
|
|
|
THD *thd= rgi->thd;
|
2014-07-10 13:55:53 +02:00
|
|
|
int err_code;
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
|
2014-07-10 13:55:53 +02:00
|
|
|
if (!thd->get_stmt_da()->is_error())
|
|
|
|
return;
|
|
|
|
err_code= thd->get_stmt_da()->sql_errno();
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
Remove the temporary fix for MDEV-5914, which used READ COMMITTED for parallel
replication worker threads. Replace it with a better, more selective solution.
The issue is with certain edge cases of InnoDB gap locks, for example between
INSERT and ranged DELETE. It is possible for the gap lock set by the DELETE to
block the INSERT, if the DELETE runs first, while the record lock set by
INSERT does not block the DELETE, if the INSERT runs first. This can cause a
conflict between the two in parallel replication on the slave even though they
ran without conflicts on the master.
With this patch, InnoDB will ask the server layer about the two involved
transactions before blocking on a gap lock. If the server layer tells InnoDB
that the transactions are already fixed wrt. commit order, as they are in
parallel replication, InnoDB will ignore the gap lock and allow the two
transactions to proceed in parallel, avoiding the conflict.
Improve the fix for MDEV-6020. When InnoDB itself detects a deadlock, it now
asks the server layer for any preferences about which transaction to roll
back. In case of parallel replication with two transactions T1 and T2 fixed to
commit T1 before T2, the server layer will ask InnoDB to roll back T2 as the
deadlock victim, not T1. This helps in some cases to avoid excessive deadlock
rollback, as T2 will in any case need to wait for T1 to complete before it can
itself commit.
Also some misc. fixes found during development and testing:
- Remove thd_rpl_is_parallel(), it is not used or needed.
- Use KILL_CONNECTION instead of KILL_QUERY when a parallel replication
worker thread is killed to resolve a deadlock with fixed commit
ordering. There are some cases, eg. in sql/sql_parse.cc, where a KILL_QUERY
can be ignored if the query otherwise completed successfully, and this
could cause the deadlock kill to be lost, so that the deadlock was not
correctly resolved.
- Fix random test failure due to missing wait_for_binlog_checkpoint.inc.
- Make sure that deadlock or other temporary errors during parallel
replication are not printed to the the error log; there were some places
around the replication code with extra error logging. These conditions can
occur occasionally and are handled automatically without breaking
replication, so they should not pollute the error log.
- Fix handling of rgi->gtid_sub_id. We need to be able to access this also at
the end of a transaction, to be able to detect and resolve deadlocks due to
commit ordering. But this value was also used as a flag to mark whether
record_gtid() had been called, by being set to zero, losing the value. Now,
introduce a separate flag rgi->gtid_pending, so rgi->gtid_sub_id remains
valid for the entire duration of the transaction.
- Fix one place where the code to handle ignored errors called reset_killed()
unconditionally, even if no error was caught that should be ignored. This
could cause loss of a deadlock kill signal, breaking deadlock detection and
resolution.
- Fix a couple of missing mysql_reset_thd_for_next_command(). This could
cause a prior error condition to remain for the next event executed,
causing assertions about errors already being set and possibly giving
incorrect error handling for following event executions.
- Fix code that cleared thd->rgi_slave in the parallel replication worker
threads after each event execution; this caused the deadlock detection and
handling code to not be able to correctly process the associated
transactions as belonging to replication worker threads.
- Remove useless error code in slave_background_kill_request().
- Fix bug where wfc->wakeup_error was not cleared at
wait_for_commit::unregister_wait_for_prior_commit(). This could cause the
error condition to wrongly propagate to a later wait_for_prior_commit(),
causing spurious ER_PRIOR_COMMIT_FAILED errors.
- Do not put the binlog background thread into the processlist. It causes
too many result differences in mtr, but also it probably is not useful
for users to pollute the process list with a system thread that does not
really perform any user-visible tasks...
2014-06-10 10:13:15 +02:00
|
|
|
if ((err_code == ER_QUERY_INTERRUPTED || err_code == ER_CONNECTION_KILLED) &&
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
rgi->killed_for_retry)
|
|
|
|
{
|
|
|
|
thd->clear_error();
|
|
|
|
my_error(ER_LOCK_DEADLOCK, MYF(0));
|
|
|
|
rgi->killed_for_retry= false;
|
|
|
|
thd->reset_killed();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
is_group_ending(Log_event *ev, Log_event_type event_type)
|
|
|
|
{
|
|
|
|
return event_type == XID_EVENT ||
|
|
|
|
(event_type == QUERY_EVENT &&
|
|
|
|
(((Query_log_event *)ev)->is_commit() ||
|
|
|
|
((Query_log_event *)ev)->is_rollback()));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-05-08 14:20:18 +02:00
|
|
|
static int
|
|
|
|
retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
|
|
|
|
rpl_parallel_thread::queued_event *orig_qev)
|
|
|
|
{
|
|
|
|
IO_CACHE rlog;
|
2014-05-15 15:52:08 +02:00
|
|
|
LOG_INFO linfo;
|
|
|
|
File fd= (File)-1;
|
2014-05-08 14:20:18 +02:00
|
|
|
const char *errmsg= NULL;
|
|
|
|
inuse_relaylog *ir= rgi->relay_log;
|
2014-05-13 13:42:06 +02:00
|
|
|
uint64 event_count;
|
2014-05-08 14:20:18 +02:00
|
|
|
uint64 events_to_execute= rgi->retry_event_count;
|
|
|
|
Relay_log_info *rli= rgi->rli;
|
2014-05-13 13:42:06 +02:00
|
|
|
int err;
|
2014-05-08 14:20:18 +02:00
|
|
|
ulonglong cur_offset, old_offset;
|
|
|
|
char log_name[FN_REFLEN];
|
|
|
|
THD *thd= rgi->thd;
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
rpl_parallel_entry *entry= rgi->parallel_entry;
|
2014-05-13 13:42:06 +02:00
|
|
|
ulong retries= 0;
|
2014-11-13 10:09:46 +01:00
|
|
|
Format_description_log_event *description_event= NULL;
|
2014-05-08 14:20:18 +02:00
|
|
|
|
|
|
|
do_retry:
|
2014-05-13 13:42:06 +02:00
|
|
|
event_count= 0;
|
|
|
|
err= 0;
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
If we already started committing before getting the deadlock (or other
|
|
|
|
error) that caused us to need to retry, we have already signalled
|
|
|
|
subsequent transactions that we have started committing. This is
|
|
|
|
potentially a problem, as now we will rollback, and if subsequent
|
|
|
|
transactions would start to execute now, they could see an unexpected
|
|
|
|
state of the database and get eg. key not found or duplicate key error.
|
|
|
|
|
|
|
|
However, to get a deadlock in the first place, there must have been
|
|
|
|
another earlier transaction that is waiting for us. Thus that other
|
|
|
|
transaction has _not_ yet started to commit, and any subsequent
|
|
|
|
transactions will still be waiting at this point.
|
|
|
|
|
|
|
|
So here, we decrement back the count of transactions that started
|
|
|
|
committing (if we already incremented it), undoing the effect of an
|
|
|
|
earlier mark_start_commit(). Then later, when the retry succeeds and we
|
|
|
|
commit again, we can do a new mark_start_commit() and eventually wake up
|
|
|
|
subsequent transactions at the proper time.
|
|
|
|
|
|
|
|
We need to do the unmark before the rollback, to be sure that the
|
|
|
|
transaction we deadlocked with will not signal that it started to commit
|
|
|
|
until after the unmark.
|
|
|
|
*/
|
|
|
|
rgi->unmark_start_commit();
|
|
|
|
|
|
|
|
/*
|
|
|
|
We might get the deadlock error that causes the retry during commit, while
|
|
|
|
sitting in wait_for_prior_commit(). If this happens, we will have a
|
|
|
|
pending error in the wait_for_commit object. So clear this by
|
|
|
|
unregistering (and later re-registering) the wait.
|
|
|
|
*/
|
|
|
|
if(thd->wait_for_commit_ptr)
|
|
|
|
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
|
2014-05-08 14:20:18 +02:00
|
|
|
rgi->cleanup_context(thd, 1);
|
|
|
|
|
2014-08-13 13:34:28 +02:00
|
|
|
/*
|
|
|
|
If we retry due to a deadlock kill that occured during the commit step, we
|
|
|
|
might have already updated (but not committed) an update of table
|
|
|
|
mysql.gtid_slave_pos, and cleared the gtid_pending flag. Now we have
|
|
|
|
rolled back any such update, so we must set the gtid_pending flag back to
|
|
|
|
true so that we will do a new update when/if we succeed with the retry.
|
|
|
|
*/
|
|
|
|
rgi->gtid_pending= true;
|
|
|
|
|
2014-05-08 14:20:18 +02:00
|
|
|
mysql_mutex_lock(&rli->data_lock);
|
|
|
|
++rli->retried_trans;
|
|
|
|
statistic_increment(slave_retried_transactions, LOCK_status);
|
|
|
|
mysql_mutex_unlock(&rli->data_lock);
|
|
|
|
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
mysql_mutex_lock(&entry->LOCK_parallel_entry);
|
|
|
|
register_wait_for_prior_event_group_commit(rgi, entry);
|
|
|
|
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
|
|
|
|
|
2014-07-08 12:54:47 +02:00
|
|
|
strmake_buf(log_name, ir->name);
|
2014-05-08 14:20:18 +02:00
|
|
|
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
|
2014-05-15 15:52:08 +02:00
|
|
|
{
|
|
|
|
err= 1;
|
|
|
|
goto err;
|
|
|
|
}
|
2014-05-08 14:20:18 +02:00
|
|
|
cur_offset= rgi->retry_start_offset;
|
2014-11-13 10:09:46 +01:00
|
|
|
delete description_event;
|
|
|
|
description_event=
|
|
|
|
read_relay_log_description_event(&rlog, cur_offset, &errmsg);
|
|
|
|
if (!description_event)
|
|
|
|
{
|
|
|
|
err= 1;
|
|
|
|
goto err;
|
|
|
|
}
|
2014-05-08 14:20:18 +02:00
|
|
|
my_b_seek(&rlog, cur_offset);
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
Log_event_type event_type;
|
|
|
|
Log_event *ev;
|
2014-05-15 15:52:08 +02:00
|
|
|
rpl_parallel_thread::queued_event *qev;
|
2014-05-08 14:20:18 +02:00
|
|
|
|
2014-05-15 15:52:08 +02:00
|
|
|
/* The loop is here so we can try again the next relay log file on EOF. */
|
|
|
|
for (;;)
|
2014-05-08 14:20:18 +02:00
|
|
|
{
|
2014-05-15 15:52:08 +02:00
|
|
|
old_offset= cur_offset;
|
2014-11-13 10:09:46 +01:00
|
|
|
ev= Log_event::read_log_event(&rlog, 0, description_event,
|
2014-05-15 15:52:08 +02:00
|
|
|
opt_slave_sql_verify_checksum);
|
|
|
|
cur_offset= my_b_tell(&rlog);
|
2014-05-08 14:20:18 +02:00
|
|
|
|
2014-05-15 15:52:08 +02:00
|
|
|
if (ev)
|
|
|
|
break;
|
|
|
|
if (rlog.error < 0)
|
2014-05-08 14:20:18 +02:00
|
|
|
{
|
2014-05-15 15:52:08 +02:00
|
|
|
errmsg= "slave SQL thread aborted because of I/O error";
|
2014-05-08 14:20:18 +02:00
|
|
|
err= 1;
|
|
|
|
goto err;
|
|
|
|
}
|
2014-05-15 15:52:08 +02:00
|
|
|
if (rlog.error > 0)
|
|
|
|
{
|
|
|
|
sql_print_error("Slave SQL thread: I/O error reading "
|
|
|
|
"event(errno: %d cur_log->error: %d)",
|
|
|
|
my_errno, rlog.error);
|
|
|
|
errmsg= "Aborting slave SQL thread because of partial event read";
|
|
|
|
err= 1;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
/* EOF. Move to the next relay log. */
|
|
|
|
end_io_cache(&rlog);
|
|
|
|
mysql_file_close(fd, MYF(MY_WME));
|
|
|
|
fd= (File)-1;
|
|
|
|
|
|
|
|
/* Find the next relay log file. */
|
|
|
|
if((err= rli->relay_log.find_log_pos(&linfo, log_name, 1)) ||
|
|
|
|
(err= rli->relay_log.find_next_log(&linfo, 1)))
|
|
|
|
{
|
|
|
|
char buff[22];
|
|
|
|
sql_print_error("next log error: %d offset: %s log: %s",
|
|
|
|
err,
|
|
|
|
llstr(linfo.index_file_offset, buff),
|
|
|
|
log_name);
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
strmake_buf(log_name ,linfo.log_file_name);
|
|
|
|
|
|
|
|
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
|
|
|
|
{
|
|
|
|
err= 1;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
/* Loop to try again on the new log file. */
|
2014-05-08 14:20:18 +02:00
|
|
|
}
|
2014-05-15 15:52:08 +02:00
|
|
|
|
|
|
|
event_type= ev->get_type_code();
|
2014-11-13 10:09:46 +01:00
|
|
|
if (event_type == FORMAT_DESCRIPTION_EVENT)
|
|
|
|
{
|
|
|
|
delete description_event;
|
|
|
|
description_event= (Format_description_log_event *)ev;
|
|
|
|
continue;
|
|
|
|
} else if (!Log_event::is_group_event(event_type))
|
2014-05-15 15:52:08 +02:00
|
|
|
{
|
|
|
|
delete ev;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ev->thd= thd;
|
|
|
|
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
2014-11-13 10:46:09 +01:00
|
|
|
qev= rpt->retry_get_qev(ev, orig_qev, log_name, old_offset,
|
2014-05-15 15:52:08 +02:00
|
|
|
cur_offset - old_offset);
|
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
if (!qev)
|
|
|
|
{
|
|
|
|
delete ev;
|
|
|
|
my_error(ER_OUT_OF_RESOURCES, MYF(0));
|
|
|
|
err= 1;
|
|
|
|
goto err;
|
|
|
|
}
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
if (is_group_ending(ev, event_type))
|
|
|
|
rgi->mark_start_commit();
|
|
|
|
|
2014-05-15 15:52:08 +02:00
|
|
|
err= rpt_handle_event(qev, rpt);
|
|
|
|
++event_count;
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
|
|
|
rpt->free_qev(qev);
|
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
|
2014-05-08 14:20:18 +02:00
|
|
|
delete_or_keep_event_post_apply(rgi, event_type, ev);
|
2014-05-13 13:42:06 +02:00
|
|
|
DBUG_EXECUTE_IF("rpl_parallel_simulate_double_temp_err_gtid_0_x_100",
|
|
|
|
if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd););
|
|
|
|
DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100",
|
|
|
|
err= dbug_simulate_tmp_error(rgi, thd););
|
2014-05-08 14:20:18 +02:00
|
|
|
if (err)
|
|
|
|
{
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
convert_kill_to_deadlock_error(rgi);
|
2014-05-13 13:42:06 +02:00
|
|
|
if (has_temporary_error(thd))
|
|
|
|
{
|
|
|
|
++retries;
|
|
|
|
if (retries < slave_trans_retries)
|
|
|
|
{
|
|
|
|
end_io_cache(&rlog);
|
|
|
|
mysql_file_close(fd, MYF(MY_WME));
|
2014-05-15 15:52:08 +02:00
|
|
|
fd= (File)-1;
|
2014-05-13 13:42:06 +02:00
|
|
|
goto do_retry;
|
|
|
|
}
|
|
|
|
sql_print_error("Slave worker thread retried transaction %lu time(s) "
|
|
|
|
"in vain, giving up. Consider raising the value of "
|
|
|
|
"the slave_transaction_retries variable.",
|
|
|
|
slave_trans_retries);
|
|
|
|
}
|
2014-05-08 14:20:18 +02:00
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
} while (event_count < events_to_execute);
|
|
|
|
|
|
|
|
err:
|
|
|
|
|
2014-11-13 10:09:46 +01:00
|
|
|
if (description_event)
|
|
|
|
delete description_event;
|
2014-05-15 15:52:08 +02:00
|
|
|
if (fd >= 0)
|
|
|
|
{
|
|
|
|
end_io_cache(&rlog);
|
|
|
|
mysql_file_close(fd, MYF(MY_WME));
|
|
|
|
}
|
|
|
|
if (errmsg)
|
|
|
|
sql_print_error("Error reading relay log event: %s", errmsg);
|
2014-05-08 14:20:18 +02:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
pthread_handler_t
|
|
|
|
handle_rpl_parallel_thread(void *arg)
|
|
|
|
{
|
|
|
|
THD *thd;
|
2013-11-01 12:00:11 +01:00
|
|
|
PSI_stage_info old_stage;
|
2013-06-24 10:50:25 +02:00
|
|
|
struct rpl_parallel_thread::queued_event *events;
|
|
|
|
bool group_standalone= true;
|
|
|
|
bool in_event_group= false;
|
2014-03-09 10:27:38 +01:00
|
|
|
bool skip_event_group= false;
|
2013-11-05 12:01:26 +01:00
|
|
|
rpl_group_info *group_rgi= NULL;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
group_commit_orderer *gco, *tmp_gco;
|
2013-07-04 13:17:01 +02:00
|
|
|
uint64 event_gtid_sub_id= 0;
|
2014-04-25 12:58:31 +02:00
|
|
|
rpl_sql_thread_info sql_info(NULL);
|
2013-10-14 15:28:16 +02:00
|
|
|
int err;
|
2013-06-24 10:50:25 +02:00
|
|
|
|
|
|
|
struct rpl_parallel_thread *rpt= (struct rpl_parallel_thread *)arg;
|
|
|
|
|
|
|
|
my_thread_init();
|
|
|
|
thd = new THD;
|
|
|
|
thd->thread_stack = (char*)&thd;
|
|
|
|
mysql_mutex_lock(&LOCK_thread_count);
|
|
|
|
thd->thread_id= thd->variables.pseudo_thread_id= thread_id++;
|
|
|
|
threads.append(thd);
|
|
|
|
mysql_mutex_unlock(&LOCK_thread_count);
|
|
|
|
set_current_thd(thd);
|
|
|
|
pthread_detach_this_thread();
|
|
|
|
thd->init_for_queries();
|
|
|
|
thd->variables.binlog_annotate_row_events= 0;
|
|
|
|
init_thr_lock();
|
|
|
|
thd->store_globals();
|
|
|
|
thd->system_thread= SYSTEM_THREAD_SLAVE_SQL;
|
|
|
|
thd->security_ctx->skip_grants();
|
|
|
|
thd->variables.max_allowed_packet= slave_max_allowed_packet;
|
|
|
|
thd->slave_thread= 1;
|
|
|
|
thd->enable_slow_log= opt_log_slow_slave_statements;
|
|
|
|
thd->variables.log_slow_filter= global_system_variables.log_slow_filter;
|
|
|
|
set_slave_thread_options(thd);
|
|
|
|
thd->client_capabilities = CLIENT_LOCAL_FILES;
|
2013-12-18 16:26:22 +01:00
|
|
|
thd->net.reading_or_writing= 0;
|
2013-06-24 10:50:25 +02:00
|
|
|
thd_proc_info(thd, "Waiting for work from main SQL threads");
|
|
|
|
thd->set_time();
|
|
|
|
thd->variables.lock_wait_timeout= LONG_TIMEOUT;
|
2014-04-25 12:58:31 +02:00
|
|
|
thd->system_thread_info.rpl_sql_info= &sql_info;
|
2013-06-24 10:50:25 +02:00
|
|
|
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
|
|
|
rpt->thd= thd;
|
|
|
|
|
|
|
|
while (rpt->delay_start)
|
|
|
|
mysql_cond_wait(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread);
|
|
|
|
|
|
|
|
rpt->running= true;
|
2013-10-14 15:28:16 +02:00
|
|
|
mysql_cond_signal(&rpt->COND_rpl_thread);
|
2013-06-24 10:50:25 +02:00
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
while (!rpt->stop)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
2014-11-13 10:20:48 +01:00
|
|
|
rpl_parallel_thread::queued_event *qev, *next_qev;
|
|
|
|
|
2013-11-01 12:00:11 +01:00
|
|
|
thd->ENTER_COND(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread,
|
|
|
|
&stage_waiting_for_work_from_sql_thread, &old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
There are 4 cases that should cause us to wake up:
|
|
|
|
- Events have been queued for us to handle.
|
|
|
|
- We have an owner, but no events and not inside event group -> we need
|
|
|
|
to release ourself to the thread pool
|
|
|
|
- SQL thread is stopping, and we have an owner but no events, and we are
|
|
|
|
inside an event group; no more events will be queued to us, so we need
|
|
|
|
to abort the group (force_abort==1).
|
|
|
|
- Thread pool shutdown (rpt->stop==1).
|
|
|
|
*/
|
|
|
|
while (!( (events= rpt->event_queue) ||
|
|
|
|
(rpt->current_owner && !in_event_group) ||
|
|
|
|
(rpt->current_owner && group_rgi->parallel_entry->force_abort) ||
|
|
|
|
rpt->stop))
|
2013-06-24 10:50:25 +02:00
|
|
|
mysql_cond_wait(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpt->dequeue1(events);
|
2013-11-01 12:00:11 +01:00
|
|
|
thd->EXIT_COND(&old_stage);
|
2013-06-24 10:50:25 +02:00
|
|
|
|
|
|
|
more_events:
|
2014-11-13 10:20:48 +01:00
|
|
|
for (qev= events; qev; qev= next_qev)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
2013-10-31 14:11:41 +01:00
|
|
|
Log_event_type event_type;
|
2014-11-13 10:20:48 +01:00
|
|
|
rpl_group_info *rgi= qev->rgi;
|
2013-07-03 13:46:33 +02:00
|
|
|
rpl_parallel_entry *entry= rgi->parallel_entry;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
bool end_of_group, group_ending;
|
2013-07-03 13:46:33 +02:00
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
next_qev= qev->next;
|
|
|
|
if (qev->typ == rpl_parallel_thread::queued_event::QUEUED_POS_UPDATE)
|
2013-10-31 14:11:41 +01:00
|
|
|
{
|
2014-11-13 10:20:48 +01:00
|
|
|
handle_queued_pos_update(thd, qev);
|
|
|
|
rpt->loc_free_qev(qev);
|
2013-10-31 14:11:41 +01:00
|
|
|
continue;
|
|
|
|
}
|
2014-11-13 10:20:48 +01:00
|
|
|
else if (qev->typ ==
|
2014-08-19 14:26:42 +02:00
|
|
|
rpl_parallel_thread::queued_event::QUEUED_MASTER_RESTART)
|
|
|
|
{
|
|
|
|
if (in_event_group)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
Master restarted (crashed) in the middle of an event group.
|
|
|
|
So we need to roll back and discard that event group.
|
|
|
|
*/
|
|
|
|
group_rgi->cleanup_context(thd, 1);
|
|
|
|
in_event_group= false;
|
|
|
|
finish_event_group(thd, group_rgi->gtid_sub_id,
|
2014-11-13 10:20:48 +01:00
|
|
|
qev->entry_for_queued, group_rgi);
|
2014-08-19 14:26:42 +02:00
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
rpt->loc_free_rgi(group_rgi);
|
2014-08-19 14:26:42 +02:00
|
|
|
thd->rgi_slave= group_rgi= NULL;
|
|
|
|
}
|
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
rpt->loc_free_qev(qev);
|
2014-08-19 14:26:42 +02:00
|
|
|
continue;
|
|
|
|
}
|
2014-11-13 10:20:48 +01:00
|
|
|
DBUG_ASSERT(qev->typ==rpl_parallel_thread::queued_event::QUEUED_EVENT);
|
2013-10-31 14:11:41 +01:00
|
|
|
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
Remove the temporary fix for MDEV-5914, which used READ COMMITTED for parallel
replication worker threads. Replace it with a better, more selective solution.
The issue is with certain edge cases of InnoDB gap locks, for example between
INSERT and ranged DELETE. It is possible for the gap lock set by the DELETE to
block the INSERT, if the DELETE runs first, while the record lock set by
INSERT does not block the DELETE, if the INSERT runs first. This can cause a
conflict between the two in parallel replication on the slave even though they
ran without conflicts on the master.
With this patch, InnoDB will ask the server layer about the two involved
transactions before blocking on a gap lock. If the server layer tells InnoDB
that the transactions are already fixed wrt. commit order, as they are in
parallel replication, InnoDB will ignore the gap lock and allow the two
transactions to proceed in parallel, avoiding the conflict.
Improve the fix for MDEV-6020. When InnoDB itself detects a deadlock, it now
asks the server layer for any preferences about which transaction to roll
back. In case of parallel replication with two transactions T1 and T2 fixed to
commit T1 before T2, the server layer will ask InnoDB to roll back T2 as the
deadlock victim, not T1. This helps in some cases to avoid excessive deadlock
rollback, as T2 will in any case need to wait for T1 to complete before it can
itself commit.
Also some misc. fixes found during development and testing:
- Remove thd_rpl_is_parallel(), it is not used or needed.
- Use KILL_CONNECTION instead of KILL_QUERY when a parallel replication
worker thread is killed to resolve a deadlock with fixed commit
ordering. There are some cases, eg. in sql/sql_parse.cc, where a KILL_QUERY
can be ignored if the query otherwise completed successfully, and this
could cause the deadlock kill to be lost, so that the deadlock was not
correctly resolved.
- Fix random test failure due to missing wait_for_binlog_checkpoint.inc.
- Make sure that deadlock or other temporary errors during parallel
replication are not printed to the the error log; there were some places
around the replication code with extra error logging. These conditions can
occur occasionally and are handled automatically without breaking
replication, so they should not pollute the error log.
- Fix handling of rgi->gtid_sub_id. We need to be able to access this also at
the end of a transaction, to be able to detect and resolve deadlocks due to
commit ordering. But this value was also used as a flag to mark whether
record_gtid() had been called, by being set to zero, losing the value. Now,
introduce a separate flag rgi->gtid_pending, so rgi->gtid_sub_id remains
valid for the entire duration of the transaction.
- Fix one place where the code to handle ignored errors called reset_killed()
unconditionally, even if no error was caught that should be ignored. This
could cause loss of a deadlock kill signal, breaking deadlock detection and
resolution.
- Fix a couple of missing mysql_reset_thd_for_next_command(). This could
cause a prior error condition to remain for the next event executed,
causing assertions about errors already being set and possibly giving
incorrect error handling for following event executions.
- Fix code that cleared thd->rgi_slave in the parallel replication worker
threads after each event execution; this caused the deadlock detection and
handling code to not be able to correctly process the associated
transactions as belonging to replication worker threads.
- Remove useless error code in slave_background_kill_request().
- Fix bug where wfc->wakeup_error was not cleared at
wait_for_commit::unregister_wait_for_prior_commit(). This could cause the
error condition to wrongly propagate to a later wait_for_prior_commit(),
causing spurious ER_PRIOR_COMMIT_FAILED errors.
- Do not put the binlog background thread into the processlist. It causes
too many result differences in mtr, but also it probably is not useful
for users to pollute the process list with a system thread that does not
really perform any user-visible tasks...
2014-06-10 10:13:15 +02:00
|
|
|
thd->rgi_slave= group_rgi= rgi;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
gco= rgi->gco;
|
2013-09-13 15:09:57 +02:00
|
|
|
/* Handle a new event group, which will be initiated by a GTID event. */
|
2014-11-13 10:20:48 +01:00
|
|
|
if ((event_type= qev->ev->get_type_code()) == GTID_EVENT)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
bool did_enter_cond= false;
|
2014-02-26 16:38:42 +01:00
|
|
|
PSI_stage_info old_stage;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
uint64 wait_count;
|
|
|
|
|
2013-07-03 13:46:33 +02:00
|
|
|
in_event_group= true;
|
2013-09-13 15:09:57 +02:00
|
|
|
/*
|
|
|
|
If the standalone flag is set, then this event group consists of a
|
|
|
|
single statement (possibly preceeded by some Intvar_log_event and
|
|
|
|
similar), without any terminating COMMIT/ROLLBACK/XID.
|
|
|
|
*/
|
2013-06-24 10:50:25 +02:00
|
|
|
group_standalone=
|
2014-11-13 10:20:48 +01:00
|
|
|
(0 != (static_cast<Gtid_log_event *>(qev->ev)->flags2 &
|
2013-06-24 10:50:25 +02:00
|
|
|
Gtid_log_event::FL_STANDALONE));
|
2013-07-03 13:46:33 +02:00
|
|
|
|
2013-07-04 13:17:01 +02:00
|
|
|
event_gtid_sub_id= rgi->gtid_sub_id;
|
2013-07-08 16:47:07 +02:00
|
|
|
rgi->thd= thd;
|
|
|
|
|
2013-07-03 13:46:33 +02:00
|
|
|
/*
|
|
|
|
Register ourself to wait for the previous commit, if we need to do
|
|
|
|
such registration _and_ that previous commit has not already
|
|
|
|
occured.
|
2013-07-08 16:47:07 +02:00
|
|
|
|
|
|
|
Also do not start parallel execution of this event group until all
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
prior groups have reached the commit phase that are not safe to run
|
|
|
|
in parallel with.
|
2013-07-03 13:46:33 +02:00
|
|
|
*/
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_mutex_lock(&entry->LOCK_parallel_entry);
|
|
|
|
if (!gco->installed)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (gco->prev_gco)
|
|
|
|
gco->prev_gco->next_gco= gco;
|
|
|
|
gco->installed= true;
|
|
|
|
}
|
|
|
|
wait_count= gco->wait_count;
|
|
|
|
if (wait_count > entry->count_committing_event_groups)
|
|
|
|
{
|
|
|
|
DEBUG_SYNC(thd, "rpl_parallel_start_waiting_for_prior");
|
2014-02-26 16:38:42 +01:00
|
|
|
thd->ENTER_COND(&gco->COND_group_commit_orderer,
|
|
|
|
&entry->LOCK_parallel_entry,
|
2014-11-13 09:56:28 +01:00
|
|
|
&stage_waiting_for_prior_transaction_to_start_commit,
|
2014-02-26 16:38:42 +01:00
|
|
|
&old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
did_enter_cond= true;
|
|
|
|
do
|
2013-07-08 16:47:07 +02:00
|
|
|
{
|
2014-03-21 10:11:28 +01:00
|
|
|
if (thd->check_killed() && !rgi->worker_error)
|
2013-12-06 13:28:23 +01:00
|
|
|
{
|
2013-12-18 16:26:22 +01:00
|
|
|
DEBUG_SYNC(thd, "rpl_parallel_start_waiting_for_prior_killed");
|
2013-12-06 13:28:23 +01:00
|
|
|
thd->send_kill_message();
|
2014-06-25 15:17:03 +02:00
|
|
|
slave_output_error_info(rgi, thd);
|
2014-03-21 10:11:28 +01:00
|
|
|
signal_error_to_sql_driver_thread(thd, rgi, 1);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
Even though we were killed, we need to continue waiting for the
|
|
|
|
prior event groups to signal that we can continue. Otherwise we
|
|
|
|
mess up the accounting for ordering. However, now that we have
|
|
|
|
marked the error, events will just be skipped rather than
|
|
|
|
executed, and things will progress quickly towards stop.
|
|
|
|
*/
|
2013-12-06 13:28:23 +01:00
|
|
|
}
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_cond_wait(&gco->COND_group_commit_orderer,
|
|
|
|
&entry->LOCK_parallel_entry);
|
|
|
|
} while (wait_count > entry->count_committing_event_groups);
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
2013-07-03 13:46:33 +02:00
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if ((tmp_gco= gco->prev_gco))
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
Now all the event groups in the previous batch have entered their
|
|
|
|
commit phase, and will no longer access their gco. So we can free
|
|
|
|
it here.
|
|
|
|
*/
|
|
|
|
DBUG_ASSERT(!tmp_gco->prev_gco);
|
|
|
|
gco->prev_gco= NULL;
|
2014-11-13 10:20:48 +01:00
|
|
|
rpt->loc_free_gco(tmp_gco);
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
2013-07-03 13:46:33 +02:00
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (entry->force_abort && wait_count > entry->stop_count)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
We are stopping (STOP SLAVE), and this event group is beyond the
|
|
|
|
point where we can safely stop. So set a flag that will cause us
|
|
|
|
to skip, rather than execute, the following events.
|
|
|
|
*/
|
2014-03-09 10:27:38 +01:00
|
|
|
skip_event_group= true;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
}
|
|
|
|
else
|
2014-03-09 10:27:38 +01:00
|
|
|
skip_event_group= false;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
|
|
|
if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id))
|
2014-03-09 10:27:38 +01:00
|
|
|
skip_event_group= true;
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
else
|
|
|
|
register_wait_for_prior_event_group_commit(rgi, entry);
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry,
|
2014-02-26 16:38:42 +01:00
|
|
|
&did_enter_cond, &old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
2013-10-30 07:52:30 +01:00
|
|
|
if(thd->wait_for_commit_ptr)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
This indicates that we get a new GTID event in the middle of
|
|
|
|
a not completed event group. This is corrupt binlog (the master
|
|
|
|
will never write such binlog), so it does not happen unless
|
|
|
|
someone tries to inject wrong crafted binlog, but let us still
|
|
|
|
try to handle it somewhat nicely.
|
|
|
|
*/
|
|
|
|
rgi->cleanup_context(thd, true);
|
|
|
|
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
|
2014-03-21 10:11:28 +01:00
|
|
|
thd->wait_for_commit_ptr->wakeup_subsequent_commits(rgi->worker_error);
|
2013-10-30 07:52:30 +01:00
|
|
|
}
|
2013-07-03 13:46:33 +02:00
|
|
|
thd->wait_for_commit_ptr= &rgi->commit_orderer;
|
2014-03-09 10:27:38 +01:00
|
|
|
|
|
|
|
if (opt_gtid_ignore_duplicates)
|
|
|
|
{
|
|
|
|
int res=
|
|
|
|
rpl_global_gtid_slave_state.check_duplicate_gtid(&rgi->current_gtid,
|
2014-03-12 00:14:49 +01:00
|
|
|
rgi);
|
|
|
|
if (res < 0)
|
|
|
|
{
|
|
|
|
/* Error. */
|
2014-06-25 15:17:03 +02:00
|
|
|
slave_output_error_info(rgi, thd);
|
2014-03-21 10:11:28 +01:00
|
|
|
signal_error_to_sql_driver_thread(thd, rgi, 1);
|
2014-03-12 00:14:49 +01:00
|
|
|
}
|
|
|
|
else if (!res)
|
|
|
|
{
|
|
|
|
/* GTID already applied by another master connection, skip. */
|
2014-03-09 10:27:38 +01:00
|
|
|
skip_event_group= true;
|
2014-03-12 00:14:49 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* We have to apply the event. */
|
|
|
|
}
|
2014-03-09 10:27:38 +01:00
|
|
|
}
|
2013-07-03 13:46:33 +02:00
|
|
|
}
|
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
group_ending= is_group_ending(qev->ev, event_type);
|
2014-06-27 13:34:29 +02:00
|
|
|
if (group_ending && likely(!rgi->worker_error))
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
{
|
|
|
|
DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
|
|
|
|
rgi->mark_start_commit();
|
|
|
|
}
|
|
|
|
|
2013-10-08 14:36:06 +02:00
|
|
|
/*
|
|
|
|
If the SQL thread is stopping, we just skip execution of all the
|
|
|
|
following event groups. We still do all the normal waiting and wakeup
|
|
|
|
processing between the event groups as a simple way to ensure that
|
|
|
|
everything is stopped and cleaned up correctly.
|
|
|
|
*/
|
2014-06-27 13:34:29 +02:00
|
|
|
if (likely(!rgi->worker_error) && !skip_event_group)
|
2014-05-08 14:20:18 +02:00
|
|
|
{
|
|
|
|
++rgi->retry_event_count;
|
2014-11-13 10:46:09 +01:00
|
|
|
#ifndef DBUG_OFF
|
|
|
|
err= 0;
|
|
|
|
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid",
|
|
|
|
if (event_type == XID_EVENT)
|
|
|
|
{
|
|
|
|
thd->clear_error();
|
|
|
|
thd->get_stmt_da()->reset_diagnostics_area();
|
|
|
|
my_error(ER_LOCK_DEADLOCK, MYF(0));
|
|
|
|
err= 1;
|
|
|
|
});
|
|
|
|
if (!err)
|
|
|
|
#endif
|
2014-11-13 10:20:48 +01:00
|
|
|
err= rpt_handle_event(qev, rpt);
|
|
|
|
delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
|
2014-05-13 13:42:06 +02:00
|
|
|
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",
|
|
|
|
err= dbug_simulate_tmp_error(rgi, thd););
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
if (err)
|
|
|
|
{
|
|
|
|
convert_kill_to_deadlock_error(rgi);
|
|
|
|
if (has_temporary_error(thd) && slave_trans_retries > 0)
|
2014-11-13 10:20:48 +01:00
|
|
|
err= retry_event_group(rgi, rpt, qev);
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
}
|
2014-05-08 14:20:18 +02:00
|
|
|
}
|
2013-10-08 14:36:06 +02:00
|
|
|
else
|
2014-05-13 13:42:06 +02:00
|
|
|
{
|
2014-11-13 10:20:48 +01:00
|
|
|
delete qev->ev;
|
2013-10-14 15:28:16 +02:00
|
|
|
err= thd->wait_for_prior_commit();
|
2014-05-13 13:42:06 +02:00
|
|
|
}
|
2013-07-03 13:46:33 +02:00
|
|
|
|
2013-07-04 13:17:01 +02:00
|
|
|
end_of_group=
|
|
|
|
in_event_group &&
|
|
|
|
((group_standalone && !Log_event::is_part_of_group(event_type)) ||
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
group_ending);
|
2013-07-03 13:46:33 +02:00
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
rpt->loc_free_qev(qev);
|
2013-07-03 13:46:33 +02:00
|
|
|
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
if (unlikely(err))
|
2013-10-28 13:24:56 +01:00
|
|
|
{
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
if (!rgi->worker_error)
|
|
|
|
{
|
|
|
|
slave_output_error_info(rgi, thd);
|
|
|
|
signal_error_to_sql_driver_thread(thd, rgi, err);
|
|
|
|
}
|
|
|
|
thd->reset_killed();
|
2013-10-28 13:24:56 +01:00
|
|
|
}
|
2013-07-04 13:17:01 +02:00
|
|
|
if (end_of_group)
|
|
|
|
{
|
|
|
|
in_event_group= false;
|
2014-03-21 10:11:28 +01:00
|
|
|
finish_event_group(thd, event_gtid_sub_id, entry, rgi);
|
2014-11-13 10:20:48 +01:00
|
|
|
rpt->loc_free_rgi(rgi);
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
Remove the temporary fix for MDEV-5914, which used READ COMMITTED for parallel
replication worker threads. Replace it with a better, more selective solution.
The issue is with certain edge cases of InnoDB gap locks, for example between
INSERT and ranged DELETE. It is possible for the gap lock set by the DELETE to
block the INSERT, if the DELETE runs first, while the record lock set by
INSERT does not block the DELETE, if the INSERT runs first. This can cause a
conflict between the two in parallel replication on the slave even though they
ran without conflicts on the master.
With this patch, InnoDB will ask the server layer about the two involved
transactions before blocking on a gap lock. If the server layer tells InnoDB
that the transactions are already fixed wrt. commit order, as they are in
parallel replication, InnoDB will ignore the gap lock and allow the two
transactions to proceed in parallel, avoiding the conflict.
Improve the fix for MDEV-6020. When InnoDB itself detects a deadlock, it now
asks the server layer for any preferences about which transaction to roll
back. In case of parallel replication with two transactions T1 and T2 fixed to
commit T1 before T2, the server layer will ask InnoDB to roll back T2 as the
deadlock victim, not T1. This helps in some cases to avoid excessive deadlock
rollback, as T2 will in any case need to wait for T1 to complete before it can
itself commit.
Also some misc. fixes found during development and testing:
- Remove thd_rpl_is_parallel(), it is not used or needed.
- Use KILL_CONNECTION instead of KILL_QUERY when a parallel replication
worker thread is killed to resolve a deadlock with fixed commit
ordering. There are some cases, eg. in sql/sql_parse.cc, where a KILL_QUERY
can be ignored if the query otherwise completed successfully, and this
could cause the deadlock kill to be lost, so that the deadlock was not
correctly resolved.
- Fix random test failure due to missing wait_for_binlog_checkpoint.inc.
- Make sure that deadlock or other temporary errors during parallel
replication are not printed to the the error log; there were some places
around the replication code with extra error logging. These conditions can
occur occasionally and are handled automatically without breaking
replication, so they should not pollute the error log.
- Fix handling of rgi->gtid_sub_id. We need to be able to access this also at
the end of a transaction, to be able to detect and resolve deadlocks due to
commit ordering. But this value was also used as a flag to mark whether
record_gtid() had been called, by being set to zero, losing the value. Now,
introduce a separate flag rgi->gtid_pending, so rgi->gtid_sub_id remains
valid for the entire duration of the transaction.
- Fix one place where the code to handle ignored errors called reset_killed()
unconditionally, even if no error was caught that should be ignored. This
could cause loss of a deadlock kill signal, breaking deadlock detection and
resolution.
- Fix a couple of missing mysql_reset_thd_for_next_command(). This could
cause a prior error condition to remain for the next event executed,
causing assertions about errors already being set and possibly giving
incorrect error handling for following event executions.
- Fix code that cleared thd->rgi_slave in the parallel replication worker
threads after each event execution; this caused the deadlock detection and
handling code to not be able to correctly process the associated
transactions as belonging to replication worker threads.
- Remove useless error code in slave_background_kill_request().
- Fix bug where wfc->wakeup_error was not cleared at
wait_for_commit::unregister_wait_for_prior_commit(). This could cause the
error condition to wrongly propagate to a later wait_for_prior_commit(),
causing spurious ER_PRIOR_COMMIT_FAILED errors.
- Do not put the binlog background thread into the processlist. It causes
too many result differences in mtr, but also it probably is not useful
for users to pollute the process list with a system thread that does not
really perform any user-visible tasks...
2014-06-10 10:13:15 +02:00
|
|
|
thd->rgi_slave= group_rgi= rgi= NULL;
|
2014-03-09 10:27:38 +01:00
|
|
|
skip_event_group= false;
|
2013-12-18 16:26:22 +01:00
|
|
|
DEBUG_SYNC(thd, "rpl_parallel_end_of_group");
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
2014-11-13 10:20:48 +01:00
|
|
|
/*
|
|
|
|
Now that we have the lock, we can move everything from our local free
|
|
|
|
lists to the real free lists that are also accessible from the SQL
|
|
|
|
driver thread.
|
|
|
|
*/
|
|
|
|
rpt->batch_free();
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
if ((events= rpt->event_queue) != NULL)
|
|
|
|
{
|
2013-09-23 14:46:57 +02:00
|
|
|
/*
|
|
|
|
Take next group of events from the replication pool.
|
|
|
|
This is faster than having to wakeup the pool manager thread to give us
|
|
|
|
a new event.
|
|
|
|
*/
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpt->dequeue1(events);
|
2013-06-24 10:50:25 +02:00
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
goto more_events;
|
|
|
|
}
|
2014-11-13 10:20:48 +01:00
|
|
|
rpt->inuse_relaylog_refcount_update();
|
2013-06-24 10:50:25 +02:00
|
|
|
|
2013-11-05 12:01:26 +01:00
|
|
|
if (in_event_group && group_rgi->parallel_entry->force_abort)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
We are asked to abort, without getting the remaining events in the
|
|
|
|
current event group.
|
|
|
|
|
|
|
|
We have to rollback the current transaction and update the last
|
|
|
|
sub_id value so that SQL thread will know we are done with the
|
|
|
|
half-processed event group.
|
|
|
|
*/
|
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
2014-03-21 10:11:28 +01:00
|
|
|
signal_error_to_sql_driver_thread(thd, group_rgi, 1);
|
|
|
|
finish_event_group(thd, group_rgi->gtid_sub_id,
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
group_rgi->parallel_entry, group_rgi);
|
2013-11-05 12:01:26 +01:00
|
|
|
in_event_group= false;
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpt->free_rgi(group_rgi);
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
Remove the temporary fix for MDEV-5914, which used READ COMMITTED for parallel
replication worker threads. Replace it with a better, more selective solution.
The issue is with certain edge cases of InnoDB gap locks, for example between
INSERT and ranged DELETE. It is possible for the gap lock set by the DELETE to
block the INSERT, if the DELETE runs first, while the record lock set by
INSERT does not block the DELETE, if the INSERT runs first. This can cause a
conflict between the two in parallel replication on the slave even though they
ran without conflicts on the master.
With this patch, InnoDB will ask the server layer about the two involved
transactions before blocking on a gap lock. If the server layer tells InnoDB
that the transactions are already fixed wrt. commit order, as they are in
parallel replication, InnoDB will ignore the gap lock and allow the two
transactions to proceed in parallel, avoiding the conflict.
Improve the fix for MDEV-6020. When InnoDB itself detects a deadlock, it now
asks the server layer for any preferences about which transaction to roll
back. In case of parallel replication with two transactions T1 and T2 fixed to
commit T1 before T2, the server layer will ask InnoDB to roll back T2 as the
deadlock victim, not T1. This helps in some cases to avoid excessive deadlock
rollback, as T2 will in any case need to wait for T1 to complete before it can
itself commit.
Also some misc. fixes found during development and testing:
- Remove thd_rpl_is_parallel(), it is not used or needed.
- Use KILL_CONNECTION instead of KILL_QUERY when a parallel replication
worker thread is killed to resolve a deadlock with fixed commit
ordering. There are some cases, eg. in sql/sql_parse.cc, where a KILL_QUERY
can be ignored if the query otherwise completed successfully, and this
could cause the deadlock kill to be lost, so that the deadlock was not
correctly resolved.
- Fix random test failure due to missing wait_for_binlog_checkpoint.inc.
- Make sure that deadlock or other temporary errors during parallel
replication are not printed to the the error log; there were some places
around the replication code with extra error logging. These conditions can
occur occasionally and are handled automatically without breaking
replication, so they should not pollute the error log.
- Fix handling of rgi->gtid_sub_id. We need to be able to access this also at
the end of a transaction, to be able to detect and resolve deadlocks due to
commit ordering. But this value was also used as a flag to mark whether
record_gtid() had been called, by being set to zero, losing the value. Now,
introduce a separate flag rgi->gtid_pending, so rgi->gtid_sub_id remains
valid for the entire duration of the transaction.
- Fix one place where the code to handle ignored errors called reset_killed()
unconditionally, even if no error was caught that should be ignored. This
could cause loss of a deadlock kill signal, breaking deadlock detection and
resolution.
- Fix a couple of missing mysql_reset_thd_for_next_command(). This could
cause a prior error condition to remain for the next event executed,
causing assertions about errors already being set and possibly giving
incorrect error handling for following event executions.
- Fix code that cleared thd->rgi_slave in the parallel replication worker
threads after each event execution; this caused the deadlock detection and
handling code to not be able to correctly process the associated
transactions as belonging to replication worker threads.
- Remove useless error code in slave_background_kill_request().
- Fix bug where wfc->wakeup_error was not cleared at
wait_for_commit::unregister_wait_for_prior_commit(). This could cause the
error condition to wrongly propagate to a later wait_for_prior_commit(),
causing spurious ER_PRIOR_COMMIT_FAILED errors.
- Do not put the binlog background thread into the processlist. It causes
too many result differences in mtr, but also it probably is not useful
for users to pollute the process list with a system thread that does not
really perform any user-visible tasks...
2014-06-10 10:13:15 +02:00
|
|
|
thd->rgi_slave= group_rgi= NULL;
|
2014-03-09 10:27:38 +01:00
|
|
|
skip_event_group= false;
|
2013-11-05 12:01:26 +01:00
|
|
|
}
|
2013-06-24 10:50:25 +02:00
|
|
|
if (!in_event_group)
|
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpt->current_owner= NULL;
|
|
|
|
/* Tell wait_for_done() that we are done, if it is waiting. */
|
|
|
|
if (likely(rpt->current_entry) &&
|
|
|
|
unlikely(rpt->current_entry->force_abort))
|
|
|
|
mysql_cond_broadcast(&rpt->current_entry->COND_parallel_entry);
|
2013-06-24 10:50:25 +02:00
|
|
|
rpt->current_entry= NULL;
|
2013-09-23 14:46:57 +02:00
|
|
|
if (!rpt->stop)
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpt->pool->release_thread(rpt);
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-09 13:15:53 +02:00
|
|
|
rpt->thd= NULL;
|
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
|
|
|
|
thd->clear_error();
|
|
|
|
thd->catalog= 0;
|
|
|
|
thd->reset_query();
|
|
|
|
thd->reset_db(NULL, 0);
|
|
|
|
thd_proc_info(thd, "Slave worker thread exiting");
|
|
|
|
thd->temporary_tables= 0;
|
|
|
|
mysql_mutex_lock(&LOCK_thread_count);
|
|
|
|
THD_CHECK_SENTRY(thd);
|
|
|
|
delete thd;
|
|
|
|
mysql_mutex_unlock(&LOCK_thread_count);
|
|
|
|
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
2013-06-24 10:50:25 +02:00
|
|
|
rpt->running= false;
|
2013-07-09 13:15:53 +02:00
|
|
|
mysql_cond_signal(&rpt->COND_rpl_thread);
|
2013-06-24 10:50:25 +02:00
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
|
2013-07-09 13:15:53 +02:00
|
|
|
my_thread_end();
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
static void
|
|
|
|
dealloc_gco(group_commit_orderer *gco)
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(!gco->prev_gco /* Must only free after dealloc previous */);
|
|
|
|
mysql_cond_destroy(&gco->COND_group_commit_orderer);
|
|
|
|
my_free(gco);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
int
|
|
|
|
rpl_parallel_change_thread_count(rpl_parallel_thread_pool *pool,
|
|
|
|
uint32 new_count, bool skip_check)
|
|
|
|
{
|
|
|
|
uint32 i;
|
|
|
|
rpl_parallel_thread **new_list= NULL;
|
|
|
|
rpl_parallel_thread *new_free_list= NULL;
|
2013-09-23 14:46:57 +02:00
|
|
|
rpl_parallel_thread *rpt_array= NULL;
|
2013-06-24 10:50:25 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
Allocate the new list of threads up-front.
|
|
|
|
That way, if we fail half-way, we only need to free whatever we managed
|
|
|
|
to allocate, and will not be left with a half-functional thread pool.
|
|
|
|
*/
|
|
|
|
if (new_count &&
|
2013-09-23 14:46:57 +02:00
|
|
|
!my_multi_malloc(MYF(MY_WME|MY_ZEROFILL),
|
|
|
|
&new_list, new_count*sizeof(*new_list),
|
|
|
|
&rpt_array, new_count*sizeof(*rpt_array),
|
|
|
|
NULL))
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
2013-09-23 14:46:57 +02:00
|
|
|
my_error(ER_OUTOFMEMORY, MYF(0), (int(new_count*sizeof(*new_list) +
|
|
|
|
new_count*sizeof(*rpt_array))));
|
2013-06-24 10:50:25 +02:00
|
|
|
goto err;;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i= 0; i < new_count; ++i)
|
|
|
|
{
|
|
|
|
pthread_t th;
|
|
|
|
|
2013-09-23 14:46:57 +02:00
|
|
|
new_list[i]= &rpt_array[i];
|
2013-06-24 10:50:25 +02:00
|
|
|
new_list[i]->delay_start= true;
|
|
|
|
mysql_mutex_init(key_LOCK_rpl_thread, &new_list[i]->LOCK_rpl_thread,
|
|
|
|
MY_MUTEX_INIT_SLOW);
|
|
|
|
mysql_cond_init(key_COND_rpl_thread, &new_list[i]->COND_rpl_thread, NULL);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_cond_init(key_COND_rpl_thread_queue,
|
|
|
|
&new_list[i]->COND_rpl_thread_queue, NULL);
|
2013-06-24 10:50:25 +02:00
|
|
|
new_list[i]->pool= pool;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (mysql_thread_create(key_rpl_parallel_thread, &th, &connection_attrib,
|
2013-06-24 10:50:25 +02:00
|
|
|
handle_rpl_parallel_thread, new_list[i]))
|
|
|
|
{
|
|
|
|
my_error(ER_OUT_OF_RESOURCES, MYF(0));
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
new_list[i]->next= new_free_list;
|
|
|
|
new_free_list= new_list[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!skip_check)
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&LOCK_active_mi);
|
|
|
|
if (master_info_index->give_error_if_slave_running())
|
|
|
|
{
|
|
|
|
mysql_mutex_unlock(&LOCK_active_mi);
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
if (pool->changing)
|
|
|
|
{
|
|
|
|
mysql_mutex_unlock(&LOCK_active_mi);
|
|
|
|
my_error(ER_CHANGE_SLAVE_PARALLEL_THREADS_ACTIVE, MYF(0));
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
pool->changing= true;
|
|
|
|
mysql_mutex_unlock(&LOCK_active_mi);
|
|
|
|
}
|
|
|
|
|
2013-09-23 14:46:57 +02:00
|
|
|
/*
|
|
|
|
Grab each old thread in turn, and signal it to stop.
|
|
|
|
|
|
|
|
Note that since we require all replication threads to be stopped before
|
|
|
|
changing the parallel replication worker thread pool, all the threads will
|
|
|
|
be already idle and will terminate immediately.
|
|
|
|
*/
|
2013-06-24 10:50:25 +02:00
|
|
|
for (i= 0; i < pool->count; ++i)
|
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpl_parallel_thread *rpt= pool->get_thread(NULL, NULL);
|
2013-06-24 10:50:25 +02:00
|
|
|
rpt->stop= true;
|
2013-07-09 13:15:53 +02:00
|
|
|
mysql_cond_signal(&rpt->COND_rpl_thread);
|
2013-06-24 10:50:25 +02:00
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i= 0; i < pool->count; ++i)
|
|
|
|
{
|
|
|
|
rpl_parallel_thread *rpt= pool->threads[i];
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
|
|
|
while (rpt->running)
|
|
|
|
mysql_cond_wait(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread);
|
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
2013-07-09 13:15:53 +02:00
|
|
|
mysql_mutex_destroy(&rpt->LOCK_rpl_thread);
|
|
|
|
mysql_cond_destroy(&rpt->COND_rpl_thread);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
while (rpt->qev_free_list)
|
|
|
|
{
|
|
|
|
rpl_parallel_thread::queued_event *next= rpt->qev_free_list->next;
|
|
|
|
my_free(rpt->qev_free_list);
|
|
|
|
rpt->qev_free_list= next;
|
|
|
|
}
|
|
|
|
while (rpt->rgi_free_list)
|
|
|
|
{
|
|
|
|
rpl_group_info *next= rpt->rgi_free_list->next;
|
|
|
|
delete rpt->rgi_free_list;
|
|
|
|
rpt->rgi_free_list= next;
|
|
|
|
}
|
|
|
|
while (rpt->gco_free_list)
|
|
|
|
{
|
|
|
|
group_commit_orderer *next= rpt->gco_free_list->next_gco;
|
|
|
|
dealloc_gco(rpt->gco_free_list);
|
|
|
|
rpt->gco_free_list= next;
|
|
|
|
}
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
my_free(pool->threads);
|
|
|
|
pool->threads= new_list;
|
|
|
|
pool->free_list= new_free_list;
|
|
|
|
pool->count= new_count;
|
|
|
|
for (i= 0; i < pool->count; ++i)
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&pool->threads[i]->LOCK_rpl_thread);
|
|
|
|
pool->threads[i]->delay_start= false;
|
|
|
|
mysql_cond_signal(&pool->threads[i]->COND_rpl_thread);
|
2013-10-14 15:28:16 +02:00
|
|
|
while (!pool->threads[i]->running)
|
|
|
|
mysql_cond_wait(&pool->threads[i]->COND_rpl_thread,
|
|
|
|
&pool->threads[i]->LOCK_rpl_thread);
|
2013-06-24 10:50:25 +02:00
|
|
|
mysql_mutex_unlock(&pool->threads[i]->LOCK_rpl_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!skip_check)
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&LOCK_active_mi);
|
|
|
|
pool->changing= false;
|
|
|
|
mysql_mutex_unlock(&LOCK_active_mi);
|
|
|
|
}
|
2014-03-07 12:08:38 +01:00
|
|
|
|
|
|
|
mysql_mutex_lock(&pool->LOCK_rpl_thread_pool);
|
|
|
|
mysql_cond_broadcast(&pool->COND_rpl_thread_pool);
|
|
|
|
mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool);
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
err:
|
|
|
|
if (new_list)
|
|
|
|
{
|
|
|
|
while (new_free_list)
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&new_free_list->LOCK_rpl_thread);
|
|
|
|
new_free_list->delay_start= false;
|
|
|
|
new_free_list->stop= true;
|
2013-07-09 13:15:53 +02:00
|
|
|
mysql_cond_signal(&new_free_list->COND_rpl_thread);
|
2013-06-24 10:50:25 +02:00
|
|
|
while (!new_free_list->running)
|
|
|
|
mysql_cond_wait(&new_free_list->COND_rpl_thread,
|
|
|
|
&new_free_list->LOCK_rpl_thread);
|
|
|
|
while (new_free_list->running)
|
|
|
|
mysql_cond_wait(&new_free_list->COND_rpl_thread,
|
|
|
|
&new_free_list->LOCK_rpl_thread);
|
2013-07-08 16:47:07 +02:00
|
|
|
mysql_mutex_unlock(&new_free_list->LOCK_rpl_thread);
|
2013-09-23 14:46:57 +02:00
|
|
|
new_free_list= new_free_list->next;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
my_free(new_list);
|
|
|
|
}
|
|
|
|
if (!skip_check)
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&LOCK_active_mi);
|
|
|
|
pool->changing= false;
|
|
|
|
mysql_mutex_unlock(&LOCK_active_mi);
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
void
|
|
|
|
rpl_parallel_thread::batch_free()
|
|
|
|
{
|
|
|
|
mysql_mutex_assert_owner(&LOCK_rpl_thread);
|
|
|
|
if (loc_qev_list)
|
|
|
|
{
|
|
|
|
*loc_qev_last_ptr_ptr= qev_free_list;
|
|
|
|
qev_free_list= loc_qev_list;
|
|
|
|
loc_qev_list= NULL;
|
|
|
|
dequeue2(loc_qev_size);
|
|
|
|
/* Signal that our queue can now accept more events. */
|
|
|
|
mysql_cond_signal(&COND_rpl_thread_queue);
|
|
|
|
loc_qev_size= 0;
|
|
|
|
qev_free_pending= 0;
|
|
|
|
}
|
|
|
|
if (loc_rgi_list)
|
|
|
|
{
|
|
|
|
*loc_rgi_last_ptr_ptr= rgi_free_list;
|
|
|
|
rgi_free_list= loc_rgi_list;
|
|
|
|
loc_rgi_list= NULL;
|
|
|
|
}
|
|
|
|
if (loc_gco_list)
|
|
|
|
{
|
|
|
|
*loc_gco_last_ptr_ptr= gco_free_list;
|
|
|
|
gco_free_list= loc_gco_list;
|
|
|
|
loc_gco_list= NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
rpl_parallel_thread::inuse_relaylog_refcount_update()
|
|
|
|
{
|
|
|
|
inuse_relaylog *ir= accumulated_ir_last;
|
|
|
|
if (ir)
|
|
|
|
{
|
|
|
|
my_atomic_rwlock_wrlock(&ir->rli->inuse_relaylog_atomic_lock);
|
|
|
|
my_atomic_add64(&ir->dequeued_count, accumulated_ir_count);
|
|
|
|
my_atomic_rwlock_wrunlock(&ir->rli->inuse_relaylog_atomic_lock);
|
|
|
|
accumulated_ir_count= 0;
|
|
|
|
accumulated_ir_last= NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpl_parallel_thread::queued_event *
|
2014-05-08 14:20:18 +02:00
|
|
|
rpl_parallel_thread::get_qev_common(Log_event *ev, ulonglong event_size)
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
{
|
|
|
|
queued_event *qev;
|
|
|
|
mysql_mutex_assert_owner(&LOCK_rpl_thread);
|
|
|
|
if ((qev= qev_free_list))
|
|
|
|
qev_free_list= qev->next;
|
|
|
|
else if(!(qev= (queued_event *)my_malloc(sizeof(*qev), MYF(0))))
|
|
|
|
{
|
|
|
|
my_error(ER_OUTOFMEMORY, MYF(0), (int)sizeof(*qev));
|
|
|
|
return NULL;
|
|
|
|
}
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
qev->typ= rpl_parallel_thread::queued_event::QUEUED_EVENT;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
qev->ev= ev;
|
|
|
|
qev->event_size= event_size;
|
|
|
|
qev->next= NULL;
|
2014-05-08 14:20:18 +02:00
|
|
|
return qev;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
rpl_parallel_thread::queued_event *
|
|
|
|
rpl_parallel_thread::get_qev(Log_event *ev, ulonglong event_size,
|
|
|
|
Relay_log_info *rli)
|
|
|
|
{
|
|
|
|
queued_event *qev= get_qev_common(ev, event_size);
|
|
|
|
if (!qev)
|
|
|
|
return NULL;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
strcpy(qev->event_relay_log_name, rli->event_relay_log_name);
|
|
|
|
qev->event_relay_log_pos= rli->event_relay_log_pos;
|
|
|
|
qev->future_event_relay_log_pos= rli->future_event_relay_log_pos;
|
|
|
|
strcpy(qev->future_event_master_log_name, rli->future_event_master_log_name);
|
|
|
|
return qev;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-05-08 14:20:18 +02:00
|
|
|
rpl_parallel_thread::queued_event *
|
|
|
|
rpl_parallel_thread::retry_get_qev(Log_event *ev, queued_event *orig_qev,
|
|
|
|
const char *relay_log_name,
|
|
|
|
ulonglong event_pos, ulonglong event_size)
|
|
|
|
{
|
|
|
|
queued_event *qev= get_qev_common(ev, event_size);
|
|
|
|
if (!qev)
|
|
|
|
return NULL;
|
|
|
|
qev->rgi= orig_qev->rgi;
|
|
|
|
strcpy(qev->event_relay_log_name, relay_log_name);
|
|
|
|
qev->event_relay_log_pos= event_pos;
|
|
|
|
qev->future_event_relay_log_pos= event_pos+event_size;
|
|
|
|
strcpy(qev->future_event_master_log_name,
|
|
|
|
orig_qev->future_event_master_log_name);
|
|
|
|
return qev;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
void
|
|
|
|
rpl_parallel_thread::loc_free_qev(rpl_parallel_thread::queued_event *qev)
|
|
|
|
{
|
|
|
|
inuse_relaylog *ir= qev->ir;
|
|
|
|
inuse_relaylog *last_ir= accumulated_ir_last;
|
|
|
|
if (ir != last_ir)
|
|
|
|
{
|
|
|
|
if (last_ir)
|
|
|
|
inuse_relaylog_refcount_update();
|
|
|
|
accumulated_ir_last= ir;
|
|
|
|
}
|
|
|
|
++accumulated_ir_count;
|
|
|
|
if (!loc_qev_list)
|
|
|
|
loc_qev_last_ptr_ptr= &qev->next;
|
|
|
|
else
|
|
|
|
qev->next= loc_qev_list;
|
|
|
|
loc_qev_list= qev;
|
|
|
|
loc_qev_size+= qev->event_size;
|
|
|
|
/*
|
|
|
|
We want to release to the global free list only occasionally, to avoid
|
|
|
|
having to take the LOCK_rpl_thread muted too many times.
|
|
|
|
|
|
|
|
However, we do need to release regularly. If we let the unreleased part
|
|
|
|
grow too large, then the SQL driver thread may go to sleep waiting for
|
|
|
|
the queue to drop below opt_slave_parallel_max_queued, and this in turn
|
|
|
|
can stall all other worker threads for more stuff to do.
|
|
|
|
*/
|
|
|
|
if (++qev_free_pending >= QEV_BATCH_FREE ||
|
|
|
|
loc_qev_size >= opt_slave_parallel_max_queued/3)
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&LOCK_rpl_thread);
|
|
|
|
batch_free();
|
|
|
|
mysql_mutex_unlock(&LOCK_rpl_thread);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
void
|
|
|
|
rpl_parallel_thread::free_qev(rpl_parallel_thread::queued_event *qev)
|
|
|
|
{
|
|
|
|
mysql_mutex_assert_owner(&LOCK_rpl_thread);
|
|
|
|
qev->next= qev_free_list;
|
|
|
|
qev_free_list= qev;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
rpl_group_info*
|
|
|
|
rpl_parallel_thread::get_rgi(Relay_log_info *rli, Gtid_log_event *gtid_ev,
|
2014-05-08 14:20:18 +02:00
|
|
|
rpl_parallel_entry *e, ulonglong event_size)
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
{
|
|
|
|
rpl_group_info *rgi;
|
|
|
|
mysql_mutex_assert_owner(&LOCK_rpl_thread);
|
|
|
|
if ((rgi= rgi_free_list))
|
|
|
|
{
|
|
|
|
rgi_free_list= rgi->next;
|
|
|
|
rgi->reinit(rli);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if(!(rgi= new rpl_group_info(rli)))
|
|
|
|
{
|
|
|
|
my_error(ER_OUTOFMEMORY, MYF(0), (int)sizeof(*rgi));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
rgi->is_parallel_exec = true;
|
|
|
|
}
|
2014-03-07 12:02:09 +01:00
|
|
|
if ((rgi->deferred_events_collecting= rli->mi->rpl_filter->is_on()) &&
|
|
|
|
!rgi->deferred_events)
|
|
|
|
rgi->deferred_events= new Deferred_log_events(rli);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (event_group_new_gtid(rgi, gtid_ev))
|
|
|
|
{
|
|
|
|
free_rgi(rgi);
|
|
|
|
my_error(ER_OUT_OF_RESOURCES, MYF(MY_WME));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
rgi->parallel_entry= e;
|
2014-05-08 14:20:18 +02:00
|
|
|
rgi->relay_log= rli->last_inuse_relaylog;
|
|
|
|
rgi->retry_start_offset= rli->future_event_relay_log_pos-event_size;
|
|
|
|
rgi->retry_event_count= 0;
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
In parallel replication, we run transactions from the master in parallel, but
force them to commit in the same order they did on the master. If we force T1
to commit before T2, but T2 holds eg. a row lock that is needed by T1, we get
a deadlock when T2 waits until T1 has committed.
Usually, we do not run T1 and T2 in parallel if there is a chance that they
can have conflicting locks like this, but there are certain edge cases where
it can occasionally happen (eg. MDEV-5914, MDEV-5941, MDEV-6020). The bug was
that this would cause replication to hang, eventually getting a lock timeout
and causing the slave to stop with error.
With this patch, InnoDB will report back to the upper layer whenever a
transactions T1 is about to do a lock wait on T2. If T1 and T2 are parallel
replication transactions, and T2 needs to commit later than T1, we can thus
detect the deadlock; we then kill T2, setting a flag that causes it to catch
the kill and convert it to a deadlock error; this error will then cause T2 to
roll back and release its locks (so that T1 can commit), and later T2 will be
re-tried and eventually also committed.
The kill happens asynchroneously in a slave background thread; this is
necessary, as the reporting from InnoDB about lock waits happen deep inside
the locking code, at a point where it is not possible to directly call
THD::awake() due to mutexes held.
Deadlock is assumed to be (very) rarely occuring, so this patch tries to
minimise the performance impact on the normal case where no deadlocks occur,
rather than optimise the handling of the occasional deadlock.
Also fix transaction retry due to deadlock when it happens after a transaction
already signalled to later transactions that it started to commit. In this
case we need to undo this signalling (and later redo it when we commit again
during retry), so following transactions will not start too early.
Also add a missing thd->send_kill_message() that got triggered during testing
(this corrects an incorrect fix for MySQL Bug#58933).
2014-06-03 10:31:11 +02:00
|
|
|
rgi->killed_for_retry= false;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
|
|
|
return rgi;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-11-13 10:20:48 +01:00
|
|
|
void
|
|
|
|
rpl_parallel_thread::loc_free_rgi(rpl_group_info *rgi)
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(rgi->commit_orderer.waitee == NULL);
|
|
|
|
rgi->free_annotate_event();
|
|
|
|
if (!loc_rgi_list)
|
|
|
|
loc_rgi_last_ptr_ptr= &rgi->next;
|
|
|
|
else
|
|
|
|
rgi->next= loc_rgi_list;
|
|
|
|
loc_rgi_list= rgi;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
void
|
|
|
|
rpl_parallel_thread::free_rgi(rpl_group_info *rgi)
|
|
|
|
{
|
|
|
|
mysql_mutex_assert_owner(&LOCK_rpl_thread);
|
|
|
|
DBUG_ASSERT(rgi->commit_orderer.waitee == NULL);
|
|
|
|
rgi->free_annotate_event();
|
|
|
|
rgi->next= rgi_free_list;
|
|
|
|
rgi_free_list= rgi;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
group_commit_orderer *
|
|
|
|
rpl_parallel_thread::get_gco(uint64 wait_count, group_commit_orderer *prev)
|
|
|
|
{
|
|
|
|
group_commit_orderer *gco;
|
|
|
|
mysql_mutex_assert_owner(&LOCK_rpl_thread);
|
|
|
|
if ((gco= gco_free_list))
|
|
|
|
gco_free_list= gco->next_gco;
|
|
|
|
else if(!(gco= (group_commit_orderer *)my_malloc(sizeof(*gco), MYF(0))))
|
|
|
|
{
|
|
|
|
my_error(ER_OUTOFMEMORY, MYF(0), (int)sizeof(*gco));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
mysql_cond_init(key_COND_group_commit_orderer,
|
|
|
|
&gco->COND_group_commit_orderer, NULL);
|
|
|
|
gco->wait_count= wait_count;
|
|
|
|
gco->prev_gco= prev;
|
|
|
|
gco->next_gco= NULL;
|
|
|
|
gco->installed= false;
|
|
|
|
return gco;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
2014-11-13 10:20:48 +01:00
|
|
|
rpl_parallel_thread::loc_free_gco(group_commit_orderer *gco)
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
{
|
|
|
|
DBUG_ASSERT(!gco->prev_gco /* Must not free until wait has completed. */);
|
2014-11-13 10:20:48 +01:00
|
|
|
if (!loc_gco_list)
|
|
|
|
loc_gco_last_ptr_ptr= &gco->next_gco;
|
|
|
|
else
|
|
|
|
gco->next_gco= loc_gco_list;
|
|
|
|
loc_gco_list= gco;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
rpl_parallel_thread_pool::rpl_parallel_thread_pool()
|
|
|
|
: count(0), threads(0), free_list(0), changing(false), inited(false)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
rpl_parallel_thread_pool::init(uint32 size)
|
|
|
|
{
|
|
|
|
count= 0;
|
|
|
|
threads= NULL;
|
|
|
|
free_list= NULL;
|
|
|
|
|
|
|
|
mysql_mutex_init(key_LOCK_rpl_thread_pool, &LOCK_rpl_thread_pool,
|
|
|
|
MY_MUTEX_INIT_SLOW);
|
|
|
|
mysql_cond_init(key_COND_rpl_thread_pool, &COND_rpl_thread_pool, NULL);
|
|
|
|
changing= false;
|
|
|
|
inited= true;
|
|
|
|
|
|
|
|
return rpl_parallel_change_thread_count(this, size, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
rpl_parallel_thread_pool::destroy()
|
|
|
|
{
|
|
|
|
if (!inited)
|
|
|
|
return;
|
|
|
|
rpl_parallel_change_thread_count(this, 0, true);
|
|
|
|
mysql_mutex_destroy(&LOCK_rpl_thread_pool);
|
|
|
|
mysql_cond_destroy(&COND_rpl_thread_pool);
|
2013-06-25 09:30:19 +02:00
|
|
|
inited= false;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-09-23 14:46:57 +02:00
|
|
|
/*
|
|
|
|
Wait for a worker thread to become idle. When one does, grab the thread for
|
|
|
|
our use and return it.
|
|
|
|
|
|
|
|
Note that we return with the worker threads's LOCK_rpl_thread mutex locked.
|
|
|
|
*/
|
2013-06-24 10:50:25 +02:00
|
|
|
struct rpl_parallel_thread *
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpl_parallel_thread_pool::get_thread(rpl_parallel_thread **owner,
|
|
|
|
rpl_parallel_entry *entry)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
|
|
|
rpl_parallel_thread *rpt;
|
|
|
|
|
|
|
|
mysql_mutex_lock(&LOCK_rpl_thread_pool);
|
|
|
|
while ((rpt= free_list) == NULL)
|
|
|
|
mysql_cond_wait(&COND_rpl_thread_pool, &LOCK_rpl_thread_pool);
|
|
|
|
free_list= rpt->next;
|
|
|
|
mysql_mutex_unlock(&LOCK_rpl_thread_pool);
|
2013-06-25 09:30:19 +02:00
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpt->current_owner= owner;
|
2013-06-24 10:50:25 +02:00
|
|
|
rpt->current_entry= entry;
|
|
|
|
|
|
|
|
return rpt;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
Release a thread to the thread pool.
|
|
|
|
The thread should be locked, and should not have any work queued for it.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
rpl_parallel_thread_pool::release_thread(rpl_parallel_thread *rpt)
|
|
|
|
{
|
|
|
|
rpl_parallel_thread *list;
|
|
|
|
|
|
|
|
mysql_mutex_assert_owner(&rpt->LOCK_rpl_thread);
|
|
|
|
DBUG_ASSERT(rpt->current_owner == NULL);
|
|
|
|
mysql_mutex_lock(&LOCK_rpl_thread_pool);
|
|
|
|
list= free_list;
|
|
|
|
rpt->next= list;
|
|
|
|
free_list= rpt;
|
|
|
|
if (!list)
|
|
|
|
mysql_cond_broadcast(&COND_rpl_thread_pool);
|
|
|
|
mysql_mutex_unlock(&LOCK_rpl_thread_pool);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Obtain a worker thread that we can queue an event to.
|
|
|
|
|
|
|
|
Each invocation allocates a new worker thread, to maximise
|
|
|
|
parallelism. However, only up to a maximum of
|
|
|
|
--slave-domain-parallel-threads workers can be occupied by a single
|
|
|
|
replication domain; after that point, we start re-using worker threads that
|
|
|
|
are still executing events that were queued earlier for this thread.
|
|
|
|
|
|
|
|
We never queue more than --rpl-parallel-wait-queue_max amount of events
|
|
|
|
for one worker, to avoid the SQL driver thread using up all memory with
|
|
|
|
queued events while worker threads are stalling.
|
|
|
|
|
|
|
|
Note that this function returns with rpl_parallel_thread::LOCK_rpl_thread
|
|
|
|
locked. Exception is if we were killed, in which case NULL is returned.
|
|
|
|
|
|
|
|
The *did_enter_cond flag is set true if we had to wait for a worker thread
|
2014-02-26 16:38:42 +01:00
|
|
|
to become free (with mysql_cond_wait()). If so, old_stage will also be set,
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
and the LOCK_rpl_thread must be released with THD::EXIT_COND() instead
|
|
|
|
of mysql_mutex_unlock.
|
|
|
|
|
|
|
|
If the flag `reuse' is set, the last worker thread will be returned again,
|
|
|
|
if it is still available. Otherwise a new worker thread is allocated.
|
|
|
|
*/
|
|
|
|
rpl_parallel_thread *
|
2014-06-25 15:17:03 +02:00
|
|
|
rpl_parallel_entry::choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
|
2014-02-26 16:38:42 +01:00
|
|
|
PSI_stage_info *old_stage, bool reuse)
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
{
|
|
|
|
uint32 idx;
|
2014-06-25 15:17:03 +02:00
|
|
|
Relay_log_info *rli= rgi->rli;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpl_parallel_thread *thr;
|
|
|
|
|
|
|
|
idx= rpl_thread_idx;
|
|
|
|
if (!reuse)
|
|
|
|
{
|
|
|
|
++idx;
|
|
|
|
if (idx >= rpl_thread_max)
|
|
|
|
idx= 0;
|
|
|
|
rpl_thread_idx= idx;
|
|
|
|
}
|
|
|
|
thr= rpl_threads[idx];
|
|
|
|
if (thr)
|
|
|
|
{
|
|
|
|
*did_enter_cond= false;
|
|
|
|
mysql_mutex_lock(&thr->LOCK_rpl_thread);
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
if (thr->current_owner != &rpl_threads[idx])
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
The worker thread became idle, and returned to the free list and
|
|
|
|
possibly was allocated to a different request. So we should allocate
|
|
|
|
a new worker thread.
|
|
|
|
*/
|
|
|
|
unlock_or_exit_cond(rli->sql_driver_thd, &thr->LOCK_rpl_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
did_enter_cond, old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
thr= NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else if (thr->queued_size <= opt_slave_parallel_max_queued)
|
|
|
|
{
|
|
|
|
/* The thread is ready to queue into. */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else if (rli->sql_driver_thd->check_killed())
|
|
|
|
{
|
|
|
|
unlock_or_exit_cond(rli->sql_driver_thd, &thr->LOCK_rpl_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
did_enter_cond, old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
my_error(ER_CONNECTION_KILLED, MYF(0));
|
|
|
|
DBUG_EXECUTE_IF("rpl_parallel_wait_queue_max",
|
|
|
|
{
|
|
|
|
debug_sync_set_action(rli->sql_driver_thd,
|
|
|
|
STRING_WITH_LEN("now SIGNAL wait_queue_killed"));
|
|
|
|
};);
|
2014-06-25 15:17:03 +02:00
|
|
|
slave_output_error_info(rgi, rli->sql_driver_thd);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
We have reached the limit of how much memory we are allowed to use
|
|
|
|
for queuing events, so wait for the thread to consume some of its
|
|
|
|
queue.
|
|
|
|
*/
|
|
|
|
if (!*did_enter_cond)
|
|
|
|
{
|
|
|
|
/*
|
2014-02-26 16:38:42 +01:00
|
|
|
We need to do the debug_sync before ENTER_COND().
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
Because debug_sync changes the thd->mysys_var->current_mutex,
|
|
|
|
and this can cause THD::awake to use the wrong mutex.
|
|
|
|
*/
|
|
|
|
DBUG_EXECUTE_IF("rpl_parallel_wait_queue_max",
|
|
|
|
{
|
|
|
|
debug_sync_set_action(rli->sql_driver_thd,
|
|
|
|
STRING_WITH_LEN("now SIGNAL wait_queue_ready"));
|
|
|
|
};);
|
2014-02-26 16:38:42 +01:00
|
|
|
rli->sql_driver_thd->ENTER_COND(&thr->COND_rpl_thread_queue,
|
|
|
|
&thr->LOCK_rpl_thread,
|
|
|
|
&stage_waiting_for_room_in_worker_thread,
|
|
|
|
old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
*did_enter_cond= true;
|
|
|
|
}
|
|
|
|
mysql_cond_wait(&thr->COND_rpl_thread_queue, &thr->LOCK_rpl_thread);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!thr)
|
|
|
|
rpl_threads[idx]= thr= global_rpl_thread_pool.get_thread(&rpl_threads[idx],
|
|
|
|
this);
|
|
|
|
|
|
|
|
return thr;
|
|
|
|
}
|
|
|
|
|
2013-09-16 14:33:49 +02:00
|
|
|
static void
|
|
|
|
free_rpl_parallel_entry(void *element)
|
|
|
|
{
|
|
|
|
rpl_parallel_entry *e= (rpl_parallel_entry *)element;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (e->current_gco)
|
|
|
|
dealloc_gco(e->current_gco);
|
2013-09-16 14:33:49 +02:00
|
|
|
mysql_cond_destroy(&e->COND_parallel_entry);
|
|
|
|
mysql_mutex_destroy(&e->LOCK_parallel_entry);
|
|
|
|
my_free(e);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
rpl_parallel::rpl_parallel() :
|
2013-10-08 14:36:06 +02:00
|
|
|
current(NULL), sql_thread_stopping(false)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
|
|
|
my_hash_init(&domain_hash, &my_charset_bin, 32,
|
|
|
|
offsetof(rpl_parallel_entry, domain_id), sizeof(uint32),
|
2013-09-16 14:33:49 +02:00
|
|
|
NULL, free_rpl_parallel_entry, HASH_UNIQUE);
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-09-17 14:07:21 +02:00
|
|
|
void
|
|
|
|
rpl_parallel::reset()
|
|
|
|
{
|
|
|
|
my_hash_reset(&domain_hash);
|
|
|
|
current= NULL;
|
2013-10-08 14:36:06 +02:00
|
|
|
sql_thread_stopping= false;
|
2013-09-17 14:07:21 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
rpl_parallel::~rpl_parallel()
|
|
|
|
{
|
|
|
|
my_hash_free(&domain_hash);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
rpl_parallel_entry *
|
|
|
|
rpl_parallel::find(uint32 domain_id)
|
|
|
|
{
|
|
|
|
struct rpl_parallel_entry *e;
|
|
|
|
|
|
|
|
if (!(e= (rpl_parallel_entry *)my_hash_search(&domain_hash,
|
|
|
|
(const uchar *)&domain_id, 0)))
|
|
|
|
{
|
|
|
|
/* Allocate a new, empty one. */
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
ulong count= opt_slave_domain_parallel_threads;
|
|
|
|
if (count == 0 || count > opt_slave_parallel_threads)
|
|
|
|
count= opt_slave_parallel_threads;
|
|
|
|
rpl_parallel_thread **p;
|
|
|
|
if (!my_multi_malloc(MYF(MY_WME|MY_ZEROFILL),
|
|
|
|
&e, sizeof(*e),
|
|
|
|
&p, count*sizeof(*p),
|
|
|
|
NULL))
|
|
|
|
{
|
|
|
|
my_error(ER_OUTOFMEMORY, MYF(0), (int)(sizeof(*e)+count*sizeof(*p)));
|
2013-06-24 10:50:25 +02:00
|
|
|
return NULL;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
}
|
|
|
|
e->rpl_threads= p;
|
|
|
|
e->rpl_thread_max= count;
|
2013-06-24 10:50:25 +02:00
|
|
|
e->domain_id= domain_id;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
e->stop_on_error_sub_id= (uint64)ULONGLONG_MAX;
|
2013-06-24 10:50:25 +02:00
|
|
|
if (my_hash_insert(&domain_hash, (uchar *)e))
|
|
|
|
{
|
|
|
|
my_free(e);
|
|
|
|
return NULL;
|
|
|
|
}
|
2013-07-03 13:46:33 +02:00
|
|
|
mysql_mutex_init(key_LOCK_parallel_entry, &e->LOCK_parallel_entry,
|
|
|
|
MY_MUTEX_INIT_FAST);
|
2013-07-04 09:20:56 +02:00
|
|
|
mysql_cond_init(key_COND_parallel_entry, &e->COND_parallel_entry, NULL);
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
2013-11-05 12:01:26 +01:00
|
|
|
else
|
|
|
|
e->force_abort= false;
|
2013-06-24 10:50:25 +02:00
|
|
|
|
|
|
|
return e;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-07-04 09:20:56 +02:00
|
|
|
void
|
2014-03-03 12:13:55 +01:00
|
|
|
rpl_parallel::wait_for_done(THD *thd, Relay_log_info *rli)
|
2013-07-04 09:20:56 +02:00
|
|
|
{
|
|
|
|
struct rpl_parallel_entry *e;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
rpl_parallel_thread *rpt;
|
|
|
|
uint32 i, j;
|
2013-07-04 09:20:56 +02:00
|
|
|
|
2013-11-05 12:01:26 +01:00
|
|
|
/*
|
|
|
|
First signal all workers that they must force quit; no more events will
|
|
|
|
be queued to complete any partial event groups executed.
|
|
|
|
*/
|
|
|
|
for (i= 0; i < domain_hash.records; ++i)
|
|
|
|
{
|
|
|
|
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_mutex_lock(&e->LOCK_parallel_entry);
|
|
|
|
/*
|
|
|
|
We want the worker threads to stop as quickly as is safe. If the slave
|
|
|
|
SQL threads are behind, we could have significant amount of events
|
|
|
|
queued for the workers, and we want to stop without waiting for them
|
|
|
|
all to be applied first. But if any event group has already started
|
|
|
|
executing in a worker, we want to be sure that all prior event groups
|
|
|
|
are also executed, so that we stop at a consistent point in the binlog
|
|
|
|
stream (per replication domain).
|
|
|
|
|
|
|
|
All event groups wait for e->count_committing_event_groups to reach
|
|
|
|
the value of group_commit_orderer::wait_count before starting to
|
|
|
|
execute. Thus, at this point we know that any event group with a
|
|
|
|
strictly larger wait_count are safe to skip, none of them can have
|
|
|
|
started executing yet. So we set e->stop_count here and use it to
|
|
|
|
decide in the worker threads whether to continue executing an event
|
|
|
|
group or whether to skip it, when force_abort is set.
|
2014-03-03 12:13:55 +01:00
|
|
|
|
|
|
|
If we stop due to reaching the START SLAVE UNTIL condition, then we
|
|
|
|
need to continue executing any queued events up to that point.
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
*/
|
2013-11-05 12:01:26 +01:00
|
|
|
e->force_abort= true;
|
2014-03-03 12:13:55 +01:00
|
|
|
e->stop_count= rli->stop_for_until ?
|
|
|
|
e->count_queued_event_groups : e->count_committing_event_groups;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_mutex_unlock(&e->LOCK_parallel_entry);
|
|
|
|
for (j= 0; j < e->rpl_thread_max; ++j)
|
2013-11-05 12:01:26 +01:00
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if ((rpt= e->rpl_threads[j]))
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
|
|
|
if (rpt->current_owner == &e->rpl_threads[j])
|
|
|
|
mysql_cond_signal(&rpt->COND_rpl_thread);
|
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
}
|
2013-11-05 12:01:26 +01:00
|
|
|
}
|
|
|
|
}
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
DBUG_EXECUTE_IF("rpl_parallel_wait_for_done_trigger",
|
|
|
|
{
|
|
|
|
debug_sync_set_action(thd,
|
|
|
|
STRING_WITH_LEN("now SIGNAL wait_for_done_waiting"));
|
|
|
|
};);
|
2013-11-05 12:01:26 +01:00
|
|
|
|
2013-07-04 09:20:56 +02:00
|
|
|
for (i= 0; i < domain_hash.records; ++i)
|
|
|
|
{
|
|
|
|
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
for (j= 0; j < e->rpl_thread_max; ++j)
|
|
|
|
{
|
|
|
|
if ((rpt= e->rpl_threads[j]))
|
|
|
|
{
|
|
|
|
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
|
|
|
|
while (rpt->current_owner == &e->rpl_threads[j])
|
|
|
|
mysql_cond_wait(&e->COND_parallel_entry, &rpt->LOCK_rpl_thread);
|
|
|
|
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
|
|
|
|
}
|
|
|
|
}
|
2013-07-04 09:20:56 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-03-03 12:13:55 +01:00
|
|
|
/*
|
|
|
|
This function handles the case where the SQL driver thread reached the
|
|
|
|
START SLAVE UNTIL position; we stop queueing more events but continue
|
|
|
|
processing remaining, already queued events; then use executes manual
|
|
|
|
STOP SLAVE; then this function signals to worker threads that they
|
|
|
|
should stop the processing of any remaining queued events.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
rpl_parallel::stop_during_until()
|
|
|
|
{
|
|
|
|
struct rpl_parallel_entry *e;
|
|
|
|
uint32 i;
|
|
|
|
|
|
|
|
for (i= 0; i < domain_hash.records; ++i)
|
|
|
|
{
|
|
|
|
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
|
|
|
|
mysql_mutex_lock(&e->LOCK_parallel_entry);
|
|
|
|
if (e->force_abort)
|
|
|
|
e->stop_count= e->count_committing_event_groups;
|
|
|
|
mysql_mutex_unlock(&e->LOCK_parallel_entry);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-01-08 11:00:44 +01:00
|
|
|
bool
|
|
|
|
rpl_parallel::workers_idle()
|
|
|
|
{
|
|
|
|
struct rpl_parallel_entry *e;
|
|
|
|
uint32 i, max_i;
|
|
|
|
|
|
|
|
max_i= domain_hash.records;
|
|
|
|
for (i= 0; i < max_i; ++i)
|
|
|
|
{
|
|
|
|
bool active;
|
|
|
|
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
|
|
|
|
mysql_mutex_lock(&e->LOCK_parallel_entry);
|
|
|
|
active= e->current_sub_id > e->last_committed_sub_id;
|
|
|
|
mysql_mutex_unlock(&e->LOCK_parallel_entry);
|
|
|
|
if (active)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return (i == max_i);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-08-19 14:26:42 +02:00
|
|
|
int
|
|
|
|
rpl_parallel_entry::queue_master_restart(rpl_group_info *rgi,
|
|
|
|
Format_description_log_event *fdev)
|
|
|
|
{
|
|
|
|
uint32 idx;
|
|
|
|
rpl_parallel_thread *thr;
|
|
|
|
rpl_parallel_thread::queued_event *qev;
|
|
|
|
Relay_log_info *rli= rgi->rli;
|
|
|
|
|
|
|
|
/*
|
|
|
|
We only need to queue the server restart if we still have a thread working
|
|
|
|
on a (potentially partial) event group.
|
|
|
|
|
|
|
|
If the last thread we queued for has finished, then it cannot have any
|
|
|
|
partial event group that needs aborting.
|
|
|
|
|
|
|
|
Thus there is no need for the full complexity of choose_thread(). We only
|
|
|
|
need to check if we have a current worker thread, and queue for it if so.
|
|
|
|
*/
|
|
|
|
idx= rpl_thread_idx;
|
|
|
|
thr= rpl_threads[idx];
|
|
|
|
if (!thr)
|
|
|
|
return 0;
|
|
|
|
mysql_mutex_lock(&thr->LOCK_rpl_thread);
|
|
|
|
if (thr->current_owner != &rpl_threads[idx])
|
|
|
|
{
|
|
|
|
/* No active worker thread, so no need to queue the master restart. */
|
|
|
|
mysql_mutex_unlock(&thr->LOCK_rpl_thread);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(qev= thr->get_qev(fdev, 0, rli)))
|
|
|
|
{
|
|
|
|
mysql_mutex_unlock(&thr->LOCK_rpl_thread);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
qev->rgi= rgi;
|
|
|
|
qev->typ= rpl_parallel_thread::queued_event::QUEUED_MASTER_RESTART;
|
|
|
|
qev->entry_for_queued= this;
|
|
|
|
qev->ir= rli->last_inuse_relaylog;
|
|
|
|
++qev->ir->queued_count;
|
|
|
|
thr->enqueue(qev);
|
2014-11-17 12:42:02 +01:00
|
|
|
mysql_cond_signal(&thr->COND_rpl_thread);
|
2014-08-19 14:26:42 +02:00
|
|
|
mysql_mutex_unlock(&thr->LOCK_rpl_thread);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-08-20 10:59:39 +02:00
|
|
|
int
|
2014-07-02 12:51:45 +02:00
|
|
|
rpl_parallel::wait_for_workers_idle(THD *thd)
|
|
|
|
{
|
|
|
|
uint32 i, max_i;
|
|
|
|
|
2014-08-20 10:59:39 +02:00
|
|
|
/*
|
|
|
|
The domain_hash is only accessed by the SQL driver thread, so it is safe
|
|
|
|
to iterate over without a lock.
|
|
|
|
*/
|
2014-07-02 12:51:45 +02:00
|
|
|
max_i= domain_hash.records;
|
|
|
|
for (i= 0; i < max_i; ++i)
|
|
|
|
{
|
|
|
|
bool active;
|
|
|
|
wait_for_commit my_orderer;
|
|
|
|
struct rpl_parallel_entry *e;
|
|
|
|
|
|
|
|
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
|
|
|
|
mysql_mutex_lock(&e->LOCK_parallel_entry);
|
|
|
|
if ((active= (e->current_sub_id > e->last_committed_sub_id)))
|
|
|
|
{
|
|
|
|
wait_for_commit *waitee= &e->current_group_info->commit_orderer;
|
|
|
|
my_orderer.register_wait_for_prior_commit(waitee);
|
|
|
|
thd->wait_for_commit_ptr= &my_orderer;
|
|
|
|
}
|
|
|
|
mysql_mutex_unlock(&e->LOCK_parallel_entry);
|
|
|
|
if (active)
|
|
|
|
{
|
2014-08-20 10:59:39 +02:00
|
|
|
int err= my_orderer.wait_for_prior_commit(thd);
|
2014-07-02 12:51:45 +02:00
|
|
|
thd->wait_for_commit_ptr= NULL;
|
2014-08-20 10:59:39 +02:00
|
|
|
if (err)
|
|
|
|
return err;
|
2014-07-02 12:51:45 +02:00
|
|
|
}
|
|
|
|
}
|
2014-08-20 10:59:39 +02:00
|
|
|
return 0;
|
2014-07-02 12:51:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
This is used when we get an error during processing in do_event();
|
|
|
|
We will not queue any event to the thread, but we still need to wake it up
|
|
|
|
to be sure that it will be returned to the pool.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
abandon_worker_thread(THD *thd, rpl_parallel_thread *cur_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
bool *did_enter_cond, PSI_stage_info *old_stage)
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
{
|
|
|
|
unlock_or_exit_cond(thd, &cur_thread->LOCK_rpl_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
did_enter_cond, old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_cond_signal(&cur_thread->COND_rpl_thread);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
Fixes for parallel slave:
- Made slaves temporary table multi-thread slave safe by adding mutex around save_temporary_table usage.
- rli->save_temporary_tables is the active list of all used temporary tables
- This is copied to THD->temporary_tables when temporary tables are opened and updated when temporary tables are closed
- Added THD->lock_temporary_tables() and THD->unlock_temporary_tables() to simplify this.
- Relay_log_info->sql_thd renamed to Relay_log_info->sql_driver_thd to avoid wrong usage for merged code.
- Added is_part_of_group() to mark functions that are part of the next function. This replaces setting IN_STMT when events are executed.
- Added is_begin(), is_commit() and is_rollback() functions to Query_log_event to simplify code.
- If slave_skip_counter is set run things in single threaded mode. This simplifies code for skipping events.
- Updating state of relay log (IN_STMT and IN_TRANSACTION) is moved to one single function: update_state_of_relay_log()
We can't use OPTION_BEGIN to check for the state anymore as the sql_driver and sql execution threads may be different.
Clear IN_STMT and IN_TRANSACTION in init_relay_log_pos() and Relay_log_info::cleanup_context() to ensure the flags doesn't survive slave restarts
is_in_group() is now independent of state of executed transaction.
- Reset thd->transaction.all.modified_non_trans_table() if we did set it for single table row events.
This was mainly for keeping the flag as documented.
- Changed slave_open_temp_tables to uint32 to be able to use atomic operators on it.
- Relay_log_info::sleep_lock -> rpl_group_info::sleep_lock
- Relay_log_info::sleep_cond -> rpl_group_info::sleep_cond
- Changed some functions to take rpl_group_info instead of Relay_log_info to make them multi-slave safe and to simplify usage
- do_shall_skip()
- continue_group()
- sql_slave_killed()
- next_event()
- Simplifed arguments to io_salve_killed(), check_io_slave_killed() and sql_slave_killed(); No reason to supply THD as this is part of the given structure.
- set_thd_in_use_temporary_tables() removed as in_use is set on usage
- Added information to thd_proc_info() which thread is waiting for slave mutex to exit.
- In open_table() reuse code from find_temporary_table()
Other things:
- More DBUG statements
- Fixed the rpl_incident.test can be run with --debug
- More comments
- Disabled not used function rpl_connect_master()
mysql-test/suite/perfschema/r/all_instances.result:
Moved sleep_lock and sleep_cond to rpl_group_info
mysql-test/suite/rpl/r/rpl_incident.result:
Updated result
mysql-test/suite/rpl/t/rpl_incident-master.opt:
Not needed anymore
mysql-test/suite/rpl/t/rpl_incident.test:
Fixed that test can be run with --debug
sql/handler.cc:
More DBUG_PRINT
sql/log.cc:
More comments
sql/log_event.cc:
Added DBUG statements
do_shall_skip(), continue_group() now takes rpl_group_info param
Use is_begin(), is_commit() and is_rollback() functions instead of inspecting query string
We don't have set slaves temporary tables 'in_use' as this is now done when tables are opened.
Removed IN_STMT flag setting. This is now done in update_state_of_relay_log()
Use IN_TRANSACTION flag to test state of relay log.
In rows_event_stmt_cleanup() reset thd->transaction.all.modified_non_trans_table if we had set this before.
sql/log_event.h:
do_shall_skip(), continue_group() now takes rpl_group_info param
Added is_part_of_group() to mark events that are part of the next event. This replaces setting IN_STMT when events are executed.
Added is_begin(), is_commit() and is_rollback() functions to Query_log_event to simplify code.
sql/log_event_old.cc:
Removed IN_STMT flag setting. This is now done in update_state_of_relay_log()
do_shall_skip(), continue_group() now takes rpl_group_info param
sql/log_event_old.h:
Added is_part_of_group() to mark events that are part of the next event.
do_shall_skip(), continue_group() now takes rpl_group_info param
sql/mysqld.cc:
Changed slave_open_temp_tables to uint32 to be able to use atomic operators on it.
Relay_log_info::sleep_lock -> Rpl_group_info::sleep_lock
Relay_log_info::sleep_cond -> Rpl_group_info::sleep_cond
sql/mysqld.h:
Updated types and names
sql/rpl_gtid.cc:
More DBUG
sql/rpl_parallel.cc:
Updated TODO section
Set thd for event that is execution
Use new is_begin(), is_commit() and is_rollback() functions.
More comments
sql/rpl_rli.cc:
sql_thd -> sql_driver_thd
Relay_log_info::sleep_lock -> rpl_group_info::sleep_lock
Relay_log_info::sleep_cond -> rpl_group_info::sleep_cond
Clear IN_STMT and IN_TRANSACTION in init_relay_log_pos() and Relay_log_info::cleanup_context() to ensure the flags doesn't survive slave restarts.
Reset table->in_use for temporary tables as the table may have been used by another THD.
Use IN_TRANSACTION instead of OPTION_BEGIN to check state of relay log.
Removed IN_STMT flag setting. This is now done in update_state_of_relay_log()
sql/rpl_rli.h:
Changed relay log state flags to bit masks instead of bit positions (most other code we have uses bit masks)
Added IN_TRANSACTION to mark if we are in a BEGIN ... COMMIT section.
save_temporary_tables is now thread safe
Relay_log_info::sleep_lock -> rpl_group_info::sleep_lock
Relay_log_info::sleep_cond -> rpl_group_info::sleep_cond
Relay_log_info->sql_thd renamed to Relay_log_info->sql_driver_thd to avoid wrong usage for merged code
is_in_group() is now independent of state of executed transaction.
sql/slave.cc:
Simplifed arguments to io_salve_killed(), sql_slave_killed() and check_io_slave_killed(); No reason to supply THD as this is part of the given structure.
set_thd_in_use_temporary_tables() removed as in_use is set on usage in sql_base.cc
sql_thd -> sql_driver_thd
More DBUG
Added update_state_of_relay_log() which will calculate the IN_STMT and IN_TRANSACTION state of the relay log after the current element is executed.
If slave_skip_counter is set run things in single threaded mode.
Simplifed arguments to io_salve_killed(), check_io_slave_killed() and sql_slave_killed(); No reason to supply THD as this is part of the given structure.
Added information to thd_proc_info() which thread is waiting for slave mutex to exit.
Disabled not used function rpl_connect_master()
Updated argument to next_event()
sql/sql_base.cc:
Added mutex around usage of slave's temporary tables. The active list is always kept up to date in sql->rgi_slave->save_temporary_tables.
Clear thd->temporary_tables after query (safety)
More DBUG
When using temporary table, set table->in_use to current thd as the THD may be different for slave threads.
Some code is ifdef:ed with REMOVE_AFTER_MERGE_WITH_10 as the given code in 10.0 is not yet in this tree.
In open_table() reuse code from find_temporary_table()
sql/sql_binlog.cc:
rli->sql_thd -> rli->sql_driver_thd
Remove duplicate setting of rgi->rli
sql/sql_class.cc:
Added helper functions rgi_lock_temporary_tables() and rgi_unlock_temporary_tables()
Would have been nicer to have these inline, but there was no easy way to do that
sql/sql_class.h:
Added functions to protect slaves temporary tables
sql/sql_parse.cc:
Added DBUG_PRINT
sql/transaction.cc:
Added comment
2013-10-13 23:24:05 +02:00
|
|
|
/*
|
|
|
|
do_event() is executed by the sql_driver_thd thread.
|
2013-10-14 23:17:16 +02:00
|
|
|
It's main purpose is to find a thread that can execute the query.
|
|
|
|
|
2014-03-04 08:48:32 +01:00
|
|
|
@retval 0 ok, event was accepted
|
|
|
|
@retval 1 error
|
|
|
|
@retval -1 event should be executed serially, in the sql driver thread
|
Fixes for parallel slave:
- Made slaves temporary table multi-thread slave safe by adding mutex around save_temporary_table usage.
- rli->save_temporary_tables is the active list of all used temporary tables
- This is copied to THD->temporary_tables when temporary tables are opened and updated when temporary tables are closed
- Added THD->lock_temporary_tables() and THD->unlock_temporary_tables() to simplify this.
- Relay_log_info->sql_thd renamed to Relay_log_info->sql_driver_thd to avoid wrong usage for merged code.
- Added is_part_of_group() to mark functions that are part of the next function. This replaces setting IN_STMT when events are executed.
- Added is_begin(), is_commit() and is_rollback() functions to Query_log_event to simplify code.
- If slave_skip_counter is set run things in single threaded mode. This simplifies code for skipping events.
- Updating state of relay log (IN_STMT and IN_TRANSACTION) is moved to one single function: update_state_of_relay_log()
We can't use OPTION_BEGIN to check for the state anymore as the sql_driver and sql execution threads may be different.
Clear IN_STMT and IN_TRANSACTION in init_relay_log_pos() and Relay_log_info::cleanup_context() to ensure the flags doesn't survive slave restarts
is_in_group() is now independent of state of executed transaction.
- Reset thd->transaction.all.modified_non_trans_table() if we did set it for single table row events.
This was mainly for keeping the flag as documented.
- Changed slave_open_temp_tables to uint32 to be able to use atomic operators on it.
- Relay_log_info::sleep_lock -> rpl_group_info::sleep_lock
- Relay_log_info::sleep_cond -> rpl_group_info::sleep_cond
- Changed some functions to take rpl_group_info instead of Relay_log_info to make them multi-slave safe and to simplify usage
- do_shall_skip()
- continue_group()
- sql_slave_killed()
- next_event()
- Simplifed arguments to io_salve_killed(), check_io_slave_killed() and sql_slave_killed(); No reason to supply THD as this is part of the given structure.
- set_thd_in_use_temporary_tables() removed as in_use is set on usage
- Added information to thd_proc_info() which thread is waiting for slave mutex to exit.
- In open_table() reuse code from find_temporary_table()
Other things:
- More DBUG statements
- Fixed the rpl_incident.test can be run with --debug
- More comments
- Disabled not used function rpl_connect_master()
mysql-test/suite/perfschema/r/all_instances.result:
Moved sleep_lock and sleep_cond to rpl_group_info
mysql-test/suite/rpl/r/rpl_incident.result:
Updated result
mysql-test/suite/rpl/t/rpl_incident-master.opt:
Not needed anymore
mysql-test/suite/rpl/t/rpl_incident.test:
Fixed that test can be run with --debug
sql/handler.cc:
More DBUG_PRINT
sql/log.cc:
More comments
sql/log_event.cc:
Added DBUG statements
do_shall_skip(), continue_group() now takes rpl_group_info param
Use is_begin(), is_commit() and is_rollback() functions instead of inspecting query string
We don't have set slaves temporary tables 'in_use' as this is now done when tables are opened.
Removed IN_STMT flag setting. This is now done in update_state_of_relay_log()
Use IN_TRANSACTION flag to test state of relay log.
In rows_event_stmt_cleanup() reset thd->transaction.all.modified_non_trans_table if we had set this before.
sql/log_event.h:
do_shall_skip(), continue_group() now takes rpl_group_info param
Added is_part_of_group() to mark events that are part of the next event. This replaces setting IN_STMT when events are executed.
Added is_begin(), is_commit() and is_rollback() functions to Query_log_event to simplify code.
sql/log_event_old.cc:
Removed IN_STMT flag setting. This is now done in update_state_of_relay_log()
do_shall_skip(), continue_group() now takes rpl_group_info param
sql/log_event_old.h:
Added is_part_of_group() to mark events that are part of the next event.
do_shall_skip(), continue_group() now takes rpl_group_info param
sql/mysqld.cc:
Changed slave_open_temp_tables to uint32 to be able to use atomic operators on it.
Relay_log_info::sleep_lock -> Rpl_group_info::sleep_lock
Relay_log_info::sleep_cond -> Rpl_group_info::sleep_cond
sql/mysqld.h:
Updated types and names
sql/rpl_gtid.cc:
More DBUG
sql/rpl_parallel.cc:
Updated TODO section
Set thd for event that is execution
Use new is_begin(), is_commit() and is_rollback() functions.
More comments
sql/rpl_rli.cc:
sql_thd -> sql_driver_thd
Relay_log_info::sleep_lock -> rpl_group_info::sleep_lock
Relay_log_info::sleep_cond -> rpl_group_info::sleep_cond
Clear IN_STMT and IN_TRANSACTION in init_relay_log_pos() and Relay_log_info::cleanup_context() to ensure the flags doesn't survive slave restarts.
Reset table->in_use for temporary tables as the table may have been used by another THD.
Use IN_TRANSACTION instead of OPTION_BEGIN to check state of relay log.
Removed IN_STMT flag setting. This is now done in update_state_of_relay_log()
sql/rpl_rli.h:
Changed relay log state flags to bit masks instead of bit positions (most other code we have uses bit masks)
Added IN_TRANSACTION to mark if we are in a BEGIN ... COMMIT section.
save_temporary_tables is now thread safe
Relay_log_info::sleep_lock -> rpl_group_info::sleep_lock
Relay_log_info::sleep_cond -> rpl_group_info::sleep_cond
Relay_log_info->sql_thd renamed to Relay_log_info->sql_driver_thd to avoid wrong usage for merged code
is_in_group() is now independent of state of executed transaction.
sql/slave.cc:
Simplifed arguments to io_salve_killed(), sql_slave_killed() and check_io_slave_killed(); No reason to supply THD as this is part of the given structure.
set_thd_in_use_temporary_tables() removed as in_use is set on usage in sql_base.cc
sql_thd -> sql_driver_thd
More DBUG
Added update_state_of_relay_log() which will calculate the IN_STMT and IN_TRANSACTION state of the relay log after the current element is executed.
If slave_skip_counter is set run things in single threaded mode.
Simplifed arguments to io_salve_killed(), check_io_slave_killed() and sql_slave_killed(); No reason to supply THD as this is part of the given structure.
Added information to thd_proc_info() which thread is waiting for slave mutex to exit.
Disabled not used function rpl_connect_master()
Updated argument to next_event()
sql/sql_base.cc:
Added mutex around usage of slave's temporary tables. The active list is always kept up to date in sql->rgi_slave->save_temporary_tables.
Clear thd->temporary_tables after query (safety)
More DBUG
When using temporary table, set table->in_use to current thd as the THD may be different for slave threads.
Some code is ifdef:ed with REMOVE_AFTER_MERGE_WITH_10 as the given code in 10.0 is not yet in this tree.
In open_table() reuse code from find_temporary_table()
sql/sql_binlog.cc:
rli->sql_thd -> rli->sql_driver_thd
Remove duplicate setting of rgi->rli
sql/sql_class.cc:
Added helper functions rgi_lock_temporary_tables() and rgi_unlock_temporary_tables()
Would have been nicer to have these inline, but there was no easy way to do that
sql/sql_class.h:
Added functions to protect slaves temporary tables
sql/sql_parse.cc:
Added DBUG_PRINT
sql/transaction.cc:
Added comment
2013-10-13 23:24:05 +02:00
|
|
|
*/
|
|
|
|
|
2014-03-04 08:48:32 +01:00
|
|
|
int
|
2013-10-24 12:44:21 +02:00
|
|
|
rpl_parallel::do_event(rpl_group_info *serial_rgi, Log_event *ev,
|
|
|
|
ulonglong event_size)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
|
|
|
rpl_parallel_entry *e;
|
|
|
|
rpl_parallel_thread *cur_thread;
|
|
|
|
rpl_parallel_thread::queued_event *qev;
|
2013-09-23 14:46:57 +02:00
|
|
|
rpl_group_info *rgi= NULL;
|
2013-07-03 13:46:33 +02:00
|
|
|
Relay_log_info *rli= serial_rgi->rli;
|
|
|
|
enum Log_event_type typ;
|
2013-10-08 14:36:06 +02:00
|
|
|
bool is_group_event;
|
2013-12-06 13:28:23 +01:00
|
|
|
bool did_enter_cond= false;
|
2014-02-03 15:22:39 +01:00
|
|
|
PSI_stage_info old_stage;
|
2013-06-24 10:50:25 +02:00
|
|
|
|
2014-03-04 08:48:32 +01:00
|
|
|
/* Handle master log name change, seen in Rotate_log_event. */
|
|
|
|
typ= ev->get_type_code();
|
|
|
|
if (unlikely(typ == ROTATE_EVENT))
|
|
|
|
{
|
|
|
|
Rotate_log_event *rev= static_cast<Rotate_log_event *>(ev);
|
|
|
|
if ((rev->server_id != global_system_variables.server_id ||
|
|
|
|
rli->replicate_same_server_id) &&
|
|
|
|
!rev->is_relay_log_event() &&
|
|
|
|
!rli->is_in_group())
|
|
|
|
{
|
|
|
|
memcpy(rli->future_event_master_log_name,
|
|
|
|
rev->new_log_ident, rev->ident_len+1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
Execute queries non-parallel if slave_skip_counter is set, as it's is
|
|
|
|
easier to skip queries in single threaded mode.
|
|
|
|
*/
|
|
|
|
if (rli->slave_skip_counter)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/* Execute pre-10.0 event, which have no GTID, in single-threaded mode. */
|
|
|
|
if (unlikely(!current) && typ != GTID_EVENT)
|
|
|
|
return -1;
|
|
|
|
|
2013-06-25 09:30:19 +02:00
|
|
|
/* ToDo: what to do with this lock?!? */
|
|
|
|
mysql_mutex_unlock(&rli->data_lock);
|
|
|
|
|
2014-07-02 12:51:45 +02:00
|
|
|
if (typ == FORMAT_DESCRIPTION_EVENT)
|
|
|
|
{
|
|
|
|
Format_description_log_event *fdev=
|
|
|
|
static_cast<Format_description_log_event *>(ev);
|
|
|
|
if (fdev->created)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
This format description event marks a new binlog after a master server
|
|
|
|
restart. We are going to close all temporary tables to clean up any
|
|
|
|
possible left-overs after a prior master crash.
|
|
|
|
|
|
|
|
Thus we need to wait for all prior events to execute to completion,
|
|
|
|
in case they need access to any of the temporary tables.
|
2014-08-19 14:26:42 +02:00
|
|
|
|
|
|
|
We also need to notify the worker thread running the prior incomplete
|
|
|
|
event group (if any), as such event group signifies an incompletely
|
|
|
|
written group cut short by a master crash, and must be rolled back.
|
2014-07-02 12:51:45 +02:00
|
|
|
*/
|
2014-08-20 10:59:39 +02:00
|
|
|
if (current->queue_master_restart(serial_rgi, fdev) ||
|
|
|
|
wait_for_workers_idle(rli->sql_driver_thd))
|
2014-08-19 14:26:42 +02:00
|
|
|
{
|
|
|
|
delete ev;
|
|
|
|
return 1;
|
|
|
|
}
|
2014-07-02 12:51:45 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-10-08 14:36:06 +02:00
|
|
|
/*
|
|
|
|
Stop queueing additional event groups once the SQL thread is requested to
|
|
|
|
stop.
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
|
|
|
We have to queue any remaining events of any event group that has already
|
|
|
|
been partially queued, but after that we will just ignore any further
|
|
|
|
events the SQL driver thread may try to queue, and eventually it will stop.
|
2013-10-08 14:36:06 +02:00
|
|
|
*/
|
2014-03-04 08:48:32 +01:00
|
|
|
is_group_event= Log_event::is_group_event(typ);
|
|
|
|
if ((typ == GTID_EVENT || !is_group_event) && rli->abort_slave)
|
2013-10-08 14:36:06 +02:00
|
|
|
sql_thread_stopping= true;
|
|
|
|
if (sql_thread_stopping)
|
2013-10-14 23:17:16 +02:00
|
|
|
{
|
2014-01-03 12:20:53 +01:00
|
|
|
delete ev;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
2014-03-04 08:48:32 +01:00
|
|
|
Return "no error"; normal stop is not an error, and otherwise the error
|
|
|
|
has already been recorded.
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
*/
|
2014-03-04 08:48:32 +01:00
|
|
|
return 0;
|
2013-10-14 23:17:16 +02:00
|
|
|
}
|
2013-10-08 14:36:06 +02:00
|
|
|
|
2014-03-04 08:48:32 +01:00
|
|
|
if (typ == GTID_EVENT)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
2014-05-13 13:42:06 +02:00
|
|
|
Gtid_log_event *gtid_ev= static_cast<Gtid_log_event *>(ev);
|
|
|
|
uint32 domain_id= (rli->mi->using_gtid == Master_info::USE_GTID_NO ?
|
|
|
|
0 : gtid_ev->domain_id);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (!(e= find(domain_id)))
|
|
|
|
{
|
|
|
|
my_error(ER_OUT_OF_RESOURCES, MYF(MY_WME));
|
|
|
|
delete ev;
|
2014-03-04 08:48:32 +01:00
|
|
|
return 1;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
}
|
|
|
|
current= e;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
e= current;
|
|
|
|
|
|
|
|
/*
|
|
|
|
Find a worker thread to queue the event for.
|
|
|
|
Prefer a new thread, so we maximise parallelism (at least for the group
|
|
|
|
commit). But do not exceed a limit of --slave-domain-parallel-threads;
|
|
|
|
instead re-use a thread that we queued for previously.
|
|
|
|
*/
|
|
|
|
cur_thread=
|
2014-06-25 15:17:03 +02:00
|
|
|
e->choose_thread(serial_rgi, &did_enter_cond, &old_stage,
|
|
|
|
typ != GTID_EVENT);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (!cur_thread)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/* This means we were killed. The error is already signalled. */
|
|
|
|
delete ev;
|
2014-03-04 08:48:32 +01:00
|
|
|
return 1;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!(qev= cur_thread->get_qev(ev, event_size, rli)))
|
|
|
|
{
|
|
|
|
abandon_worker_thread(rli->sql_driver_thd, cur_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
&did_enter_cond, &old_stage);
|
2014-01-03 12:20:53 +01:00
|
|
|
delete ev;
|
2014-03-04 08:48:32 +01:00
|
|
|
return 1;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
|
2013-10-08 14:36:06 +02:00
|
|
|
if (typ == GTID_EVENT)
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
|
|
|
Gtid_log_event *gtid_ev= static_cast<Gtid_log_event *>(ev);
|
|
|
|
|
2014-05-08 14:20:18 +02:00
|
|
|
if (!(rgi= cur_thread->get_rgi(rli, gtid_ev, e, event_size)))
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
cur_thread->free_qev(qev);
|
|
|
|
abandon_worker_thread(rli->sql_driver_thd, cur_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
&did_enter_cond, &old_stage);
|
2014-01-03 12:20:53 +01:00
|
|
|
delete ev;
|
2014-03-04 08:48:32 +01:00
|
|
|
return 1;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
We queue the event group in a new worker thread, to run in parallel
|
|
|
|
with previous groups.
|
2013-06-24 10:50:25 +02:00
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
To preserve commit order within the replication domain, we set up
|
|
|
|
rgi->wait_commit_sub_id to make the new group commit only after the
|
|
|
|
previous group has committed.
|
2013-12-18 16:26:22 +01:00
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
Event groups that group-committed together on the master can be run
|
|
|
|
in parallel with each other without restrictions. But one batch of
|
|
|
|
group-commits may not start before all groups in the previous batch
|
|
|
|
have initiated their commit phase; we set up rgi->gco to ensure that.
|
|
|
|
*/
|
|
|
|
rgi->wait_commit_sub_id= e->current_sub_id;
|
|
|
|
rgi->wait_commit_group_info= e->current_group_info;
|
|
|
|
|
|
|
|
if (!((gtid_ev->flags2 & Gtid_log_event::FL_GROUP_COMMIT_ID) &&
|
|
|
|
e->last_commit_id == gtid_ev->commit_id))
|
2013-06-24 10:50:25 +02:00
|
|
|
{
|
2013-09-23 14:46:57 +02:00
|
|
|
/*
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
A new batch of transactions that group-committed together on the master.
|
|
|
|
|
|
|
|
Remember the count that marks the end of the previous group committed
|
|
|
|
batch, and allocate a new gco.
|
2013-09-23 14:46:57 +02:00
|
|
|
*/
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
uint64 count= e->count_queued_event_groups;
|
|
|
|
group_commit_orderer *gco;
|
2013-07-08 16:47:07 +02:00
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
if (!(gco= cur_thread->get_gco(count, e->current_gco)))
|
2013-07-08 16:47:07 +02:00
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
cur_thread->free_rgi(rgi);
|
|
|
|
cur_thread->free_qev(qev);
|
|
|
|
abandon_worker_thread(rli->sql_driver_thd, cur_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
&did_enter_cond, &old_stage);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
delete ev;
|
2014-03-04 08:48:32 +01:00
|
|
|
return 1;
|
2013-07-08 16:47:07 +02:00
|
|
|
}
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
e->current_gco= rgi->gco= gco;
|
2013-07-08 16:47:07 +02:00
|
|
|
}
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
else
|
|
|
|
rgi->gco= e->current_gco;
|
2013-07-08 16:47:07 +02:00
|
|
|
if (gtid_ev->flags2 & Gtid_log_event::FL_GROUP_COMMIT_ID)
|
|
|
|
e->last_commit_id= gtid_ev->commit_id;
|
|
|
|
else
|
|
|
|
e->last_commit_id= 0;
|
2013-10-31 14:11:41 +01:00
|
|
|
qev->rgi= e->current_group_info= rgi;
|
2013-07-03 13:46:33 +02:00
|
|
|
e->current_sub_id= rgi->gtid_sub_id;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
++e->count_queued_event_groups;
|
2013-07-03 13:46:33 +02:00
|
|
|
}
|
2014-03-04 08:48:32 +01:00
|
|
|
else if (!is_group_event)
|
2013-07-03 13:46:33 +02:00
|
|
|
{
|
2013-10-28 13:24:56 +01:00
|
|
|
int err;
|
2013-10-31 14:11:41 +01:00
|
|
|
bool tmp;
|
2013-07-03 13:46:33 +02:00
|
|
|
/*
|
|
|
|
Events like ROTATE and FORMAT_DESCRIPTION. Do not run in worker thread.
|
|
|
|
Same for events not preceeded by GTID (we should not see those normally,
|
|
|
|
but they might be from an old master).
|
|
|
|
*/
|
|
|
|
qev->rgi= serial_rgi;
|
2013-10-23 15:03:03 +02:00
|
|
|
|
2013-10-31 14:11:41 +01:00
|
|
|
tmp= serial_rgi->is_parallel_exec;
|
|
|
|
serial_rgi->is_parallel_exec= true;
|
2013-10-28 13:24:56 +01:00
|
|
|
err= rpt_handle_event(qev, NULL);
|
2013-10-31 14:11:41 +01:00
|
|
|
serial_rgi->is_parallel_exec= tmp;
|
2014-04-09 14:42:46 +02:00
|
|
|
if (ev->is_relay_log_event())
|
|
|
|
qev->future_event_master_log_pos= 0;
|
|
|
|
else if (typ == ROTATE_EVENT)
|
|
|
|
qev->future_event_master_log_pos=
|
|
|
|
(static_cast<Rotate_log_event *>(ev))->pos;
|
|
|
|
else
|
|
|
|
qev->future_event_master_log_pos= ev->log_pos;
|
2014-03-04 08:48:32 +01:00
|
|
|
delete_or_keep_event_post_apply(serial_rgi, typ, ev);
|
2013-07-04 13:17:01 +02:00
|
|
|
|
2013-10-31 14:11:41 +01:00
|
|
|
if (err)
|
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
cur_thread->free_qev(qev);
|
|
|
|
abandon_worker_thread(rli->sql_driver_thd, cur_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
&did_enter_cond, &old_stage);
|
2014-03-04 08:48:32 +01:00
|
|
|
return 1;
|
2013-10-31 14:11:41 +01:00
|
|
|
}
|
|
|
|
/*
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
Queue a position update, so that the position will be updated in a
|
2013-10-31 14:11:41 +01:00
|
|
|
reasonable way relative to other events:
|
|
|
|
|
|
|
|
- If the currently executing events are queued serially for a single
|
|
|
|
thread, the position will only be updated when everything before has
|
|
|
|
completed.
|
|
|
|
|
|
|
|
- If we are executing multiple independent events in parallel, then at
|
|
|
|
least the position will not be updated until one of them has reached
|
|
|
|
the current point.
|
|
|
|
*/
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
qev->typ= rpl_parallel_thread::queued_event::QUEUED_POS_UPDATE;
|
|
|
|
qev->entry_for_queued= e;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
qev->rgi= e->current_group_info;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|
2013-07-03 13:46:33 +02:00
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
/*
|
|
|
|
Queue the event for processing.
|
|
|
|
*/
|
2013-10-29 11:52:16 +01:00
|
|
|
rli->event_relay_log_pos= rli->future_event_relay_log_pos;
|
2014-05-15 15:52:08 +02:00
|
|
|
qev->ir= rli->last_inuse_relaylog;
|
|
|
|
++qev->ir->queued_count;
|
2013-10-24 12:44:21 +02:00
|
|
|
cur_thread->enqueue(qev);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
unlock_or_exit_cond(rli->sql_driver_thd, &cur_thread->LOCK_rpl_thread,
|
2014-02-26 16:38:42 +01:00
|
|
|
&did_enter_cond, &old_stage);
|
2013-09-23 14:46:57 +02:00
|
|
|
mysql_cond_signal(&cur_thread->COND_rpl_thread);
|
2013-06-24 10:50:25 +02:00
|
|
|
|
2014-03-04 08:48:32 +01:00
|
|
|
return 0;
|
2013-06-24 10:50:25 +02:00
|
|
|
}
|