2013-06-24 10:50:25 +02:00
|
|
|
#ifndef RPL_PARALLEL_H
|
|
|
|
#define RPL_PARALLEL_H
|
|
|
|
|
|
|
|
#include "log_event.h"
|
|
|
|
|
|
|
|
|
|
|
|
struct rpl_parallel;
|
|
|
|
struct rpl_parallel_entry;
|
|
|
|
struct rpl_parallel_thread_pool;
|
|
|
|
|
|
|
|
class Relay_log_info;
|
2014-05-15 15:52:08 +02:00
|
|
|
struct inuse_relaylog;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Structure used to keep track of the parallel replication of a batch of
|
|
|
|
event-groups that group-committed together on the master.
|
|
|
|
|
|
|
|
It is used to ensure that every event group in one batch has reached the
|
|
|
|
commit stage before the next batch starts executing.
|
|
|
|
|
|
|
|
Note the lifetime of this structure:
|
|
|
|
|
|
|
|
- It is allocated when the first event in a new batch of group commits
|
|
|
|
is queued, from the free list rpl_parallel_entry::gco_free_list.
|
|
|
|
|
|
|
|
- The gco for the batch currently being queued is owned by
|
|
|
|
rpl_parallel_entry::current_gco. The gco for a previous batch that has
|
|
|
|
been fully queued is owned by the gco->prev_gco pointer of the gco for
|
|
|
|
the following batch.
|
|
|
|
|
|
|
|
- The worker thread waits on gco->COND_group_commit_orderer for
|
|
|
|
rpl_parallel_entry::count_committing_event_groups to reach wait_count
|
|
|
|
before starting; the first waiter links the gco into the next_gco
|
|
|
|
pointer of the gco of the previous batch for signalling.
|
|
|
|
|
|
|
|
- When an event group reaches the commit stage, it signals the
|
|
|
|
COND_group_commit_orderer if its gco->next_gco pointer is non-NULL and
|
|
|
|
rpl_parallel_entry::count_committing_event_groups has reached
|
|
|
|
gco->next_gco->wait_count.
|
|
|
|
|
|
|
|
- When gco->wait_count is reached for a worker and the wait completes,
|
|
|
|
the worker frees gco->prev_gco; at this point it is guaranteed not to
|
|
|
|
be needed any longer.
|
|
|
|
*/
|
|
|
|
struct group_commit_orderer {
|
|
|
|
/* Wakeup condition, used with rpl_parallel_entry::LOCK_parallel_entry. */
|
|
|
|
mysql_cond_t COND_group_commit_orderer;
|
|
|
|
uint64 wait_count;
|
|
|
|
group_commit_orderer *prev_gco;
|
|
|
|
group_commit_orderer *next_gco;
|
|
|
|
bool installed;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2013-06-24 10:50:25 +02:00
|
|
|
struct rpl_parallel_thread {
|
|
|
|
bool delay_start;
|
|
|
|
bool running;
|
|
|
|
bool stop;
|
|
|
|
mysql_mutex_t LOCK_rpl_thread;
|
|
|
|
mysql_cond_t COND_rpl_thread;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_cond_t COND_rpl_thread_queue;
|
2013-06-24 10:50:25 +02:00
|
|
|
struct rpl_parallel_thread *next; /* For free list. */
|
|
|
|
struct rpl_parallel_thread_pool *pool;
|
|
|
|
THD *thd;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
Who owns the thread, if any (it's a pointer into the
|
|
|
|
rpl_parallel_entry::rpl_threads array.
|
|
|
|
*/
|
|
|
|
struct rpl_parallel_thread **current_owner;
|
|
|
|
/* The rpl_parallel_entry of the owner. */
|
|
|
|
rpl_parallel_entry *current_entry;
|
2013-06-24 10:50:25 +02:00
|
|
|
struct queued_event {
|
|
|
|
queued_event *next;
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
/*
|
|
|
|
queued_event can hold either an event to be executed, or just a binlog
|
|
|
|
position to be updated without any associated event.
|
|
|
|
*/
|
2014-08-19 14:26:42 +02:00
|
|
|
enum queued_event_t {
|
|
|
|
QUEUED_EVENT,
|
|
|
|
QUEUED_POS_UPDATE,
|
|
|
|
QUEUED_MASTER_RESTART
|
|
|
|
} typ;
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
union {
|
|
|
|
Log_event *ev; /* QUEUED_EVENT */
|
2014-08-19 14:26:42 +02:00
|
|
|
rpl_parallel_entry *entry_for_queued; /* QUEUED_POS_UPDATE and
|
|
|
|
QUEUED_MASTER_RESTART */
|
MDEV-6551: Some replication errors are ignored if slave_parallel_threads > 0
The problem occured when using parallel replication, and an error occured that
caused the SQL thread to stop when the IO thread had already reached a
following binlog file from the master (or otherwise performed a relay log
rotation).
In this case, the Rotate Event at the end of the relay log file could still be
executed, even though an earlier event in that relay log file had gotten an
error. This would cause the position to be incorrectly updated, so that upon
restart of the SQL thread, the event that had failed would be silently skipped
and ignored, causing replication corruption.
Fixed by checking before executing Rotate Event, whether an earlier event
has failed. If so, the Rotate Event is not executed, just dequeued, same as
for other normal events following a failing event.
2014-08-15 11:31:13 +02:00
|
|
|
};
|
2013-09-13 15:09:57 +02:00
|
|
|
rpl_group_info *rgi;
|
2014-05-15 15:52:08 +02:00
|
|
|
inuse_relaylog *ir;
|
2013-10-17 14:11:19 +02:00
|
|
|
ulonglong future_event_relay_log_pos;
|
2013-10-23 15:03:03 +02:00
|
|
|
char event_relay_log_name[FN_REFLEN];
|
|
|
|
char future_event_master_log_name[FN_REFLEN];
|
|
|
|
ulonglong event_relay_log_pos;
|
2013-10-31 14:11:41 +01:00
|
|
|
my_off_t future_event_master_log_pos;
|
2013-10-24 12:44:21 +02:00
|
|
|
size_t event_size;
|
2013-06-24 10:50:25 +02:00
|
|
|
} *event_queue, *last_in_queue;
|
2013-10-24 12:44:21 +02:00
|
|
|
uint64 queued_size;
|
2014-11-13 10:20:48 +01:00
|
|
|
/* These free lists are protected by LOCK_rpl_thread. */
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
queued_event *qev_free_list;
|
|
|
|
rpl_group_info *rgi_free_list;
|
|
|
|
group_commit_orderer *gco_free_list;
|
2014-11-13 10:20:48 +01:00
|
|
|
/*
|
|
|
|
These free lists are local to the thread, so need not be protected by any
|
|
|
|
lock. They are moved to the global free lists in batches in the function
|
|
|
|
batch_free(), to reduce LOCK_rpl_thread contention.
|
|
|
|
|
|
|
|
The lists are not NULL-terminated (as we do not need to traverse them).
|
|
|
|
Instead, if they are non-NULL, the loc_XXX_last_ptr_ptr points to the
|
|
|
|
`next' pointer of the last element, which is used to link into the front
|
|
|
|
of the global freelists.
|
|
|
|
*/
|
|
|
|
queued_event *loc_qev_list, **loc_qev_last_ptr_ptr;
|
|
|
|
size_t loc_qev_size;
|
|
|
|
uint64 qev_free_pending;
|
|
|
|
rpl_group_info *loc_rgi_list, **loc_rgi_last_ptr_ptr;
|
|
|
|
group_commit_orderer *loc_gco_list, **loc_gco_last_ptr_ptr;
|
|
|
|
/* These keep track of batch update of inuse_relaylog refcounts. */
|
|
|
|
inuse_relaylog *accumulated_ir_last;
|
|
|
|
uint64 accumulated_ir_count;
|
2013-10-24 12:44:21 +02:00
|
|
|
|
|
|
|
void enqueue(queued_event *qev)
|
|
|
|
{
|
|
|
|
if (last_in_queue)
|
|
|
|
last_in_queue->next= qev;
|
|
|
|
else
|
|
|
|
event_queue= qev;
|
|
|
|
last_in_queue= qev;
|
|
|
|
queued_size+= qev->event_size;
|
|
|
|
}
|
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
void dequeue1(queued_event *list)
|
2013-10-24 12:44:21 +02:00
|
|
|
{
|
|
|
|
DBUG_ASSERT(list == event_queue);
|
|
|
|
event_queue= last_in_queue= NULL;
|
|
|
|
}
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
|
|
|
void dequeue2(size_t dequeue_size)
|
|
|
|
{
|
|
|
|
queued_size-= dequeue_size;
|
|
|
|
}
|
|
|
|
|
2014-05-08 14:20:18 +02:00
|
|
|
queued_event *get_qev_common(Log_event *ev, ulonglong event_size);
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
queued_event *get_qev(Log_event *ev, ulonglong event_size,
|
|
|
|
Relay_log_info *rli);
|
2014-05-08 14:20:18 +02:00
|
|
|
queued_event *retry_get_qev(Log_event *ev, queued_event *orig_qev,
|
|
|
|
const char *relay_log_name,
|
|
|
|
ulonglong event_pos, ulonglong event_size);
|
2014-11-13 10:20:48 +01:00
|
|
|
/*
|
|
|
|
Put a qev on the local free list, to be later released to the global free
|
|
|
|
list by batch_free().
|
|
|
|
*/
|
|
|
|
void loc_free_qev(queued_event *qev);
|
|
|
|
/*
|
|
|
|
Release an rgi immediately to the global free list. Requires holding the
|
|
|
|
LOCK_rpl_thread mutex.
|
|
|
|
*/
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
void free_qev(queued_event *qev);
|
|
|
|
rpl_group_info *get_rgi(Relay_log_info *rli, Gtid_log_event *gtid_ev,
|
2014-05-08 14:20:18 +02:00
|
|
|
rpl_parallel_entry *e, ulonglong event_size);
|
2014-11-13 10:20:48 +01:00
|
|
|
/*
|
|
|
|
Put an gco on the local free list, to be later released to the global free
|
|
|
|
list by batch_free().
|
|
|
|
*/
|
|
|
|
void loc_free_rgi(rpl_group_info *rgi);
|
|
|
|
/*
|
|
|
|
Release an rgi immediately to the global free list. Requires holding the
|
|
|
|
LOCK_rpl_thread mutex.
|
|
|
|
*/
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
void free_rgi(rpl_group_info *rgi);
|
|
|
|
group_commit_orderer *get_gco(uint64 wait_count, group_commit_orderer *prev);
|
2014-11-13 10:20:48 +01:00
|
|
|
/*
|
|
|
|
Put a gco on the local free list, to be later released to the global free
|
|
|
|
list by batch_free().
|
|
|
|
*/
|
|
|
|
void loc_free_gco(group_commit_orderer *gco);
|
|
|
|
/*
|
|
|
|
Move all local free lists to the global ones. Requires holding
|
|
|
|
LOCK_rpl_thread.
|
|
|
|
*/
|
|
|
|
void batch_free();
|
|
|
|
/* Update inuse_relaylog refcounts with what we have accumulated so far. */
|
|
|
|
void inuse_relaylog_refcount_update();
|
2013-06-24 10:50:25 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct rpl_parallel_thread_pool {
|
|
|
|
uint32 count;
|
|
|
|
struct rpl_parallel_thread **threads;
|
|
|
|
struct rpl_parallel_thread *free_list;
|
|
|
|
mysql_mutex_t LOCK_rpl_thread_pool;
|
|
|
|
mysql_cond_t COND_rpl_thread_pool;
|
|
|
|
bool changing;
|
|
|
|
bool inited;
|
|
|
|
|
|
|
|
rpl_parallel_thread_pool();
|
|
|
|
int init(uint32 size);
|
|
|
|
void destroy();
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
struct rpl_parallel_thread *get_thread(rpl_parallel_thread **owner,
|
|
|
|
rpl_parallel_entry *entry);
|
|
|
|
void release_thread(rpl_parallel_thread *rpt);
|
2013-06-24 10:50:25 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct rpl_parallel_entry {
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
mysql_mutex_t LOCK_parallel_entry;
|
|
|
|
mysql_cond_t COND_parallel_entry;
|
2013-06-24 10:50:25 +02:00
|
|
|
uint32 domain_id;
|
|
|
|
uint64 last_commit_id;
|
|
|
|
bool active;
|
2013-11-05 12:01:26 +01:00
|
|
|
/*
|
|
|
|
Set when SQL thread is shutting down, and no more events can be processed,
|
|
|
|
so worker threads must force abort any current transactions without
|
|
|
|
waiting for event groups to complete.
|
|
|
|
*/
|
|
|
|
bool force_abort;
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
At STOP SLAVE (force_abort=true), we do not want to process all events in
|
|
|
|
the queue (which could unnecessarily delay stop, if a lot of events happen
|
|
|
|
to be queued). The stop_count provides a safe point at which to stop, so
|
|
|
|
that everything before becomes committed and nothing after does. The value
|
|
|
|
corresponds to group_commit_orderer::wait_count; if wait_count is less than
|
|
|
|
or equal to stop_count, we execute the associated event group, else we
|
|
|
|
skip it (and all following) and stop.
|
|
|
|
*/
|
|
|
|
uint64 stop_count;
|
2013-11-05 12:01:26 +01:00
|
|
|
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
/*
|
|
|
|
Cyclic array recording the last rpl_thread_max worker threads that we
|
|
|
|
queued event for. This is used to limit how many workers a single domain
|
|
|
|
can occupy (--slave-domain-parallel-threads).
|
|
|
|
|
|
|
|
Note that workers are never explicitly deleted from the array. Instead,
|
|
|
|
we need to check (under LOCK_rpl_thread) that the thread still belongs
|
|
|
|
to us before re-using (rpl_thread::current_owner).
|
|
|
|
*/
|
|
|
|
rpl_parallel_thread **rpl_threads;
|
|
|
|
uint32 rpl_thread_max;
|
|
|
|
uint32 rpl_thread_idx;
|
2013-07-03 13:46:33 +02:00
|
|
|
/*
|
|
|
|
The sub_id of the last transaction to commit within this domain_id.
|
|
|
|
Must be accessed under LOCK_parallel_entry protection.
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
|
|
|
|
Event groups commit in order, so the rpl_group_info for an event group
|
|
|
|
will be alive (at least) as long as
|
MDEV-5262, MDEV-5914, MDEV-5941, MDEV-6020: Deadlocks during parallel
replication causing replication to fail.
Remove the temporary fix for MDEV-5914, which used READ COMMITTED for parallel
replication worker threads. Replace it with a better, more selective solution.
The issue is with certain edge cases of InnoDB gap locks, for example between
INSERT and ranged DELETE. It is possible for the gap lock set by the DELETE to
block the INSERT, if the DELETE runs first, while the record lock set by
INSERT does not block the DELETE, if the INSERT runs first. This can cause a
conflict between the two in parallel replication on the slave even though they
ran without conflicts on the master.
With this patch, InnoDB will ask the server layer about the two involved
transactions before blocking on a gap lock. If the server layer tells InnoDB
that the transactions are already fixed wrt. commit order, as they are in
parallel replication, InnoDB will ignore the gap lock and allow the two
transactions to proceed in parallel, avoiding the conflict.
Improve the fix for MDEV-6020. When InnoDB itself detects a deadlock, it now
asks the server layer for any preferences about which transaction to roll
back. In case of parallel replication with two transactions T1 and T2 fixed to
commit T1 before T2, the server layer will ask InnoDB to roll back T2 as the
deadlock victim, not T1. This helps in some cases to avoid excessive deadlock
rollback, as T2 will in any case need to wait for T1 to complete before it can
itself commit.
Also some misc. fixes found during development and testing:
- Remove thd_rpl_is_parallel(), it is not used or needed.
- Use KILL_CONNECTION instead of KILL_QUERY when a parallel replication
worker thread is killed to resolve a deadlock with fixed commit
ordering. There are some cases, eg. in sql/sql_parse.cc, where a KILL_QUERY
can be ignored if the query otherwise completed successfully, and this
could cause the deadlock kill to be lost, so that the deadlock was not
correctly resolved.
- Fix random test failure due to missing wait_for_binlog_checkpoint.inc.
- Make sure that deadlock or other temporary errors during parallel
replication are not printed to the the error log; there were some places
around the replication code with extra error logging. These conditions can
occur occasionally and are handled automatically without breaking
replication, so they should not pollute the error log.
- Fix handling of rgi->gtid_sub_id. We need to be able to access this also at
the end of a transaction, to be able to detect and resolve deadlocks due to
commit ordering. But this value was also used as a flag to mark whether
record_gtid() had been called, by being set to zero, losing the value. Now,
introduce a separate flag rgi->gtid_pending, so rgi->gtid_sub_id remains
valid for the entire duration of the transaction.
- Fix one place where the code to handle ignored errors called reset_killed()
unconditionally, even if no error was caught that should be ignored. This
could cause loss of a deadlock kill signal, breaking deadlock detection and
resolution.
- Fix a couple of missing mysql_reset_thd_for_next_command(). This could
cause a prior error condition to remain for the next event executed,
causing assertions about errors already being set and possibly giving
incorrect error handling for following event executions.
- Fix code that cleared thd->rgi_slave in the parallel replication worker
threads after each event execution; this caused the deadlock detection and
handling code to not be able to correctly process the associated
transactions as belonging to replication worker threads.
- Remove useless error code in slave_background_kill_request().
- Fix bug where wfc->wakeup_error was not cleared at
wait_for_commit::unregister_wait_for_prior_commit(). This could cause the
error condition to wrongly propagate to a later wait_for_prior_commit(),
causing spurious ER_PRIOR_COMMIT_FAILED errors.
- Do not put the binlog background thread into the processlist. It causes
too many result differences in mtr, but also it probably is not useful
for users to pollute the process list with a system thread that does not
really perform any user-visible tasks...
2014-06-10 10:13:15 +02:00
|
|
|
rpl_group_info::gtid_sub_id > last_committed_sub_id. This can be used to
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
safely refer back to previous event groups if they are still executing,
|
|
|
|
and ignore them if they completed, without requiring explicit
|
|
|
|
synchronisation between the threads.
|
2013-07-03 13:46:33 +02:00
|
|
|
*/
|
|
|
|
uint64 last_committed_sub_id;
|
2013-09-30 10:41:41 +02:00
|
|
|
/*
|
|
|
|
The sub_id of the last event group in this replication domain that was
|
|
|
|
queued for execution by a worker thread.
|
|
|
|
*/
|
2013-07-03 13:46:33 +02:00
|
|
|
uint64 current_sub_id;
|
2013-09-13 15:09:57 +02:00
|
|
|
rpl_group_info *current_group_info;
|
2013-07-08 16:47:07 +02:00
|
|
|
/*
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
If we get an error in some event group, we set the sub_id of that event
|
|
|
|
group here. Then later event groups (with higher sub_id) can know not to
|
|
|
|
try to start (event groups that already started will be rolled back when
|
|
|
|
wait_for_prior_commit() returns error).
|
|
|
|
The value is ULONGLONG_MAX when no error occured.
|
|
|
|
*/
|
|
|
|
uint64 stop_on_error_sub_id;
|
|
|
|
/* Total count of event groups queued so far. */
|
|
|
|
uint64 count_queued_event_groups;
|
|
|
|
/*
|
|
|
|
Count of event groups that have started (but not necessarily completed)
|
|
|
|
the commit phase. We use this to know when every event group in a previous
|
|
|
|
batch of master group commits have started committing on the slave, so
|
|
|
|
that it is safe to start executing the events in the following batch.
|
2013-07-08 16:47:07 +02:00
|
|
|
*/
|
MDEV-5657: Parallel replication.
Clean up and improve the parallel implementation code, mainly related to
scheduling of work to threads and handling of stop and errors.
Fix a lot of bugs in various corner cases that could lead to crashes or
corruption.
Fix that a single replication domain could easily grab all worker threads and
stall all other domains; now a configuration variable
--slave-domain-parallel-threads allows to limit the number of
workers.
Allow next event group to start as soon as previous group begins the commit
phase (as opposed to when it ends it); this allows multiple event groups on
the slave to participate in group commit, even when no other opportunities for
parallelism are available.
Various fixes:
- Fix some races in the rpl.rpl_parallel test case.
- Fix an old incorrect assertion in Log_event iocache read.
- Fix repeated malloc/free of wait_for_commit and rpl_group_info objects.
- Simplify wait_for_commit wakeup logic.
- Fix one case in queue_for_group_commit() where killing one thread would
fail to correctly signal the error to the next, causing loss of the
transaction after slave restart.
- Fix leaking of pthreads (and their allocated stack) due to missing
PTHREAD_CREATE_DETACHED attribute.
- Fix how one batch of group-committed transactions wait for the previous
batch before starting to execute themselves. The old code had a very
complex scheduling where the first transaction was handled differently,
with subtle bugs in corner cases. Now each event group is always scheduled
for a new worker (in a round-robin fashion amongst available workers).
Keep a count of how many transactions have started to commit, and wait for
that counter to reach the appropriate value.
- Fix slave stop to wait for all workers to actually complete processing;
before, the wait was for update of last_committed_sub_id, which happens a
bit earlier, and could leave worker threads potentially accessing bits of
the replication state that is no longer valid after slave stop.
- Fix a couple of places where the test suite would kill a thread waiting
inside enter_cond() in connection with debug_sync; debug_sync + kill can
crash in rare cases due to a race with mysys_var_current_mutex in this
case.
- Fix some corner cases where we had enter_cond() but no exit_cond().
- Fix that we could get failure in wait_for_prior_commit() but forget to flag
the error with my_error().
- Fix slave stop (both for normal stop and stop due to error). Now, at stop
we pick a specific safe point (in terms of event groups executed) and make
sure that all event groups before that point are executed to completion,
and that no event group after start executing; this ensures a safe place to
restart replication, even for non-transactional stuff/DDL. In error stop,
make sure that all prior event groups are allowed to execute to completion,
and that any later event groups that have started are rolled back, if
possible. The old code could leave eg. T1 and T3 committed but T2 not, or
it could even leave half a transaction not rolled back in some random
worker, which would cause big problems when that worker was later reused
after slave restart.
- Fix the accounting of amount of events queued for one worker. Before, the
amount was reduced immediately as soon as the events were dequeued (which
happens all at once); this allowed twice the amount of events to be queued
in memory for each single worker, which is not what users would expect.
- Fix that an error set during execution of one event was sometimes not
cleared before executing the next, causing problems with the error
reporting.
- Fix incorrect handling of thd->killed in worker threads.
2014-02-26 15:02:09 +01:00
|
|
|
uint64 count_committing_event_groups;
|
|
|
|
/* The group_commit_orderer object for the events currently being queued. */
|
|
|
|
group_commit_orderer *current_gco;
|
|
|
|
|
2014-06-25 15:17:03 +02:00
|
|
|
rpl_parallel_thread * choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
|
2014-02-26 16:38:42 +01:00
|
|
|
PSI_stage_info *old_stage, bool reuse);
|
2014-08-19 14:26:42 +02:00
|
|
|
int queue_master_restart(rpl_group_info *rgi,
|
|
|
|
Format_description_log_event *fdev);
|
2013-06-24 10:50:25 +02:00
|
|
|
};
|
|
|
|
struct rpl_parallel {
|
|
|
|
HASH domain_hash;
|
|
|
|
rpl_parallel_entry *current;
|
2013-10-08 14:36:06 +02:00
|
|
|
bool sql_thread_stopping;
|
2013-06-24 10:50:25 +02:00
|
|
|
|
|
|
|
rpl_parallel();
|
|
|
|
~rpl_parallel();
|
2013-09-17 14:07:21 +02:00
|
|
|
void reset();
|
2013-06-24 10:50:25 +02:00
|
|
|
rpl_parallel_entry *find(uint32 domain_id);
|
2014-03-03 12:13:55 +01:00
|
|
|
void wait_for_done(THD *thd, Relay_log_info *rli);
|
|
|
|
void stop_during_until();
|
2014-01-08 11:00:44 +01:00
|
|
|
bool workers_idle();
|
2014-08-20 10:59:39 +02:00
|
|
|
int wait_for_workers_idle(THD *thd);
|
2014-03-04 08:48:32 +01:00
|
|
|
int do_event(rpl_group_info *serial_rgi, Log_event *ev, ulonglong event_size);
|
2013-06-24 10:50:25 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
extern struct rpl_parallel_thread_pool global_rpl_thread_pool;
|
|
|
|
|
|
|
|
|
|
|
|
extern int rpl_parallel_change_thread_count(rpl_parallel_thread_pool *pool,
|
|
|
|
uint32 new_count,
|
|
|
|
bool skip_check= false);
|
|
|
|
|
|
|
|
#endif /* RPL_PARALLEL_H */
|