mariadb/plugin/semisync/semisync_master.h

369 lines
13 KiB
C++

/* Copyright (C) 2007 Google Inc.
Copyright (C) 2008 MySQL AB
Copyright (C) 2009 Sun Microsystems, Inc
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
#ifndef SEMISYNC_MASTER_H
#define SEMISYNC_MASTER_H
#include "semisync.h"
#ifdef HAVE_PSI_INTERFACE
/* Instrumentation keys, declared only when HAVE_PSI_INTERFACE is defined.
 * NOTE(review): judging by the names, these instrument
 * ReplSemiSyncMaster::LOCK_binlog_ and ReplSemiSyncMaster::COND_binlog_send_;
 * the definitions and the registration code are not in this header - confirm
 * there.
 */
extern PSI_mutex_key key_ss_mutex_LOCK_binlog_;
extern PSI_cond_key key_ss_cond_COND_binlog_send_;
#endif
/**
This class manages memory for active transaction list.
We record each active transaction with a TranxNode, each session
can have only one open transaction. Because of EVENT, the total
active transaction nodes can exceed the maximum allowed
connections.
*/
class ActiveTranx
:public Trace {
private:
struct TranxNode {
char log_name_[FN_REFLEN];
my_off_t log_pos_;
struct TranxNode *next_; /* the next node in the sorted list */
struct TranxNode *hash_next_; /* the next node during hash collision */
};
/* These two record the active transaction list in sort order. */
TranxNode *trx_front_, *trx_rear_;
TranxNode **trx_htb_; /* A hash table on active transactions. */
int num_entries_; /* maximum hash table entries */
mysql_mutex_t *lock_; /* mutex lock */
inline void assert_lock_owner();
inline TranxNode* alloc_tranx_node();
inline unsigned int calc_hash(const unsigned char *key,unsigned int length);
unsigned int get_hash_value(const char *log_file_name, my_off_t log_file_pos);
int compare(const char *log_file_name1, my_off_t log_file_pos1,
const TranxNode *node2) {
return compare(log_file_name1, log_file_pos1,
node2->log_name_, node2->log_pos_);
}
int compare(const TranxNode *node1,
const char *log_file_name2, my_off_t log_file_pos2) {
return compare(node1->log_name_, node1->log_pos_,
log_file_name2, log_file_pos2);
}
int compare(const TranxNode *node1, const TranxNode *node2) {
return compare(node1->log_name_, node1->log_pos_,
node2->log_name_, node2->log_pos_);
}
public:
ActiveTranx(mysql_mutex_t *lock, unsigned long trace_level);
~ActiveTranx();
/* Insert an active transaction node with the specified position.
*
* Return:
* 0: success; non-zero: error
*/
int insert_tranx_node(const char *log_file_name, my_off_t log_file_pos);
/* Clear the active transaction nodes until(inclusive) the specified
* position.
* If log_file_name is NULL, everything will be cleared: the sorted
* list and the hash table will be reset to empty.
*
* Return:
* 0: success; non-zero: error
*/
int clear_active_tranx_nodes(const char *log_file_name,
my_off_t log_file_pos);
/* Given a position, check to see whether the position is an active
* transaction's ending position by probing the hash table.
*/
bool is_tranx_end_pos(const char *log_file_name, my_off_t log_file_pos);
/* Given two binlog positions, compare which one is bigger based on
* (file_name, file_position).
*/
static int compare(const char *log_file_name1, my_off_t log_file_pos1,
const char *log_file_name2, my_off_t log_file_pos2);
};
/**
   The extension class for the master of semi-synchronous replication.
*/
class ReplSemiSyncMaster
  :public ReplSemiSyncBase {
 private:
  ActiveTranx    *active_tranxs_;  /* active transaction list: the list will
                                      be cleared when semi-sync switches off. */

  /* True when initObject has been called */
  bool init_done_;

  /* This cond variable is signaled when enough binlog has been sent to slave,
   * so that a waiting trx can return the 'ok' to the client for a commit.
   */
  mysql_cond_t  COND_binlog_send_;

  /* Mutex that protects the following state variables and the active
   * transaction list.
   * Under no circumstances may we acquire mysql_bin_log.LOCK_log while we
   * are already holding LOCK_binlog_, because that can cause deadlocks.
   */
  mysql_mutex_t LOCK_binlog_;

  /* This is set to true when reply_file_name_ contains meaningful data. */
  bool            reply_file_name_inited_;

  /* The binlog name up to which we have received replies from any slaves. */
  char            reply_file_name_[FN_REFLEN];

  /* The position in that file up to which we have the reply from any slaves. */
  my_off_t        reply_file_pos_;

  /* This is set to true when we know the 'smallest' wait position. */
  bool            wait_file_name_inited_;

  /* The 'smallest' filename that a transaction is waiting for slave
   * replies.  This is a fixed-size buffer, so it is never NULL; use
   * wait_file_name_inited_ to tell whether a wait position is known.
   */
  char            wait_file_name_[FN_REFLEN];

  /* The smallest position in that file that a trx is waiting for: the trx
   * can proceed and send an 'ok' to the client when the master has got the
   * reply from the slave indicating that it already got the binlog events.
   */
  my_off_t        wait_file_pos_;

  /* This is set to true when we know the 'largest' transaction commit
   * position in the binlog file.
   * We always maintain the position no matter whether semi-sync is switched
   * on or switched off.  When a transaction wait timeout occurs, semi-sync
   * will switch off.  Binlog-dump thread can use the three fields to detect
   * when slaves catch up on replication so that semi-sync can switch on again.
   */
  bool            commit_file_name_inited_;

  /* The 'largest' binlog filename that a commit transaction is seeing. */
  char            commit_file_name_[FN_REFLEN];

  /* The 'largest' position in that file that a commit transaction is seeing. */
  my_off_t        commit_file_pos_;

  /* All global variables which can be set by parameters. */
  volatile bool   master_enabled_; /* semi-sync is enabled on the master */
  unsigned long   wait_timeout_;   /* timeout period(ms) during tranx wait */

  bool            state_;          /* whether semi-sync is switched on */

  void lock();
  void unlock();
  void cond_broadcast();
  int  cond_timewait(struct timespec *wait_time);

  /* Is semi-sync replication on?  Read-only accessor for state_. */
  bool is_on() const {
    return (state_);
  }

  void set_master_enabled(bool enabled) {
    master_enabled_ = enabled;
  }

  /* Switch semi-sync off because of timeout in transaction waiting. */
  int switch_off();

  /* Switch semi-sync on when slaves catch up.
   * NOTE(review): server_id is declared int here but uint32 in the public
   * methods below; left as is because the out-of-line definition must match
   * this declaration.
   */
  int try_switch_on(int server_id,
                    const char *log_file_name, my_off_t log_file_pos);

 public:
  ReplSemiSyncMaster();
  ~ReplSemiSyncMaster();

  /* Read-only accessor for master_enabled_. */
  bool getMasterEnabled() const {
    return master_enabled_;
  }

  /* Set the trace level of this object and, when the active transaction
   * list exists, of that list as well.
   */
  void setTraceLevel(unsigned long trace_level) {
    trace_level_ = trace_level;
    if (active_tranxs_)
      active_tranxs_->trace_level_ = trace_level;
  }

  /* Set the transaction wait timeout period, in milliseconds. */
  void setWaitTimeout(unsigned long wait_timeout) {
    wait_timeout_ = wait_timeout;
  }

  /* Initialize this class after MySQL parameters are initialized.  This
   * function should be called once at bootstrap time.
   */
  int initObject();

  /* Enable the object to enable semi-sync replication inside the master. */
  int enableMaster();

  /* Counterpart of enableMaster(): disable semi-sync replication inside
   * the master.
   */
  int disableMaster();

  /* Add a semi-sync replication slave */
  void add_slave();

  /* Remove a semi-sync replication slave */
  void remove_slave();

  /* Whether the slave served by the thread requested semi-sync */
  bool is_semi_sync_slave();

  /* In semi-sync replication, reports up to which binlog position we have
   * received replies from the slave indicating that it already got the
   * events.
   *
   * Input:
   *  server_id     - (IN)  master server id number
   *  log_file_name - (IN)  binlog file name
   *  end_offset    - (IN)  the offset in the binlog file up to which we have
   *                        the replies from the slave
   *
   * Return:
   *  0: success;  non-zero: error
   */
  int reportReplyBinlog(uint32 server_id,
                        const char* log_file_name,
                        my_off_t end_offset);

  /* Commit a transaction in the final step.  This function is called from
   * InnoDB before returning from the low commit.  If semi-sync is switched
   * on, the function will wait to see whether binlog-dump thread gets the
   * reply for the events of the transaction.  Remember that this is not a
   * direct wait, instead, it waits to see whether the binlog-dump thread has
   * reached the point.  If the wait times out, semi-sync status will be
   * switched off and all other transactions would not wait either.
   *
   * Input:  (the transaction events' ending binlog position)
   *  trx_wait_binlog_name - (IN)  ending position's file name
   *  trx_wait_binlog_pos  - (IN)  ending position's file offset
   *
   * Return:
   *  0: success;  non-zero: error
   */
  int commitTrx(const char* trx_wait_binlog_name,
                my_off_t trx_wait_binlog_pos);

  /* Reserve space in the replication event packet header:
   *  . slave semi-sync off: 1 byte - (0)
   *  . slave semi-sync on:  3 byte - (0, 0xef, 0/1)
   *
   * Input:
   *  header   - (IN)  the header buffer
   *  size     - (IN)  size of the header buffer
   *
   * Return:
   *  size of the bytes reserved for header
   */
  int reserveSyncHeader(unsigned char *header, unsigned long size);

  /* Update the sync bit in the packet header to indicate to the slave whether
   * the master will wait for the reply of the event.  If semi-sync is switched
   * off and we detect that the slave is catching up, we switch semi-sync on.
   *
   * Input:
   *  packet        - (IN)  the packet containing the replication event
   *  log_file_name - (IN)  the event ending position's file name
   *  log_file_pos  - (IN)  the event ending position's file offset
   *  server_id     - (IN)  master server id number
   *
   * Return:
   *  0: success;  non-zero: error
   */
  int updateSyncHeader(unsigned char *packet,
                       const char *log_file_name,
                       my_off_t log_file_pos,
                       uint32 server_id);

  /* Called when a transaction finished writing binlog events.
   *  . update the 'largest' transactions' binlog event position
   *  . insert the ending position in the active transaction list if
   *    semi-sync is on
   *
   * Input:  (the transaction events' ending binlog position)
   *  log_file_name - (IN)  transaction ending position's file name
   *  log_file_pos  - (IN)  transaction ending position's file offset
   *
   * Return:
   *  0: success;  non-zero: error
   */
  int writeTranxInBinlog(const char* log_file_name, my_off_t log_file_pos);

  /* Read the slave's reply so that we know how much progress the slave makes
   * on receiving replication events.
   *
   * Input:
   *  net          - (IN)  the connection to master
   *  server_id    - (IN)  master server id number
   *  event_buf    - (IN)  pointer to the event packet
   *
   * Return:
   *  0: success;  non-zero: error
   */
  int readSlaveReply(NET *net, uint32 server_id, const char *event_buf);

  /* Export internal statistics for semi-sync replication. */
  void setExportStats();

  /* 'reset master' command is issued from the user and semi-sync needs to
   * go off for that.
   */
  int resetMaster();
};
/* System and status variables for the master component.
 * These are declarations only; the definitions are not part of this header.
 * NOTE(review): the per-variable meaning is inferred from the names alone -
 * confirm against the defining source before relying on it.
 */
extern char rpl_semi_sync_master_enabled;
extern char rpl_semi_sync_master_status;
extern unsigned long rpl_semi_sync_master_clients;
extern unsigned long rpl_semi_sync_master_timeout;
extern unsigned long rpl_semi_sync_master_trace_level;
extern unsigned long rpl_semi_sync_master_yes_transactions;
extern unsigned long rpl_semi_sync_master_no_transactions;
extern unsigned long rpl_semi_sync_master_off_times;
extern unsigned long rpl_semi_sync_master_wait_timeouts;
extern unsigned long rpl_semi_sync_master_timefunc_fails;
extern unsigned long rpl_semi_sync_master_num_timeouts;
extern unsigned long rpl_semi_sync_master_wait_sessions;
extern unsigned long rpl_semi_sync_master_wait_pos_backtraverse;
extern unsigned long rpl_semi_sync_master_avg_trx_wait_time;
extern unsigned long rpl_semi_sync_master_avg_net_wait_time;
extern unsigned long long rpl_semi_sync_master_net_wait_num;
extern unsigned long long rpl_semi_sync_master_trx_wait_num;
extern unsigned long long rpl_semi_sync_master_net_wait_time;
extern unsigned long long rpl_semi_sync_master_trx_wait_time;
/*
  This indicates whether we should keep waiting if no semi-sync slave
  is available.
     0           : stop waiting once no available semi-sync slave is detected.
     1 (default) : keep waiting until timeout even if no semi-sync slave is
                   available.
*/
extern char rpl_semi_sync_master_wait_no_slave;
#endif /* SEMISYNC_MASTER_H */