mirror of
https://github.com/MariaDB/server.git
synced 2025-01-21 22:34:18 +01:00
1434 lines
35 KiB
C
1434 lines
35 KiB
C
/*-
|
|
* See the file LICENSE for redistribution information.
|
|
*
|
|
* Copyright (c) 2001-2005
|
|
* Sleepycat Software. All rights reserved.
|
|
*
|
|
* $Id: rep_util.c,v 12.42 2005/10/27 01:26:02 mjc Exp $
|
|
*/
|
|
|
|
#include "db_config.h"
|
|
|
|
#ifndef NO_SYSTEM_INCLUDES
|
|
#if TIME_WITH_SYS_TIME
|
|
#include <sys/time.h>
|
|
#include <time.h>
|
|
#else
|
|
#if HAVE_SYS_TIME_H
|
|
#include <sys/time.h>
|
|
#else
|
|
#include <time.h>
|
|
#endif
|
|
#endif
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#endif
|
|
|
|
#include "db_int.h"
|
|
#include "dbinc/log.h"
|
|
#include "dbinc/txn.h"
|
|
#ifdef REP_DIAGNOSTIC
|
|
#include "dbinc/db_page.h"
|
|
#include "dbinc/fop.h"
|
|
#include "dbinc/btree.h"
|
|
#include "dbinc/hash.h"
|
|
#include "dbinc/qam.h"
|
|
#endif
|
|
|
|
/*
|
|
* rep_util.c:
|
|
* Miscellaneous replication-related utility functions, including
|
|
* those called by other subsystems.
|
|
*/
|
|
|
|
#define TIMESTAMP_CHECK(dbenv, ts, renv) do { \
|
|
if (renv->op_timestamp != 0 && \
|
|
renv->op_timestamp + DB_REGENV_TIMEOUT < ts) { \
|
|
REP_SYSTEM_LOCK(dbenv); \
|
|
F_CLR(renv, DB_REGENV_REPLOCKED); \
|
|
renv->op_timestamp = 0; \
|
|
REP_SYSTEM_UNLOCK(dbenv); \
|
|
} \
|
|
} while (0)
|
|
|
|
#ifdef REP_DIAGNOSTIC
|
|
static void __rep_print_logmsg __P((DB_ENV *, const DBT *, DB_LSN *));
|
|
#endif
|
|
|
|
/*
|
|
* __rep_bulk_message --
|
|
* This is a wrapper for putting a record into a bulk buffer. Since
|
|
* we have different bulk buffers, the caller must hand us the information
|
|
* we need to put the record into the correct buffer. All bulk buffers
|
|
* are protected by the REP->mtx_clientdb.
|
|
*
|
|
* PUBLIC: int __rep_bulk_message __P((DB_ENV *, REP_BULK *, REP_THROTTLE *,
|
|
* PUBLIC: DB_LSN *, const DBT *, u_int32_t));
|
|
*/
|
|
int
|
|
__rep_bulk_message(dbenv, bulk, repth, lsn, dbt, flags)
|
|
DB_ENV *dbenv;
|
|
REP_BULK *bulk;
|
|
REP_THROTTLE *repth;
|
|
DB_LSN *lsn;
|
|
const DBT *dbt;
|
|
u_int32_t flags;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
int ret;
|
|
u_int32_t recsize, typemore;
|
|
u_int8_t *p;
|
|
#ifdef DIAGNOSTIC
|
|
DB_MSGBUF mb;
|
|
#endif
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
ret = 0;
|
|
|
|
/*
|
|
* Figure out the total number of bytes needed for this record.
|
|
*/
|
|
recsize = dbt->size + sizeof(DB_LSN) + sizeof(dbt->size);
|
|
|
|
/*
|
|
* If *this* buffer is actively being transmitted, wait until
|
|
* we can use it.
|
|
*/
|
|
MUTEX_LOCK(dbenv, rep->mtx_clientdb);
|
|
while (FLD_ISSET(*(bulk->flagsp), BULK_XMIT)) {
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
__os_sleep(dbenv, 1, 0);
|
|
MUTEX_LOCK(dbenv, rep->mtx_clientdb);
|
|
}
|
|
|
|
/*
|
|
* If the record is bigger than the buffer entirely, send the
|
|
* current buffer and then return DB_REP_BULKOVF so that this
|
|
* record is sent as a singleton. Do we have enough info to
|
|
* do that here? XXX
|
|
*/
|
|
if (recsize > bulk->len) {
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"bulk_msg: Record %d (0x%x) larger than entire buffer 0x%x",
|
|
recsize, recsize, bulk->len));
|
|
rep->stat.st_bulk_overflows++;
|
|
(void)__rep_send_bulk(dbenv, bulk, flags);
|
|
/*
|
|
* XXX __rep_send_message...
|
|
*/
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
return (DB_REP_BULKOVF);
|
|
}
|
|
/*
|
|
* If this record doesn't fit, send the current buffer.
|
|
* Sending the buffer will reset the offset, but we will
|
|
* drop the mutex while sending so we need to keep checking
|
|
* if we're racing.
|
|
*/
|
|
while (recsize + *(bulk->offp) > bulk->len) {
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"bulk_msg: Record %lu (%#lx) doesn't fit. Send %lu (%#lx) now.",
|
|
(u_long)recsize, (u_long)recsize,
|
|
(u_long)bulk->len, (u_long)bulk->len));
|
|
rep->stat.st_bulk_fills++;
|
|
if ((ret = __rep_send_bulk(dbenv, bulk, flags)) != 0)
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* If we're using throttling, see if we are at the throttling
|
|
* limit before we do any more work here, by checking if the
|
|
* call to rep_send_throttle changed the repth->type to the
|
|
* *_MORE message type. If the throttling code hits the limit
|
|
* then we're done here.
|
|
*/
|
|
if (bulk->type == REP_BULK_LOG)
|
|
typemore = REP_LOG_MORE;
|
|
else
|
|
typemore = REP_PAGE_MORE;
|
|
if (repth != NULL &&
|
|
(ret = __rep_send_throttle(dbenv, bulk->eid, repth,
|
|
REP_THROTTLE_ONLY)) == 0 && repth->type == typemore) {
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"bulk_msg: Record %d (0x%x) hit throttle limit.",
|
|
recsize, recsize));
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* Now we own the buffer, and we know our record fits into it.
|
|
* The buffer is structured with the len, LSN and then the record.
|
|
* Copy the record into the buffer. Then if we need to,
|
|
* send the buffer.
|
|
*/
|
|
/*
|
|
* First thing is the length of the dbt record.
|
|
*/
|
|
p = bulk->addr + *(bulk->offp);
|
|
memcpy(p, &dbt->size, sizeof(dbt->size));
|
|
p += sizeof(dbt->size);
|
|
/*
|
|
* The next thing is the LSN. We need LSNs for both pages and
|
|
* log records. For log records, this is obviously, the LSN of
|
|
* this record. For pages, the LSN is used by the internal init code.
|
|
*/
|
|
memcpy(p, lsn, sizeof(DB_LSN));
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"bulk_msg: Copying LSN [%lu][%lu] of %lu bytes to %#lx",
|
|
(u_long)lsn->file, (u_long)lsn->offset, (u_long)dbt->size,
|
|
P_TO_ULONG(p)));
|
|
p += sizeof(DB_LSN);
|
|
/*
|
|
* If we're the first record, we need to save the first
|
|
* LSN in the bulk structure.
|
|
*/
|
|
if (*(bulk->offp) == 0)
|
|
bulk->lsn = *lsn;
|
|
/*
|
|
* Now copy the record and finally adjust the offset.
|
|
*/
|
|
memcpy(p, dbt->data, dbt->size);
|
|
p += dbt->size;
|
|
*(bulk->offp) = (uintptr_t)p - (uintptr_t)bulk->addr;
|
|
rep->stat.st_bulk_records++;
|
|
/*
|
|
* Send the buffer if it is a perm record or a force.
|
|
*/
|
|
if (LF_ISSET(DB_LOG_PERM) || FLD_ISSET(*(bulk->flagsp), BULK_FORCE)) {
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"bulk_msg: Send buffer after copy due to %s",
|
|
LF_ISSET(DB_LOG_PERM) ? "PERM" : "FORCE"));
|
|
ret = __rep_send_bulk(dbenv, bulk, flags);
|
|
}
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
return (ret);
|
|
|
|
}
|
|
|
|
/*
|
|
* __rep_send_bulk --
|
|
* This function transmits the bulk buffer given. It assumes the
|
|
* caller holds the REP->mtx_clientdb. We may release it and reacquire
|
|
* it during this call. We will return with it held.
|
|
*
|
|
* PUBLIC: int __rep_send_bulk __P((DB_ENV *, REP_BULK *, u_int32_t));
|
|
*/
|
|
int
|
|
__rep_send_bulk(dbenv, bulkp, flags)
|
|
DB_ENV *dbenv;
|
|
REP_BULK *bulkp;
|
|
u_int32_t flags;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
DBT dbt;
|
|
int ret;
|
|
#ifdef DIAGNOSTIC
|
|
DB_MSGBUF mb;
|
|
#endif
|
|
|
|
/*
|
|
* If the offset is 0, we're done. There is nothing to send.
|
|
*/
|
|
if (*(bulkp->offp) == 0)
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
memset(&dbt, 0, sizeof(dbt));
|
|
/*
|
|
* Set that this buffer is being actively transmitted.
|
|
*/
|
|
FLD_SET(*(bulkp->flagsp), BULK_XMIT);
|
|
dbt.data = bulkp->addr;
|
|
dbt.size = (u_int32_t)*(bulkp->offp);
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"send_bulk: Send %d (0x%x) bulk buffer bytes", dbt.size, dbt.size));
|
|
/*
|
|
* Unlocked the mutex and now send the message.
|
|
*/
|
|
rep->stat.st_bulk_transfers++;
|
|
ret = __rep_send_message(dbenv, bulkp->eid, bulkp->type, &bulkp->lsn,
|
|
&dbt, flags, 0);
|
|
|
|
MUTEX_LOCK(dbenv, rep->mtx_clientdb);
|
|
/*
|
|
* If we're successful, reset the offset pointer to 0.
|
|
* Clear the transmit flag regardless.
|
|
*/
|
|
if (ret == 0)
|
|
*(bulkp->offp) = 0;
|
|
FLD_CLR(*(bulkp->flagsp), BULK_XMIT);
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* __rep_bulk_alloc --
|
|
* This function allocates and initializes an internal bulk buffer.
|
|
* This is used by the master when fulfilling a request for a chunk of
|
|
* log records or a bunch of pages.
|
|
*
|
|
* PUBLIC: int __rep_bulk_alloc __P((DB_ENV *, REP_BULK *, int, uintptr_t *,
|
|
* PUBLIC: u_int32_t *, u_int32_t));
|
|
*/
|
|
int
|
|
__rep_bulk_alloc(dbenv, bulkp, eid, offp, flagsp, type)
|
|
DB_ENV *dbenv;
|
|
REP_BULK *bulkp;
|
|
int eid;
|
|
uintptr_t *offp;
|
|
u_int32_t *flagsp, type;
|
|
{
|
|
int ret;
|
|
|
|
memset(bulkp, 0, sizeof(REP_BULK));
|
|
*offp = *flagsp = 0;
|
|
bulkp->len = MEGABYTE;
|
|
if ((ret = __os_malloc(dbenv, bulkp->len, &bulkp->addr)) != 0)
|
|
return (ret);
|
|
bulkp->offp = offp;
|
|
bulkp->type = type;
|
|
bulkp->eid = eid;
|
|
bulkp->flagsp = flagsp;
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* __rep_bulk_free --
|
|
* This function sends the remainder of the bulk buffer and frees it.
|
|
*
|
|
* PUBLIC: int __rep_bulk_free __P((DB_ENV *, REP_BULK *, u_int32_t));
|
|
*/
|
|
int
|
|
__rep_bulk_free(dbenv, bulkp, flags)
|
|
DB_ENV *dbenv;
|
|
REP_BULK *bulkp;
|
|
u_int32_t flags;
|
|
{
|
|
DB_REP *db_rep;
|
|
int ret;
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
|
|
MUTEX_LOCK(dbenv, db_rep->region->mtx_clientdb);
|
|
ret = __rep_send_bulk(dbenv, bulkp, flags);
|
|
MUTEX_UNLOCK(dbenv, db_rep->region->mtx_clientdb);
|
|
__os_free(dbenv, bulkp->addr);
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* __rep_send_message --
|
|
* This is a wrapper for sending a message. It takes care of constructing
|
|
* the REP_CONTROL structure and calling the user's specified send function.
|
|
*
|
|
* PUBLIC: int __rep_send_message __P((DB_ENV *, int,
|
|
* PUBLIC: u_int32_t, DB_LSN *, const DBT *, u_int32_t, u_int32_t));
|
|
*/
|
|
int
|
|
__rep_send_message(dbenv, eid, rtype, lsnp, dbt, logflags, repflags)
|
|
DB_ENV *dbenv;
|
|
int eid;
|
|
u_int32_t rtype;
|
|
DB_LSN *lsnp;
|
|
const DBT *dbt;
|
|
u_int32_t logflags, repflags;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
DBT cdbt, scrap_dbt;
|
|
REP_CONTROL cntrl;
|
|
int ret;
|
|
u_int32_t myflags, rectype;
|
|
#ifdef DIAGNOSTIC
|
|
DB_MSGBUF mb;
|
|
#endif
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
/* Set up control structure. */
|
|
memset(&cntrl, 0, sizeof(cntrl));
|
|
if (lsnp == NULL)
|
|
ZERO_LSN(cntrl.lsn);
|
|
else
|
|
cntrl.lsn = *lsnp;
|
|
cntrl.rectype = rtype;
|
|
cntrl.flags = logflags;
|
|
cntrl.rep_version = DB_REPVERSION;
|
|
cntrl.log_version = DB_LOGVERSION;
|
|
cntrl.gen = rep->gen;
|
|
|
|
memset(&cdbt, 0, sizeof(cdbt));
|
|
cdbt.data = &cntrl;
|
|
cdbt.size = sizeof(cntrl);
|
|
|
|
/* Don't assume the send function will be tolerant of NULL records. */
|
|
if (dbt == NULL) {
|
|
memset(&scrap_dbt, 0, sizeof(DBT));
|
|
dbt = &scrap_dbt;
|
|
}
|
|
|
|
REP_PRINT_MESSAGE(dbenv, eid, &cntrl, "rep_send_message");
|
|
#ifdef REP_DIAGNOSTIC
|
|
if (FLD_ISSET(dbenv->verbose, DB_VERB_REPLICATION) && rtype == REP_LOG)
|
|
__rep_print_logmsg(dbenv, dbt, lsnp);
|
|
#endif
|
|
/*
|
|
* There are several types of records: commit and checkpoint records
|
|
* that affect database durability, regular log records that might
|
|
* be buffered on the master before being transmitted, and control
|
|
* messages which don't require the guarantees of permanency, but
|
|
* should not be buffered.
|
|
*
|
|
* There are request records that can be sent anywhere, and there
|
|
* are rerequest records that the app might want to send to the master.
|
|
*/
|
|
myflags = repflags;
|
|
if (FLD_ISSET(logflags, DB_LOG_PERM))
|
|
myflags |= DB_REP_PERMANENT;
|
|
else if (rtype != REP_LOG || FLD_ISSET(logflags, DB_LOG_RESEND))
|
|
myflags |= DB_REP_NOBUFFER;
|
|
if (rtype == REP_LOG && !FLD_ISSET(logflags, DB_LOG_PERM)) {
|
|
/*
|
|
* Check if this is a log record we just read that
|
|
* may need a DB_LOG_PERM. This is of type REP_LOG,
|
|
* so we know that dbt is a log record.
|
|
*/
|
|
memcpy(&rectype, dbt->data, sizeof(rectype));
|
|
if (rectype == DB___txn_regop || rectype == DB___txn_ckp)
|
|
F_SET(&cntrl, DB_LOG_PERM);
|
|
}
|
|
|
|
/*
|
|
* We set the LSN above to something valid. Give the master the
|
|
* actual LSN so that they can coordinate with permanent records from
|
|
* the client if they want to.
|
|
*/
|
|
ret = dbenv->rep_send(dbenv, &cdbt, dbt, &cntrl.lsn, eid, myflags);
|
|
|
|
/*
|
|
* We don't hold the rep lock, so this could miscount if we race.
|
|
* I don't think it's worth grabbing the mutex for that bit of
|
|
* extra accuracy.
|
|
*/
|
|
if (ret == 0)
|
|
rep->stat.st_msgs_sent++;
|
|
else {
|
|
rep->stat.st_msgs_send_failures++;
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"rep_send_function returned: %d", ret));
|
|
}
|
|
return (ret);
|
|
}
|
|
|
|
#ifdef REP_DIAGNOSTIC
|
|
/*
|
|
* __rep_print_logmsg --
|
|
* This is a debugging routine for printing out log records that
|
|
* we are about to transmit to a client.
|
|
*/
|
|
static void
|
|
__rep_print_logmsg(dbenv, logdbt, lsnp)
|
|
DB_ENV *dbenv;
|
|
const DBT *logdbt;
|
|
DB_LSN *lsnp;
|
|
{
|
|
/* Static structures to hold the printing functions. */
|
|
static int (**ptab)__P((DB_ENV *,
|
|
DBT *, DB_LSN *, db_recops, void *)) = NULL;
|
|
size_t ptabsize = 0;
|
|
|
|
if (ptabsize == 0) {
|
|
/* Initialize the table. */
|
|
(void)__bam_init_print(dbenv, &ptab, &ptabsize);
|
|
(void)__crdel_init_print(dbenv, &ptab, &ptabsize);
|
|
(void)__db_init_print(dbenv, &ptab, &ptabsize);
|
|
(void)__dbreg_init_print(dbenv, &ptab, &ptabsize);
|
|
(void)__fop_init_print(dbenv, &ptab, &ptabsize);
|
|
(void)__ham_init_print(dbenv, &ptab, &ptabsize);
|
|
(void)__qam_init_print(dbenv, &ptab, &ptabsize);
|
|
(void)__txn_init_print(dbenv, &ptab, &ptabsize);
|
|
}
|
|
|
|
(void)__db_dispatch(dbenv,
|
|
ptab, ptabsize, (DBT *)logdbt, lsnp, DB_TXN_PRINT, NULL);
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* __rep_new_master --
|
|
* Called after a master election to sync back up with a new master.
|
|
* It's possible that we already know of this new master in which case
|
|
* we don't need to do anything.
|
|
*
|
|
* This is written assuming that this message came from the master; we
|
|
* need to enforce that in __rep_process_record, but right now, we have
|
|
* no way to identify the master.
|
|
*
|
|
* PUBLIC: int __rep_new_master __P((DB_ENV *, REP_CONTROL *, int));
|
|
*/
|
|
int
|
|
__rep_new_master(dbenv, cntrl, eid)
|
|
DB_ENV *dbenv;
|
|
REP_CONTROL *cntrl;
|
|
int eid;
|
|
{
|
|
DB_LOG *dblp;
|
|
DB_LOGC *logc;
|
|
DB_LSN first_lsn, lsn;
|
|
DB_REP *db_rep;
|
|
DBT dbt;
|
|
LOG *lp;
|
|
REGENV *renv;
|
|
REGINFO *infop;
|
|
REP *rep;
|
|
int change, do_req, ret, t_ret;
|
|
#ifdef DIAGNOSTIC
|
|
DB_MSGBUF mb;
|
|
#endif
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
ret = 0;
|
|
logc = NULL;
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
__rep_elect_done(dbenv, rep);
|
|
change = rep->gen != cntrl->gen || rep->master_id != eid;
|
|
if (change) {
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"Updating gen from %lu to %lu from master %d",
|
|
(u_long)rep->gen, (u_long)cntrl->gen, eid));
|
|
rep->gen = cntrl->gen;
|
|
if (rep->egen <= rep->gen)
|
|
rep->egen = rep->gen + 1;
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"Egen is %lu", (u_long)rep->egen));
|
|
rep->master_id = eid;
|
|
rep->stat.st_master_changes++;
|
|
rep->stat.st_startup_complete = 0;
|
|
/*
|
|
* If we're delaying client sync-up, we know we have a
|
|
* new/changed master now, set flag indicating we are
|
|
* actively delaying.
|
|
*/
|
|
if (FLD_ISSET(rep->config, REP_C_DELAYCLIENT))
|
|
F_SET(rep, REP_F_DELAY);
|
|
/*
|
|
* If we are already locking out others, we're either
|
|
* in the middle of sync-up recovery or internal init
|
|
* when this newmaster comes in (we also lockout in
|
|
* rep_start, but we cannot be racing that because we
|
|
* don't allow rep_proc_msg when rep_start is going on).
|
|
*
|
|
* If we were in the middle of an internal initialization
|
|
* and we've discovered a new master instead, clean up
|
|
* our old internal init information. We need to clean
|
|
* up any flags and unlock our lockout.
|
|
*/
|
|
if (rep->in_recovery || F_ISSET(rep, REP_F_READY)) {
|
|
(void)__rep_init_cleanup(dbenv, rep, DB_FORCE);
|
|
F_CLR(rep, REP_F_RECOVER_MASK);
|
|
rep->in_recovery = 0;
|
|
F_CLR(rep, REP_F_READY);
|
|
}
|
|
F_SET(rep, REP_F_NOARCHIVE | REP_F_RECOVER_VERIFY);
|
|
}
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
dblp = dbenv->lg_handle;
|
|
lp = dblp->reginfo.primary;
|
|
LOG_SYSTEM_LOCK(dbenv);
|
|
lsn = lp->lsn;
|
|
LOG_SYSTEM_UNLOCK(dbenv);
|
|
|
|
if (!change) {
|
|
/*
|
|
* If there wasn't a change, we might still have some
|
|
* catching up or verification to do.
|
|
*/
|
|
ret = 0;
|
|
MUTEX_LOCK(dbenv, rep->mtx_clientdb);
|
|
do_req = __rep_check_doreq(dbenv, rep);
|
|
if (F_ISSET(rep, REP_F_RECOVER_VERIFY)) {
|
|
lsn = lp->verify_lsn;
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
if (!F_ISSET(rep, REP_F_DELAY) &&
|
|
!IS_ZERO_LSN(lsn) && do_req)
|
|
(void)__rep_send_message(dbenv, eid,
|
|
REP_VERIFY_REQ, &lsn, NULL, 0,
|
|
DB_REP_ANYWHERE);
|
|
} else {
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
if (log_compare(&lsn, &cntrl->lsn) < 0 && do_req)
|
|
(void)__rep_send_message(dbenv, eid,
|
|
REP_ALL_REQ, &lsn, NULL,
|
|
0, DB_REP_ANYWHERE);
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
F_CLR(rep, REP_F_NOARCHIVE);
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
}
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* If the master changed, we need to start the process of
|
|
* figuring out what our last valid log record is. However,
|
|
* if both the master and we agree that the max LSN is 0,0,
|
|
* then there is no recovery to be done. If we are at 0 and
|
|
* the master is not, then we just need to request all the log
|
|
* records from the master.
|
|
*/
|
|
if (IS_INIT_LSN(lsn) || IS_ZERO_LSN(lsn)) {
|
|
/*
|
|
* If we have no log, then we have no files to open
|
|
* in recovery, but we've opened what we can, which
|
|
* is none. Mark DBREP_OPENFILES here.
|
|
*/
|
|
empty: MUTEX_LOCK(dbenv, rep->mtx_clientdb);
|
|
F_SET(db_rep, DBREP_OPENFILES);
|
|
ZERO_LSN(lp->verify_lsn);
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK);
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
if (!IS_INIT_LSN(cntrl->lsn)) {
|
|
/*
|
|
* We're making an ALL_REQ. But now that we've
|
|
* cleared the flags, we're likely receiving new
|
|
* log records from the master, resulting in a gap
|
|
* immediately. So to avoid multiple data streams,
|
|
* set the wait_recs value high now to give the master
|
|
* a chance to start sending us these records before
|
|
* the gap code re-requests the same gap. Wait_recs
|
|
* will get reset once we start receiving these
|
|
* records.
|
|
*/
|
|
lp->wait_recs = rep->max_gap;
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
/*
|
|
* Don't send the ALL_REQ if we're delayed. But we
|
|
* check here, after lp->wait_recs is set up so that
|
|
* when the app calls rep_sync, everything is ready
|
|
* to go.
|
|
*/
|
|
if (!F_ISSET(rep, REP_F_DELAY))
|
|
(void)__rep_send_message(dbenv, eid,
|
|
REP_ALL_REQ, &lsn, NULL,
|
|
0, DB_REP_ANYWHERE);
|
|
} else
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
|
|
return (DB_REP_NEWMASTER);
|
|
}
|
|
|
|
memset(&dbt, 0, sizeof(dbt));
|
|
/*
|
|
* If this client is farther ahead on the log file than the master, see
|
|
* if there is any overlap in the logs. If not, the client is too
|
|
* far ahead of the master and we cannot determine they're part of
|
|
* the same replication group.
|
|
*/
|
|
if (cntrl->lsn.file < lsn.file) {
|
|
if ((ret = __log_cursor(dbenv, &logc)) != 0)
|
|
goto err;
|
|
if ((ret = __log_c_get(logc, &first_lsn, &dbt, DB_FIRST)) != 0)
|
|
goto err;
|
|
if (cntrl->lsn.file < first_lsn.file) {
|
|
__db_err(dbenv,
|
|
"Client too far ahead of master; unable to join replication group");
|
|
ret = DB_REP_JOIN_FAILURE;
|
|
goto err;
|
|
}
|
|
ret = __log_c_close(logc);
|
|
logc = NULL;
|
|
if (ret != 0)
|
|
goto err;
|
|
}
|
|
if ((ret = __log_cursor(dbenv, &logc)) != 0)
|
|
goto err;
|
|
ret = __rep_log_backup(logc, &lsn);
|
|
err: if (logc != NULL && (t_ret = __log_c_close(logc)) != 0 && ret == 0)
|
|
ret = t_ret;
|
|
if (ret == DB_NOTFOUND) {
|
|
/*
|
|
* If we don't have an identification record, we still might
|
|
* have some log records but we're discarding them to sync
|
|
* up with the master from the start. Therefore,
|
|
* truncate our log and go to the no log case.
|
|
*/
|
|
INIT_LSN(lsn);
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"No commit or ckp found. Truncate log."));
|
|
(void)__log_vtruncate(dbenv, &lsn, &lsn, NULL);
|
|
infop = dbenv->reginfo;
|
|
renv = infop->primary;
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
(void)time(&renv->rep_timestamp);
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
goto empty;
|
|
}
|
|
|
|
/*
|
|
* If we failed here, we need to clear the flags we may
|
|
* have set above because we're not going to be setting
|
|
* the verify_lsn.
|
|
*/
|
|
if (ret != 0) {
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
F_CLR(rep, REP_F_RECOVER_MASK | REP_F_DELAY);
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* Finally, we have a record to ask for.
|
|
*/
|
|
MUTEX_LOCK(dbenv, rep->mtx_clientdb);
|
|
lp->verify_lsn = lsn;
|
|
lp->rcvd_recs = 0;
|
|
lp->wait_recs = rep->request_gap;
|
|
MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
|
|
if (!F_ISSET(rep, REP_F_DELAY))
|
|
(void)__rep_send_message(dbenv,
|
|
eid, REP_VERIFY_REQ, &lsn, NULL, 0, DB_REP_ANYWHERE);
|
|
|
|
return (DB_REP_NEWMASTER);
|
|
}
|
|
|
|
/*
|
|
* __rep_is_client
|
|
* Used by other subsystems to figure out if this is a replication
|
|
* client site.
|
|
*
|
|
* PUBLIC: int __rep_is_client __P((DB_ENV *));
|
|
*/
|
|
int
|
|
__rep_is_client(dbenv)
|
|
DB_ENV *dbenv;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
|
|
if (!REP_ON(dbenv))
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
/*
|
|
* Don't just return F_ISSET since that converts unsigned
|
|
* into signed.
|
|
*/
|
|
return (F_ISSET(rep, REP_F_CLIENT) ? 1 : 0);
|
|
}
|
|
|
|
/*
|
|
* __rep_noarchive
|
|
* Used by log_archive to determine if it is okay to remove
|
|
* log files.
|
|
*
|
|
* PUBLIC: int __rep_noarchive __P((DB_ENV *));
|
|
*/
|
|
int
|
|
__rep_noarchive(dbenv)
|
|
DB_ENV *dbenv;
|
|
{
|
|
DB_REP *db_rep;
|
|
REGENV *renv;
|
|
REGINFO *infop;
|
|
REP *rep;
|
|
time_t timestamp;
|
|
|
|
infop = dbenv->reginfo;
|
|
renv = infop->primary;
|
|
|
|
/*
|
|
* This is tested before REP_ON below because we always need
|
|
* to obey if any replication process has disabled archiving.
|
|
* Everything is in the environment region that we need here.
|
|
*/
|
|
if (F_ISSET(renv, DB_REGENV_REPLOCKED)) {
|
|
(void)time(×tamp);
|
|
TIMESTAMP_CHECK(dbenv, timestamp, renv);
|
|
/*
|
|
* Check if we're still locked out after checking
|
|
* the timestamp.
|
|
*/
|
|
if (F_ISSET(renv, DB_REGENV_REPLOCKED))
|
|
return (EINVAL);
|
|
}
|
|
|
|
if (!REP_ON(dbenv))
|
|
return (0);
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
if (F_ISSET(rep, REP_F_NOARCHIVE))
|
|
return (1);
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __rep_send_vote
|
|
* Send this site's vote for the election.
|
|
*
|
|
* PUBLIC: void __rep_send_vote __P((DB_ENV *, DB_LSN *, int, int, int,
|
|
* PUBLIC: u_int32_t, u_int32_t, int, u_int32_t));
|
|
*/
|
|
void
|
|
__rep_send_vote(dbenv, lsnp, nsites, nvotes, pri, tie, egen, eid, vtype)
|
|
DB_ENV *dbenv;
|
|
DB_LSN *lsnp;
|
|
int eid, nsites, nvotes, pri;
|
|
u_int32_t egen, tie, vtype;
|
|
{
|
|
DBT vote_dbt;
|
|
REP_VOTE_INFO vi;
|
|
|
|
memset(&vi, 0, sizeof(vi));
|
|
|
|
vi.egen = egen;
|
|
vi.priority = pri;
|
|
vi.nsites = nsites;
|
|
vi.nvotes = nvotes;
|
|
vi.tiebreaker = tie;
|
|
|
|
memset(&vote_dbt, 0, sizeof(vote_dbt));
|
|
vote_dbt.data = &vi;
|
|
vote_dbt.size = sizeof(vi);
|
|
|
|
(void)__rep_send_message(dbenv, eid, vtype, lsnp, &vote_dbt, 0, 0);
|
|
}
|
|
|
|
/*
|
|
* __rep_elect_done
|
|
* Clear all election information for this site. Assumes the
|
|
* caller hold the region mutex.
|
|
*
|
|
* PUBLIC: void __rep_elect_done __P((DB_ENV *, REP *));
|
|
*/
|
|
void
|
|
__rep_elect_done(dbenv, rep)
|
|
DB_ENV *dbenv;
|
|
REP *rep;
|
|
{
|
|
int inelect;
|
|
u_int32_t endsec, endusec;
|
|
#ifdef DIAGNOSTIC
|
|
DB_MSGBUF mb;
|
|
#else
|
|
COMPQUIET(dbenv, NULL);
|
|
#endif
|
|
inelect = IN_ELECTION_TALLY(rep);
|
|
F_CLR(rep, REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY);
|
|
rep->sites = 0;
|
|
rep->votes = 0;
|
|
if (inelect) {
|
|
if (rep->esec != 0) {
|
|
__os_clock(dbenv, &endsec, &endusec);
|
|
__db_difftime(rep->esec, endsec, rep->eusec, endusec,
|
|
&rep->stat.st_election_sec,
|
|
&rep->stat.st_election_usec);
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"Election finished in %u.%06u sec",
|
|
rep->stat.st_election_sec,
|
|
rep->stat.st_election_usec));
|
|
rep->esec = 0;
|
|
rep->eusec = 0;
|
|
}
|
|
rep->egen++;
|
|
}
|
|
RPRINT(dbenv, rep, (dbenv, &mb,
|
|
"Election done; egen %lu", (u_long)rep->egen));
|
|
}
|
|
|
|
/*
|
|
* __rep_grow_sites --
|
|
* Called to allocate more space in the election tally information.
|
|
* Called with the rep mutex held. We need to call the region mutex, so
|
|
* we need to make sure that we *never* acquire those mutexes in the
|
|
* opposite order.
|
|
*
|
|
* PUBLIC: int __rep_grow_sites __P((DB_ENV *dbenv, int nsites));
|
|
*/
|
|
int
|
|
__rep_grow_sites(dbenv, nsites)
|
|
DB_ENV *dbenv;
|
|
int nsites;
|
|
{
|
|
REGENV *renv;
|
|
REGINFO *infop;
|
|
REP *rep;
|
|
int nalloc, ret, *tally;
|
|
|
|
rep = ((DB_REP *)dbenv->rep_handle)->region;
|
|
|
|
/*
|
|
* Allocate either twice the current allocation or nsites,
|
|
* whichever is more.
|
|
*/
|
|
nalloc = 2 * rep->asites;
|
|
if (nalloc < nsites)
|
|
nalloc = nsites;
|
|
|
|
infop = dbenv->reginfo;
|
|
renv = infop->primary;
|
|
MUTEX_LOCK(dbenv, renv->mtx_regenv);
|
|
|
|
/*
|
|
* We allocate 2 tally regions, one for tallying VOTE1's and
|
|
* one for VOTE2's. Always grow them in tandem, because if we
|
|
* get more VOTE1's we'll always expect more VOTE2's then too.
|
|
*/
|
|
if ((ret = __db_shalloc(infop,
|
|
(size_t)nalloc * sizeof(REP_VTALLY), sizeof(REP_VTALLY),
|
|
&tally)) == 0) {
|
|
if (rep->tally_off != INVALID_ROFF)
|
|
__db_shalloc_free(
|
|
infop, R_ADDR(infop, rep->tally_off));
|
|
rep->tally_off = R_OFFSET(infop, tally);
|
|
if ((ret = __db_shalloc(infop,
|
|
(size_t)nalloc * sizeof(REP_VTALLY), sizeof(REP_VTALLY),
|
|
&tally)) == 0) {
|
|
/* Success */
|
|
if (rep->v2tally_off != INVALID_ROFF)
|
|
__db_shalloc_free(infop,
|
|
R_ADDR(infop, rep->v2tally_off));
|
|
rep->v2tally_off = R_OFFSET(infop, tally);
|
|
rep->asites = nalloc;
|
|
rep->nsites = nsites;
|
|
} else {
|
|
/*
|
|
* We were unable to allocate both. So, we must
|
|
* free the first one and reinitialize. If
|
|
* v2tally_off is valid, it is from an old
|
|
* allocation and we are clearing it all out due
|
|
* to the error.
|
|
*/
|
|
if (rep->v2tally_off != INVALID_ROFF)
|
|
__db_shalloc_free(infop,
|
|
R_ADDR(infop, rep->v2tally_off));
|
|
__db_shalloc_free(infop,
|
|
R_ADDR(infop, rep->tally_off));
|
|
rep->v2tally_off = rep->tally_off = INVALID_ROFF;
|
|
rep->asites = 0;
|
|
rep->nsites = 0;
|
|
}
|
|
}
|
|
MUTEX_UNLOCK(dbenv, renv->mtx_regenv);
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* __env_rep_enter --
|
|
*
|
|
* Check if we are in the middle of replication initialization and/or
|
|
* recovery, and if so, disallow operations. If operations are allowed,
|
|
* increment handle-counts, so that we do not start recovery while we
|
|
* are operating in the library.
|
|
*
|
|
* PUBLIC: int __env_rep_enter __P((DB_ENV *, int));
|
|
*/
|
|
int
|
|
__env_rep_enter(dbenv, checklock)
|
|
DB_ENV *dbenv;
|
|
int checklock;
|
|
{
|
|
DB_REP *db_rep;
|
|
REGENV *renv;
|
|
REGINFO *infop;
|
|
REP *rep;
|
|
int cnt;
|
|
time_t timestamp;
|
|
|
|
/* Check if locks have been globally turned off. */
|
|
if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
infop = dbenv->reginfo;
|
|
renv = infop->primary;
|
|
if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
|
|
(void)time(×tamp);
|
|
TIMESTAMP_CHECK(dbenv, timestamp, renv);
|
|
/*
|
|
* Check if we're still locked out after checking
|
|
* the timestamp.
|
|
*/
|
|
if (F_ISSET(renv, DB_REGENV_REPLOCKED))
|
|
return (EINVAL);
|
|
}
|
|
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
for (cnt = 0; rep->in_recovery;) {
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
if (FLD_ISSET(rep->config, REP_C_NOWAIT)) {
|
|
__db_err(dbenv,
|
|
"Operation locked out. Waiting for replication recovery to complete");
|
|
return (DB_REP_LOCKOUT);
|
|
}
|
|
__os_sleep(dbenv, 1, 0);
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
if (++cnt % 60 == 0)
|
|
__db_err(dbenv,
|
|
"DB_ENV handle waiting %d minutes for replication recovery to complete",
|
|
cnt / 60);
|
|
}
|
|
rep->handle_cnt++;
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __env_db_rep_exit --
|
|
*
|
|
* Decrement handle count upon routine exit.
|
|
*
|
|
* PUBLIC: int __env_db_rep_exit __P((DB_ENV *));
|
|
*/
|
|
int
|
|
__env_db_rep_exit(dbenv)
|
|
DB_ENV *dbenv;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
|
|
/* Check if locks have been globally turned off. */
|
|
if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
rep->handle_cnt--;
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __db_rep_enter --
|
|
* Called in replicated environments to keep track of in-use handles
|
|
* and prevent any concurrent operation during recovery. If checkgen is
|
|
* non-zero, then we verify that the dbp has the same handle as the env.
|
|
*
|
|
* If return_now is non-zero, we'll return DB_DEADLOCK immediately, else we'll
|
|
* sleep before returning DB_DEADLOCK. Without the sleep, it is likely
|
|
* the application will immediately try again and could reach a retry
|
|
* limit before replication has a chance to finish. The sleep increases
|
|
* the probability that an application retry will succeed.
|
|
*
|
|
* PUBLIC: int __db_rep_enter __P((DB *, int, int, int));
|
|
*/
|
|
int
|
|
__db_rep_enter(dbp, checkgen, checklock, return_now)
|
|
DB *dbp;
|
|
int checkgen, checklock, return_now;
|
|
{
|
|
DB_ENV *dbenv;
|
|
DB_REP *db_rep;
|
|
REGENV *renv;
|
|
REGINFO *infop;
|
|
REP *rep;
|
|
time_t timestamp;
|
|
|
|
dbenv = dbp->dbenv;
|
|
/* Check if locks have been globally turned off. */
|
|
if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
infop = dbenv->reginfo;
|
|
renv = infop->primary;
|
|
|
|
if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
|
|
(void)time(×tamp);
|
|
TIMESTAMP_CHECK(dbenv, timestamp, renv);
|
|
/*
|
|
* Check if we're still locked out after checking
|
|
* the timestamp.
|
|
*/
|
|
if (F_ISSET(renv, DB_REGENV_REPLOCKED))
|
|
return (EINVAL);
|
|
}
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
if (F_ISSET(rep, REP_F_READY)) {
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
if (!return_now)
|
|
__os_sleep(dbenv, 5, 0);
|
|
return (DB_LOCK_DEADLOCK);
|
|
}
|
|
|
|
if (checkgen && dbp->timestamp != renv->rep_timestamp) {
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
__db_err(dbenv, "%s %s",
|
|
"replication recovery unrolled committed transactions;",
|
|
"open DB and DBcursor handles must be closed");
|
|
return (DB_REP_HANDLE_DEAD);
|
|
}
|
|
rep->handle_cnt++;
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __op_rep_enter --
|
|
*
|
|
* Check if we are in the middle of replication initialization and/or
|
|
* recovery, and if so, disallow new multi-step operations, such as
|
|
* transaction and memp gets. If operations are allowed,
|
|
* increment the op_cnt, so that we do not start recovery while we have
|
|
* active operations.
|
|
*
|
|
* PUBLIC: int __op_rep_enter __P((DB_ENV *));
|
|
*/
|
|
int
|
|
__op_rep_enter(dbenv)
|
|
DB_ENV *dbenv;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
int cnt;
|
|
|
|
/* Check if locks have been globally turned off. */
|
|
if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
for (cnt = 0; F_ISSET(rep, REP_F_READY);) {
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
if (FLD_ISSET(rep->config, REP_C_NOWAIT)) {
|
|
__db_err(dbenv,
|
|
"Operation locked out. Waiting for replication recovery to complete");
|
|
return (DB_REP_LOCKOUT);
|
|
}
|
|
__os_sleep(dbenv, 5, 0);
|
|
cnt += 5;
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
if (cnt % 60 == 0)
|
|
__db_err(dbenv,
|
|
"__op_rep_enter waiting %d minutes for op count to drain",
|
|
cnt / 60);
|
|
}
|
|
rep->op_cnt++;
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __op_rep_exit --
|
|
*
|
|
* Decrement op count upon transaction commit/abort/discard or
|
|
* memp_fput.
|
|
*
|
|
* PUBLIC: int __op_rep_exit __P((DB_ENV *));
|
|
*/
|
|
int
|
|
__op_rep_exit(dbenv)
|
|
DB_ENV *dbenv;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
|
|
/* Check if locks have been globally turned off. */
|
|
if (F_ISSET(dbenv, DB_ENV_NOLOCKING))
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
DB_ASSERT(rep->op_cnt > 0);
|
|
rep->op_cnt--;
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __rep_get_gen --
|
|
*
|
|
* Get the generation number from a replicated environment.
|
|
*
|
|
* PUBLIC: int __rep_get_gen __P((DB_ENV *, u_int32_t *));
|
|
*/
|
|
int
|
|
__rep_get_gen(dbenv, genp)
|
|
DB_ENV *dbenv;
|
|
u_int32_t *genp;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
if (rep->recover_gen > rep->gen)
|
|
*genp = rep->recover_gen;
|
|
else
|
|
*genp = rep->gen;
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __rep_lockout --
|
|
* Coordinate with other threads in the library and active txns so
|
|
* that we can run single-threaded, for recovery or internal backup.
|
|
* Assumes the caller holds the region mutex.
|
|
*
|
|
* PUBLIC: int __rep_lockout __P((DB_ENV *, REP *, u_int32_t));
|
|
*/
|
|
int
|
|
__rep_lockout(dbenv, rep, msg_th)
|
|
DB_ENV *dbenv;
|
|
REP *rep;
|
|
u_int32_t msg_th;
|
|
{
|
|
int wait_cnt;
|
|
|
|
/* Phase 1: set REP_F_READY and wait for op_cnt to go to 0. */
|
|
F_SET(rep, REP_F_READY);
|
|
for (wait_cnt = 0; rep->op_cnt != 0;) {
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
__os_sleep(dbenv, 1, 0);
|
|
#if defined(DIAGNOSTIC) || defined(CONFIG_TEST)
|
|
if (++wait_cnt % 60 == 0)
|
|
__db_err(dbenv,
|
|
"Waiting for txn_cnt to run replication recovery/backup for %d minutes",
|
|
wait_cnt / 60);
|
|
#endif
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
}
|
|
|
|
/*
|
|
* Phase 2: set in_recovery and wait for handle count to go
|
|
* to 0 and for the number of threads in __rep_process_message
|
|
* to go to 1 (us).
|
|
*/
|
|
rep->in_recovery = 1;
|
|
for (wait_cnt = 0; rep->handle_cnt != 0 || rep->msg_th > msg_th;) {
|
|
REP_SYSTEM_UNLOCK(dbenv);
|
|
__os_sleep(dbenv, 1, 0);
|
|
#ifdef DIAGNOSTIC
|
|
if (++wait_cnt % 60 == 0)
|
|
__db_err(dbenv,
|
|
"Waiting for handle count to run replication recovery/backup for %d minutes",
|
|
wait_cnt / 60);
|
|
#endif
|
|
REP_SYSTEM_LOCK(dbenv);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* __rep_send_throttle -
|
|
* Send a record, throttling if necessary. Callers of this function
|
|
* will throttle - breaking out of their loop, if the repth->type field
|
|
* changes from the normal message type to the *_MORE message type.
|
|
* This function will send the normal type unless throttling gets invoked.
|
|
* Then it sets the type field and sends the _MORE message.
|
|
*
|
|
* PUBLIC: int __rep_send_throttle __P((DB_ENV *, int, REP_THROTTLE *,
|
|
* PUBLIC: u_int32_t));
|
|
*/
|
|
int
|
|
__rep_send_throttle(dbenv, eid, repth, flags)
|
|
DB_ENV *dbenv;
|
|
int eid;
|
|
REP_THROTTLE *repth;
|
|
u_int32_t flags;
|
|
{
|
|
DB_REP *db_rep;
|
|
REP *rep;
|
|
u_int32_t size, typemore;
|
|
int check_limit;
|
|
|
|
check_limit = repth->gbytes != 0 || repth->bytes != 0;
|
|
/*
|
|
* If we only want to do throttle processing and we don't have it
|
|
* turned on, return immediately.
|
|
*/
|
|
if (!check_limit && LF_ISSET(REP_THROTTLE_ONLY))
|
|
return (0);
|
|
|
|
db_rep = dbenv->rep_handle;
|
|
rep = db_rep->region;
|
|
typemore = 0;
|
|
if (repth->type == REP_LOG)
|
|
typemore = REP_LOG_MORE;
|
|
if (repth->type == REP_PAGE)
|
|
typemore = REP_PAGE_MORE;
|
|
DB_ASSERT(typemore != 0);
|
|
|
|
/*
|
|
* data_dbt.size is only the size of the log
|
|
* record; it doesn't count the size of the
|
|
* control structure. Factor that in as well
|
|
* so we're not off by a lot if our log records
|
|
* are small.
|
|
*/
|
|
size = repth->data_dbt->size + sizeof(REP_CONTROL);
|
|
if (check_limit) {
|
|
if (repth->lsn.offset == 28) {
|
|
repth->type = typemore;
|
|
goto send;
|
|
}
|
|
while (repth->bytes <= size) {
|
|
if (repth->gbytes > 0) {
|
|
repth->bytes += GIGABYTE;
|
|
--(repth->gbytes);
|
|
continue;
|
|
}
|
|
/*
|
|
* We don't hold the rep mutex,
|
|
* and may miscount.
|
|
*/
|
|
rep->stat.st_nthrottles++;
|
|
repth->type = typemore;
|
|
goto send;
|
|
}
|
|
repth->bytes -= size;
|
|
}
|
|
/*
|
|
* Always send if it is typemore, otherwise send only if
|
|
* REP_THROTTLE_ONLY is not set.
|
|
*/
|
|
send: if ((repth->type == typemore || !LF_ISSET(REP_THROTTLE_ONLY)) &&
|
|
(__rep_send_message(dbenv, eid, repth->type,
|
|
&repth->lsn, repth->data_dbt, DB_LOG_RESEND, 0) != 0))
|
|
return (1);
|
|
return (0);
|
|
}
|
|
|
|
#ifdef DIAGNOSTIC
|
|
/*
|
|
* PUBLIC: void __rep_print_message __P((DB_ENV *, int, REP_CONTROL *, char *));
|
|
*/
|
|
void
|
|
__rep_print_message(dbenv, eid, rp, str)
|
|
DB_ENV *dbenv;
|
|
int eid;
|
|
REP_CONTROL *rp;
|
|
char *str;
|
|
{
|
|
DB_MSGBUF mb;
|
|
char *type;
|
|
|
|
switch (rp->rectype) {
|
|
case REP_ALIVE:
|
|
type = "alive";
|
|
break;
|
|
case REP_ALIVE_REQ:
|
|
type = "alive_req";
|
|
break;
|
|
case REP_ALL_REQ:
|
|
type = "all_req";
|
|
break;
|
|
case REP_BULK_LOG:
|
|
type = "bulk_log";
|
|
break;
|
|
case REP_BULK_PAGE:
|
|
type = "bulk_page";
|
|
break;
|
|
case REP_DUPMASTER:
|
|
type = "dupmaster";
|
|
break;
|
|
case REP_FILE:
|
|
type = "file";
|
|
break;
|
|
case REP_FILE_FAIL:
|
|
type = "file_fail";
|
|
break;
|
|
case REP_FILE_REQ:
|
|
type = "file_req";
|
|
break;
|
|
case REP_LOG:
|
|
type = "log";
|
|
break;
|
|
case REP_LOG_MORE:
|
|
type = "log_more";
|
|
break;
|
|
case REP_LOG_REQ:
|
|
type = "log_req";
|
|
break;
|
|
case REP_MASTER_REQ:
|
|
type = "master_req";
|
|
break;
|
|
case REP_NEWCLIENT:
|
|
type = "newclient";
|
|
break;
|
|
case REP_NEWFILE:
|
|
type = "newfile";
|
|
break;
|
|
case REP_NEWMASTER:
|
|
type = "newmaster";
|
|
break;
|
|
case REP_NEWSITE:
|
|
type = "newsite";
|
|
break;
|
|
case REP_PAGE:
|
|
type = "page";
|
|
break;
|
|
case REP_PAGE_FAIL:
|
|
type = "page_fail";
|
|
break;
|
|
case REP_PAGE_MORE:
|
|
type = "page_more";
|
|
break;
|
|
case REP_PAGE_REQ:
|
|
type = "page_req";
|
|
break;
|
|
case REP_REREQUEST:
|
|
type = "rerequest";
|
|
break;
|
|
case REP_UPDATE:
|
|
type = "update";
|
|
break;
|
|
case REP_UPDATE_REQ:
|
|
type = "update_req";
|
|
break;
|
|
case REP_VERIFY:
|
|
type = "verify";
|
|
break;
|
|
case REP_VERIFY_FAIL:
|
|
type = "verify_fail";
|
|
break;
|
|
case REP_VERIFY_REQ:
|
|
type = "verify_req";
|
|
break;
|
|
case REP_VOTE1:
|
|
type = "vote1";
|
|
break;
|
|
case REP_VOTE2:
|
|
type = "vote2";
|
|
break;
|
|
default:
|
|
type = "NOTYPE";
|
|
break;
|
|
}
|
|
RPRINT(dbenv, ((REP *)((DB_REP *)(dbenv)->rep_handle)->region),
|
|
(dbenv, &mb, "%s %s: gen = %lu eid %d, type %s, LSN [%lu][%lu]",
|
|
dbenv->db_home, str, (u_long)rp->gen,
|
|
eid, type, (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
|
|
}
|
|
#endif
|