/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
#include "db_config.h"

#ifndef lint
static const char revid[] = "$Id: mp_sync.c,v 11.64 2002/08/25 16:00:27 bostic Exp $";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <stdlib.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"

typedef struct {
	DB_MPOOL_HASH *track_hp;	/* Hash bucket. */
	roff_t	  track_off;		/* Page file offset. */
	db_pgno_t track_pgno;		/* Page number. */
} BH_TRACK;
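
/*
 * A BH_TRACK records the hash bucket and the page's file offset and page
 * number rather than a pointer to the buffer header itself: the buffer
 * may be written or discarded by another thread between the time the
 * array is built and the time a slot is processed, so __memp_sync_int
 * re-searches the bucket under its mutex before touching the buffer.
 */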

static int __bhcmp __P((const void *, const void *));
static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *));
static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *));

/*
 * __memp_sync --
 *	Mpool sync function.
 *
 * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *));
 */
int
__memp_sync(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	DB_MPOOL *dbmp;
	MPOOL *mp;
	int ret;

	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL);

	/*
	 * If no LSN is provided, flush the entire cache (reasonable usage
	 * even if there's no log subsystem configured).
	 */
	if (lsnp != NULL)
		ENV_REQUIRES_CONFIG(dbenv,
		    dbenv->lg_handle, "memp_sync", DB_INIT_LOG);

	dbmp = dbenv->mp_handle;
	mp = dbmp->reginfo[0].primary;

	/* If we've flushed to the requested LSN, return that information. */
	if (lsnp != NULL) {
		R_LOCK(dbenv, dbmp->reginfo);
		if (log_compare(lsnp, &mp->lsn) <= 0) {
			*lsnp = mp->lsn;
			R_UNLOCK(dbenv, dbmp->reginfo);
			return (0);
		}
		R_UNLOCK(dbenv, dbmp->reginfo);
	}

	if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0)
		return (ret);

	if (lsnp != NULL) {
		R_LOCK(dbenv, dbmp->reginfo);
		if (log_compare(lsnp, &mp->lsn) > 0)
			mp->lsn = *lsnp;
		R_UNLOCK(dbenv, dbmp->reginfo);
	}

	return (0);
}
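
/*
 * Usage sketch (illustrative, not part of the original source): callers
 * reach this function through the DB_ENV method table; a NULL LSN flushes
 * the entire cache.  Assuming dbenv is an environment handle opened with
 * DB_INIT_MPOOL:
 *
 *	int ret;
 *
 *	if ((ret = dbenv->memp_sync(dbenv, NULL)) != 0)
 *		dbenv->err(dbenv, ret, "memp_sync");
 */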

/*
 * __memp_fsync --
 *	Mpool file sync function.
 *
 * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
 */
int
__memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;

	dbmp = dbmfp->dbmp;
	dbenv = dbmp->dbenv;

	PANIC_CHECK(dbenv);

	/*
	 * If this handle doesn't have a file descriptor that's open for
	 * writing, or if the file is a temporary, there's no reason to
	 * proceed further.
	 */
	if (F_ISSET(dbmfp, MP_READONLY))
		return (0);

	if (F_ISSET(dbmfp->mfp, MP_TEMP))
		return (0);

	return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
}
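
/*
 * Usage sketch (illustrative, not part of the original source): this is
 * the engine behind the DB_MPOOLFILE sync method, so flushing a single
 * file's dirty pages looks roughly like:
 *
 *	DB_MPOOLFILE *dbmfp;	(an open per-file handle)
 *	int ret;
 *
 *	if ((ret = dbmfp->sync(dbmfp)) != 0)
 *		(void)fprintf(stderr, "sync: %s\n", db_strerror(ret));
 */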

/*
 * __mp_xxx_fh --
 *	Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
 */
int
__mp_xxx_fh(dbmfp, fhp)
	DB_MPOOLFILE *dbmfp;
	DB_FH **fhp;
{
	DB_ENV *dbenv;

	/*
	 * This is a truly spectacular layering violation, intended ONLY to
	 * support compatibility for the DB 1.85 DB->fd call.
	 *
	 * Sync the database file to disk, creating the file as necessary.
	 *
	 * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
	 * The MP_READONLY test isn't interesting because we will either
	 * already have a file descriptor (we opened the database file for
	 * reading) or we aren't readonly (we created the database which
	 * requires write privileges).  The MP_TEMP test isn't interesting
	 * because we want to write to the backing file regardless so that
	 * we get a file descriptor to return.
	 */
	*fhp = dbmfp->fhp;
	if (F_ISSET(dbmfp->fhp, DB_FH_VALID))
		return (0);
	dbenv = dbmfp->dbmp->dbenv;

	return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
}

/*
 * __memp_sync_int --
 *	Mpool sync internal function.
 *
 * PUBLIC: int __memp_sync_int
 * PUBLIC:     __P((DB_ENV *, DB_MPOOLFILE *, int, db_sync_op, int *));
 */
int
__memp_sync_int(dbenv, dbmfp, ar_max, op, wrotep)
	DB_ENV *dbenv;
	DB_MPOOLFILE *dbmfp;
	int ar_max, *wrotep;
	db_sync_op op;
{
	BH *bhp;
	BH_TRACK *bharray;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	DB_MUTEX *mutexp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	u_int32_t n_cache;
	int ar_cnt, hb_lock, i, pass, remaining, ret, t_ret, wait_cnt, wrote;

	dbmp = dbenv->mp_handle;
	mp = dbmp->reginfo[0].primary;
	pass = wrote = 0;

	/*
	 * If the caller does not specify how many pages to track, assume
	 * one per hash bucket.
	 */
	if (ar_max == 0)
		ar_max = mp->nreg * mp->htab_buckets;
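
	/*
	 * The allocation below is only an estimate: if the array fills up
	 * during the walk, __os_realloc doubles it.  If that doubling ever
	 * fails, the walk stops, the bucket is unlocked and we jump to err
	 * with the allocation error.
	 */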
	if ((ret =
	    __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
		return (ret);

	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written and all pinned buffers to be potentially written, depending
	 * on our flags.
	 */
	for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
		c_mp = dbmp->reginfo[n_cache].primary;

		hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
		for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
			/*
			 * We can check for empty buckets before locking as we
			 * only care if the pointer is zero or non-zero.  We
			 * can ignore empty buckets because we only need to
			 * write buffers that were dirty before we started.
			 */
			if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
				continue;

			MUTEX_LOCK(dbenv, &hp->hash_mutex);
			for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
			    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
				/* Always ignore unreferenced, clean pages. */
				if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))
					continue;

				/*
				 * Checkpoints have to wait on all pinned pages,
				 * as pages may be marked dirty when returned to
				 * the cache.
				 *
				 * File syncs only wait on pages both pinned and
				 * dirty.  (We don't care if pages are marked
				 * dirty when returned to the cache, that means
				 * there's another writing thread and flushing
				 * the cache for this handle is meaningless.)
				 */
				if (op == DB_SYNC_FILE &&
				    !F_ISSET(bhp, BH_DIRTY))
					continue;

				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

				/*
				 * Ignore temporary files -- this means you
				 * can't even flush temporary files by handle.
				 * (Checkpoint doesn't require temporary files
				 * be flushed and the underlying buffer write
				 * routine may not be able to write it anyway.)
				 */
				if (F_ISSET(mfp, MP_TEMP))
					continue;

				/*
				 * If we're flushing a specific file, see if
				 * this page is from that file.
				 */
				if (dbmfp != NULL && mfp != dbmfp->mfp)
					continue;

				/*
				 * Ignore files that aren't involved in DB's
				 * transactional operations during checkpoints.
				 */
				if (dbmfp == NULL && mfp->lsn_off == -1)
					continue;

				/* Track the buffer, we want it. */
				bharray[ar_cnt].track_hp = hp;
				bharray[ar_cnt].track_pgno = bhp->pgno;
				bharray[ar_cnt].track_off = bhp->mf_offset;
				ar_cnt++;

				if (ar_cnt >= ar_max) {
					if ((ret = __os_realloc(dbenv,
					    (ar_max * 2) * sizeof(BH_TRACK),
					    &bharray)) != 0)
						break;
					ar_max *= 2;
				}
			}
			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

			if (ret != 0)
				goto err;
		}
	}

	/* If there are no buffers to write, we're done. */
	if (ar_cnt == 0)
		goto done;

	/*
	 * Write the buffers in file/page order, trying to reduce seeks by the
	 * filesystem and, when pages are smaller than filesystem block sizes,
	 * reduce the actual number of writes.
	 */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);
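
	/*
	 * The array is now ordered by (file, page number): __bhcmp compares
	 * the shared memory pool offset of the file first and the page
	 * number second, so each file's pages are visited sequentially.
	 */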
	/*
	 * If we're trickling buffers, only write enough to reach the correct
	 * percentage for this region.  We may not write enough if the dirty
	 * buffers have an unbalanced distribution among the regions, but that
	 * seems unlikely.
	 */
	if (op == DB_SYNC_TRICKLE && ar_cnt > ar_max / (int)mp->nreg)
		ar_cnt = ar_max / (int)mp->nreg;

	/*
	 * Flush the log.  We have to ensure the log records reflecting the
	 * changes on the database pages we're writing have already made it
	 * to disk.  We still have to check the log each time we write a page
	 * (because pages we are about to write may be modified after we have
	 * flushed the log), but in general this will at least avoid any I/O
	 * on the log's part.
	 */
	if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0)
		goto err;

	/*
	 * Walk the array, writing buffers.  When we write a buffer, we NULL
	 * out its hash bucket pointer so we don't process a slot more than
	 * once.
	 */
	for (remaining = ar_cnt, i = pass = 0; remaining > 0; ++i) {
		if (i >= ar_cnt) {
			i = 0;
			++pass;
			__os_sleep(dbenv, 1, 0);
		}
		if ((hp = bharray[i].track_hp) == NULL)
			continue;

		/* Lock the hash bucket and find the buffer. */
		mutexp = &hp->hash_mutex;
		MUTEX_LOCK(dbenv, mutexp);
		for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
			if (bhp->pgno == bharray[i].track_pgno &&
			    bhp->mf_offset == bharray[i].track_off)
				break;

		/*
		 * If we can't find the buffer we're done, somebody else had
		 * to have written it.
		 *
		 * If the buffer isn't pinned or dirty, we're done, there's
		 * no work needed.
		 */
		if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) {
			MUTEX_UNLOCK(dbenv, mutexp);
			--remaining;
			bharray[i].track_hp = NULL;
			continue;
		}

		/*
		 * If the buffer is locked by another thread, ignore it, we'll
		 * come back to it.
		 *
		 * If the buffer is pinned and it's only the first or second
		 * time we have looked at it, ignore it, we'll come back to
		 * it.
		 *
		 * In either case, skip the buffer if we're not required to
		 * write it.
		 */
		if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
			MUTEX_UNLOCK(dbenv, mutexp);
			if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) {
				--remaining;
				bharray[i].track_hp = NULL;
			}
			continue;
		}

		/*
		 * The buffer is either pinned or dirty.
		 *
		 * Set the sync wait-for count, used to count down outstanding
		 * references to this buffer as they are returned to the cache.
		 */
		bhp->ref_sync = bhp->ref;

		/* Pin the buffer into memory and lock it. */
		++bhp->ref;
		F_SET(bhp, BH_LOCKED);
		MUTEX_LOCK(dbenv, &bhp->mutex);

		/*
		 * Unlock the hash bucket and wait for the wait-for count to
		 * go to 0.  No new thread can acquire the buffer because we
		 * have it locked.
		 *
		 * If a thread attempts to re-pin a page, the wait-for count
		 * will never go to 0 (the thread spins on our buffer lock,
		 * while we spin on the thread's ref count).  Give up if we
		 * don't get the buffer in 3 seconds, we can try again later.
		 *
		 * If, when the wait-for count goes to 0, the buffer is found
		 * to be dirty, write it.
		 */
		MUTEX_UNLOCK(dbenv, mutexp);
		for (wait_cnt = 1;
		    bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
			__os_sleep(dbenv, 1, 0);
		MUTEX_LOCK(dbenv, mutexp);
		hb_lock = 1;
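
		/*
		 * From here on, hb_lock records whether we hold the hash
		 * bucket mutex: the write path below drops it around
		 * __memp_bhwrite and uses this flag to decide whether the
		 * lock must be re-acquired before cleanup.
		 */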
		/*
		 * If the ref_sync count has gone to 0, we're going to be done
		 * with this buffer no matter what happens.
		 */
		if (bhp->ref_sync == 0) {
			--remaining;
			bharray[i].track_hp = NULL;
		}

		/*
		 * If the ref_sync count has gone to 0 and the buffer is still
		 * dirty, we write it.  We only try to write the buffer once.
		 * Any process checkpointing or trickle-flushing the pool
		 * must be able to write any underlying file -- if the write
		 * fails, error out.  It would be very strange if file sync
		 * failed to write, but we don't care if it happens.
		 */
		if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
			hb_lock = 0;
			MUTEX_UNLOCK(dbenv, mutexp);

			mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
			if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
				++wrote;
			else if (op == DB_SYNC_CACHE || op == DB_SYNC_TRICKLE)
				__db_err(dbenv, "%s: unable to flush page: %lu",
				    __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
			else
				ret = 0;
		}

		/*
		 * If the ref_sync count never went to 0, if the buffer was
		 * written by another thread, or if the write failed, we
		 * still have the buffer locked.
		 *
		 * We may or may not currently hold the hash bucket mutex.  If
		 * the __memp_bhwrite -> __memp_pgwrite call was successful,
		 * then __memp_pgwrite will have swapped the buffer lock for
		 * the hash lock.  All other call paths will leave us without
		 * the hash bucket lock.
		 *
		 * The order of mutexes above was to acquire the buffer lock
		 * while holding the hash bucket lock.  Don't deadlock here,
		 * release the buffer lock and then acquire the hash bucket
		 * lock.
		 */
		if (F_ISSET(bhp, BH_LOCKED)) {
			F_CLR(bhp, BH_LOCKED);
			MUTEX_UNLOCK(dbenv, &bhp->mutex);

			if (!hb_lock)
				MUTEX_LOCK(dbenv, mutexp);
		}

		/*
		 * Reset the ref_sync count regardless of our success, we're
		 * done with this buffer for now.
		 */
		bhp->ref_sync = 0;

		/* Discard our reference and unlock the bucket. */
		--bhp->ref;
		MUTEX_UNLOCK(dbenv, mutexp);

		if (ret != 0)
			break;
	}

done:	/* If we've opened files to flush pages, close them. */
	if ((t_ret = __memp_close_flush_files(dbenv, dbmp)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * If doing a checkpoint or flushing a file for the application, we
	 * have to force the pages to disk.  We don't do this as we go along
	 * because we want to give the OS as much time as possible to lazily
	 * flush, and because we have to flush files that might not even have
	 * had dirty buffers in the cache, so we have to walk the files list.
	 */
	if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
		if (dbmfp == NULL)
			ret = __memp_sync_files(dbenv, dbmp);
		else
			ret = __os_fsync(dbenv, dbmfp->fhp);
	}

err:	__os_free(dbenv, bharray);
	if (wrotep != NULL)
		*wrotep = wrote;

	return (ret);
}
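
/*
 * Caller sketch (illustrative, not part of the original source): this
 * internal function serves several entry points -- __memp_sync passes
 * DB_SYNC_CACHE, __memp_fsync passes DB_SYNC_FILE, and the trickle
 * interface asks for a bounded number of page writes, roughly:
 *
 *	int nwrote;
 *
 *	ret = __memp_sync_int(dbenv,
 *	    NULL, pages_needed, DB_SYNC_TRICKLE, &nwrote);
 *
 * where pages_needed is a placeholder for the count the caller computed.
 */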

/*
 * __memp_sync_files --
 *	Sync all the files in the environment, open or not.
 */
static int
__memp_sync_files(dbenv, dbmp)
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
{
	DB_MPOOLFILE *dbmfp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	int ret, t_ret;

	ret = 0;
	mp = dbmp->reginfo[0].primary;

	R_LOCK(dbenv, dbmp->reginfo);
	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
		if (mfp->stat.st_page_out == 0 ||
		    F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
			continue;

		/* Look for an already open handle. */
		ret = 0;
		MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
		for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
		    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
			if (dbmfp->mfp == mfp) {
				ret = __os_fsync(dbenv, dbmfp->fhp);
				break;
			}
		MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
		if (ret != 0)
			goto err;

		/* If we don't find one, open one. */
		if (dbmfp == NULL) {
			if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
				goto err;
			ret = __memp_fopen_int(
			    dbmfp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
			    0, 0, mfp->stat.st_pagesize);
			if (ret == 0)
				ret = __os_fsync(dbenv, dbmfp->fhp);
			if ((t_ret =
			    __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
				ret = t_ret;
			if (ret != 0)
				goto err;
		}
	}
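
	/*
	 * The "if (0)" block below is reachable only through the gotos
	 * above: it lets the error path name the file that failed while
	 * sharing the R_UNLOCK cleanup with the success path.
	 */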
	if (0) {
err:		__db_err(dbenv, "%s: cannot sync: %s",
		    R_ADDR(dbmp->reginfo, mfp->path_off), db_strerror(ret));
	}
	R_UNLOCK(dbenv, dbmp->reginfo);

	return (ret);
}

/*
 * __memp_close_flush_files --
 *	Close files opened only to flush buffers.
 */
static int
__memp_close_flush_files(dbenv, dbmp)
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
{
	DB_MPOOLFILE *dbmfp;
	int ret;

	/*
	 * The routine exists because we must close files opened by sync to
	 * flush buffers.  There are two cases: first, extent files have to
	 * be closed so they may be removed when empty.  Second, regular
	 * files have to be closed so we don't run out of descriptors (for
	 * example, an application partitioning its data into databases
	 * based on timestamps, so there's a continually increasing set of
	 * files).
	 *
	 * We mark files opened in the __memp_bhwrite() function with the
	 * MP_FLUSH flag.  Here we walk through our file descriptor list,
	 * and, if a file was opened by __memp_bhwrite(), we close it.
	 */
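	/*
	 * Note that the handle mutex is dropped before each close, so the
	 * file-descriptor list may change underneath us; the walk therefore
	 * restarts from the head of the queue after every close.
	 */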
retry:	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
		if (F_ISSET(dbmfp, MP_FLUSH)) {
			F_CLR(dbmfp, MP_FLUSH);
			MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
			if ((ret = __memp_fclose_int(dbmfp, 0)) != 0)
				return (ret);
			goto retry;
		}
	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);

	return (0);
}

static int
__bhcmp(p1, p2)
	const void *p1, *p2;
{
	BH_TRACK *bhp1, *bhp2;

	bhp1 = (BH_TRACK *)p1;
	bhp2 = (BH_TRACK *)p2;

	/* Sort by file (shared memory pool offset). */
	if (bhp1->track_off < bhp2->track_off)
		return (-1);
	if (bhp1->track_off > bhp2->track_off)
		return (1);

	/*
	 * !!!
	 * Defend against badly written quicksort code calling the comparison
	 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
	 */
	if (bhp1->track_pgno < bhp2->track_pgno)
		return (-1);
	if (bhp1->track_pgno > bhp2->track_pgno)
		return (1);
	return (0);
}