/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
#include "db_config.h"

#ifndef lint
static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <string.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"

#ifdef HAVE_FILESYSTEM_NOTZERO
static int __memp_fs_notzero
    __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
#endif

/*
 * __memp_fget --
 *	Get a page from the file.
 *
 * PUBLIC: int __memp_fget
 * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
 */
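/*
 * Illustrative caller sketch (not part of this file; assumes an open
 * DB_MPOOLFILE handle and elides error handling).  A page is fetched
 * by number through the handle's get method, which resolves to this
 * function, and must later be returned to the pool with put:
 *
 *	db_pgno_t pgno;
 *	void *addr;
 *	int ret;
 *
 *	pgno = 1;
 *	if ((ret = dbmfp->get(dbmfp, &pgno, 0, &addr)) != 0)
 *		return (ret);
 *	... use the page at addr ...
 *	ret = dbmfp->put(dbmfp, addr, 0);
 *
 * DB_MPOOL_CREATE creates the page if it doesn't yet exist,
 * DB_MPOOL_LAST returns the last page in the file, and DB_MPOOL_NEW
 * allocates a new page number, returning it through pgnoaddr.
 */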
int
__memp_fget(dbmfp, pgnoaddr, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	u_int32_t flags;
	void *addrp;
{
	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
	BH *alloc_bhp, *bhp;
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	roff_t mf_offset;
	u_int32_t n_cache, st_hsearch;
	int b_incr, extending, first, ret;

	*(void **)addrp = NULL;

	dbmp = dbmfp->dbmp;
	dbenv = dbmp->dbenv;

	PANIC_CHECK(dbenv);

	mp = dbmp->reginfo[0].primary;
	mfp = dbmfp->mfp;
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	alloc_bhp = bhp = NULL;
	hp = NULL;
	b_incr = extending = ret = 0;

	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS		(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_CREATE:
			break;
		case DB_MPOOL_LAST:
			/* Get the last page number in the file. */
			R_LOCK(dbenv, dbmp->reginfo);
			*pgnoaddr = mfp->last_pgno;
			R_UNLOCK(dbenv, dbmp->reginfo);
			break;
		case DB_MPOOL_NEW:
			/*
			 * If always creating a page, skip the first search
			 * of the hash bucket.
			 */
			goto alloc;
		default:
			return (__db_ferr(dbenv, "memp_fget", 1));
		}
	}

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL &&
	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
		*(void **)addrp =
		    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
		++mfp->stat.st_map;
		return (0);
	}

hb_search:
	/*
	 * Determine the cache and hash bucket where this page lives and get
	 * local pointers to them.  Reset on each pass through this code, as
	 * the page number can change.
	 */
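	/*
	 * NCACHE picks the cache region for the page; NBUCKET picks the
	 * hash bucket within that cache's hash table.
	 */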
	n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
	c_mp = dbmp->reginfo[n_cache].primary;
	hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
	hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];

	/* Search the hash chain for the page. */
retry:	st_hsearch = 0;
	MUTEX_LOCK(dbenv, &hp->hash_mutex);
	for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/*
		 * Increment the reference count.  We may discard the hash
		 * bucket lock as we evaluate and/or read the buffer, so we
		 * need to ensure it doesn't move and its contents remain
		 * unchanged.
		 */
		if (bhp->ref == UINT16_T_MAX) {
			__db_err(dbenv,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = EINVAL;
			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
			goto err;
		}
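		/*
		 * b_incr records that we hold an incremented reference on
		 * bhp, so the err path knows to release it.
		 */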
		++bhp->ref;
		b_incr = 1;

		/*
		 * BH_LOCKED --
		 * I/O is in progress or sync is waiting on the buffer to write
		 * it.  Because we've incremented the buffer reference count,
		 * we know the buffer can't move.  Unlock the bucket lock, wait
		 * for the buffer to become available, reacquire the bucket.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
		    !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
			/*
			 * If someone is trying to sync this buffer and the
			 * buffer is hot, they may never get in.  Give up
			 * and try again.
			 */
			if (!first && bhp->ref_sync != 0) {
				--bhp->ref;
				b_incr = 0;
				MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
				__os_yield(dbenv, 1);
				goto retry;
			}

			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
			/*
			 * Explicitly yield the processor if not the first pass
			 * through this loop -- if we don't, we might run to the
			 * end of our CPU quantum as we will simply be swapping
			 * between the two locks.
			 */
			if (!first)
				__os_yield(dbenv, 1);

			MUTEX_LOCK(dbenv, &bhp->mutex);
			/* Wait for I/O to finish... */
			MUTEX_UNLOCK(dbenv, &bhp->mutex);
			MUTEX_LOCK(dbenv, &hp->hash_mutex);
		}

		++mfp->stat.st_cache_hit;
		break;
	}

	/*
	 * Update the hash bucket search statistics -- do now because our next
	 * search may be for a different bucket.
	 */
	++c_mp->stat.st_hash_searches;
	if (st_hsearch > c_mp->stat.st_hash_longest)
		c_mp->stat.st_hash_longest = st_hsearch;
	c_mp->stat.st_hash_examined += st_hsearch;

	/*
	 * There are 4 possible paths to this location:
	 *
	 * FIRST_MISS:
	 *	Didn't find the page in the hash bucket on our first pass:
	 *	bhp == NULL, alloc_bhp == NULL
	 *
	 * FIRST_FOUND:
	 *	Found the page in the hash bucket on our first pass:
	 *	bhp != NULL, alloc_bhp == NULL
	 *
	 * SECOND_FOUND:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and found the page in the hash bucket on
	 *	our second pass:
	 *	bhp != NULL, alloc_bhp != NULL
	 *
	 * SECOND_MISS:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and didn't find the page in the hash bucket
	 *	on our second pass:
	 *	bhp == NULL, alloc_bhp != NULL
	 */
	state = bhp == NULL ?
	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
	switch (state) {
	case FIRST_FOUND:
		/* We found the buffer in our first check -- we're done. */
		break;
	case FIRST_MISS:
		/*
		 * We didn't find the buffer in our first check.  Figure out
		 * if the page exists, and allocate structures so we can add
		 * the page to the buffer pool.
		 */
		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

alloc:		/*
		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
		 * it's an error to try and get a page past the end of file.
		 */
		COMPQUIET(n_cache, 0);

		extending = ret = 0;
		R_LOCK(dbenv, dbmp->reginfo);
		switch (flags) {
		case DB_MPOOL_NEW:
			extending = 1;
			*pgnoaddr = mfp->last_pgno + 1;
			break;
		case DB_MPOOL_CREATE:
			extending = *pgnoaddr > mfp->last_pgno;
			break;
		default:
			ret = *pgnoaddr > mfp->last_pgno ?
			    DB_PAGE_NOTFOUND : 0;
			break;
		}
		R_UNLOCK(dbenv, dbmp->reginfo);
		if (ret != 0)
			goto err;

		/*
		 * !!!
		 * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
		 * not yet been initialized.
		 */
		mf_offset = R_OFFSET(dbmp->reginfo, mfp);
		n_cache = NCACHE(mp, mf_offset, *pgnoaddr);

		/* Allocate a new buffer header and data space. */
		if ((ret = __memp_alloc(dbmp,
		    &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
			goto err;
#ifdef DIAGNOSTIC
		if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
			__db_err(dbenv,
			    "Error: buffer data is NOT size_t aligned");
			ret = EINVAL;
			goto err;
		}
#endif
		/*
		 * If we are extending the file, we'll need the region lock
		 * again.
		 */
		if (extending)
			R_LOCK(dbenv, dbmp->reginfo);

		/*
		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control.  (That guarantee is interesting
		 * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
		 * did not specify the page number, and so, may reasonably not
		 * have any way to lock the page outside of mpool.) Regardless,
		 * if we allocate the page, and some other thread of control
		 * requests the page by number, we will not detect that and the
		 * thread of control that allocated using DB_MPOOL_NEW may not
		 * have a chance to initialize the page.  (Note: we *could*
		 * detect this case if we set a flag in the buffer header which
		 * guaranteed that no gets of the page would succeed until the
		 * reference count went to 0, that is, until the creating
		 * thread put the page.)  What we do guarantee is that if two
		 * threads of control are both doing DB_MPOOL_NEW calls, they
		 * won't collide, that is, they won't both get the same page.
		 *
		 * There's a possibility that another thread allocated the page
		 * we were planning to allocate while we were off doing buffer
		 * allocation.  We detect that by making sure the page number
		 * we were going to use is still available.  If it's not, then
		 * we check to see if the next available page number hashes to
		 * the same mpool region as the old one -- if it does, we can
		 * continue, otherwise, we have to start over.
		 */
		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
			*pgnoaddr = mfp->last_pgno + 1;
			if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
				__db_shalloc_free(
				    dbmp->reginfo[n_cache].addr, alloc_bhp);
				/*
				 * flags == DB_MPOOL_NEW, so extending is set
				 * and we're holding the region locked.
				 */
				R_UNLOCK(dbenv, dbmp->reginfo);

				alloc_bhp = NULL;
				goto alloc;
			}
		}

		/*
		 * We released the region lock, so another thread might have
		 * extended the file.  Update the last_pgno and initialize
		 * the file, as necessary, if we extended the file.
		 */
		if (extending) {
#ifdef HAVE_FILESYSTEM_NOTZERO
			if (*pgnoaddr > mfp->last_pgno &&
			    __os_fs_notzero() &&
			    F_ISSET(dbmfp->fhp, DB_FH_VALID))
				ret = __memp_fs_notzero(
				    dbenv, dbmfp, mfp, pgnoaddr);
			else
				ret = 0;
#endif
			if (ret == 0 && *pgnoaddr > mfp->last_pgno)
				mfp->last_pgno = *pgnoaddr;

			R_UNLOCK(dbenv, dbmp->reginfo);
			if (ret != 0)
				goto err;
		}
		goto hb_search;
	case SECOND_FOUND:
		/*
		 * We allocated buffer space for the requested page, but then
		 * found the page in the buffer cache on our second check.
		 * That's OK -- we can use the page we found in the pool,
		 * unless DB_MPOOL_NEW is set.
		 *
		 * Free the allocated memory, we no longer need it.  Since we
		 * can't acquire the region lock while holding the hash bucket
		 * lock, we have to release the hash bucket and re-acquire it.
		 * That's OK, because we have the buffer pinned down.
		 */
		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
		R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
		alloc_bhp = NULL;
		R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
		MUTEX_LOCK(dbenv, &hp->hash_mutex);

		/*
		 * We can't use the page we found in the pool if DB_MPOOL_NEW
		 * was set.  (For details, see the above comment beginning
		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control".)  If DB_MPOOL_NEW is set, we
		 * release our pin on this particular buffer, and try to get
		 * another one.
		 */
		if (flags == DB_MPOOL_NEW) {
			--bhp->ref;
			b_incr = 0;
			goto alloc;
		}
		break;
	case SECOND_MISS:
		/*
		 * We allocated buffer space for the requested page, and found
		 * the page still missing on our second pass through the buffer
		 * cache.  Instantiate the page.
		 */
		bhp = alloc_bhp;
		alloc_bhp = NULL;

		/*
		 * Initialize all the BH and hash bucket fields so we can call
		 * __memp_bhfree if an error occurs.
		 *
		 * Append the buffer to the tail of the bucket list and update
		 * the hash bucket's priority.
		 */
		b_incr = 1;

		memset(bhp, 0, sizeof(BH));
		bhp->ref = 1;
		bhp->priority = UINT32_T_MAX;
		bhp->pgno = *pgnoaddr;
		bhp->mf_offset = mf_offset;
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
		hp->hash_priority =
		    SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;

		/* If we extended the file, make sure the page is never lost. */
		if (extending) {
			++hp->hash_page_dirty;
			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		}

		/*
		 * If we created the page, zero it out.  If we didn't create
		 * the page, read from the backing file.
		 *
		 * !!!
		 * DB_MPOOL_NEW doesn't call the pgin function.
		 *
		 * If DB_MPOOL_CREATE is used, then the application's pgin
		 * function has to be able to handle pages of 0's -- if it
		 * uses DB_MPOOL_NEW, it can detect all of its page creates,
		 * and not bother.
		 *
		 * If we're running in diagnostic mode, smash any bytes on the
		 * page that are unknown quantities for the caller.
		 *
		 * Otherwise, read the page into memory, optionally creating it
		 * if DB_MPOOL_CREATE is set.
		 */
		if (extending) {
			if (mfp->clear_len == 0)
				memset(bhp->buf, 0, mfp->stat.st_pagesize);
			else {
				memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif
			}

			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
				F_SET(bhp, BH_CALLPGIN);

			++mfp->stat.st_page_create;
		} else {
			F_SET(bhp, BH_TRASH);
			++mfp->stat.st_cache_miss;
		}

		/* Increment buffer count referenced by MPOOLFILE. */
		MUTEX_LOCK(dbenv, &mfp->mutex);
		++mfp->block_cnt;
		MUTEX_UNLOCK(dbenv, &mfp->mutex);

		/*
		 * Initialize the mutex.  This is the last initialization step,
		 * because it's the only one that can fail, and everything else
		 * must be set up or we can't jump to the err label because it
		 * will call __memp_bhfree.
		 */
		if ((ret = __db_mutex_setup(dbenv,
		    &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
			goto err;
	}

	DB_ASSERT(bhp->ref != 0);

	/*
	 * If we're the only reference, update buffer and bucket priorities.
	 * We may be about to release the hash bucket lock, and everything
	 * should be correct, first.  (We've already done this if we created
	 * the buffer, so there is no need to do it again.)
	 */
	if (state != SECOND_MISS && bhp->ref == 1) {
		bhp->priority = UINT32_T_MAX;
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
		hp->hash_priority =
		    SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
	}

	/*
	 * BH_TRASH --
	 * The buffer we found may need to be filled from the disk.
	 *
	 * It's possible for the read function to fail, which means we fail as
	 * well.  Note, the __memp_pgread() function discards and reacquires
	 * the hash lock, so the buffer must be pinned down so that it cannot
	 * move and its contents are unchanged.  Discard the buffer on failure
	 * unless another thread is waiting on our I/O to complete.  It's OK to
	 * leave the buffer around, as the waiting thread will see the BH_TRASH
	 * flag set, and will also attempt to discard it.  If there's a waiter,
	 * we need to decrement our reference count.
	 */
	if (F_ISSET(bhp, BH_TRASH) &&
	    (ret = __memp_pgread(dbmfp,
	    &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
		goto err;

	/*
	 * BH_CALLPGIN --
	 * The buffer was processed for being written to disk, and now has
	 * to be re-converted for use.
	 */
	if (F_ISSET(bhp, BH_CALLPGIN)) {
		if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
			goto err;
		F_CLR(bhp, BH_CALLPGIN);
	}

	MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

#ifdef DIAGNOSTIC
	/* Update the file's pinned reference count. */
	R_LOCK(dbenv, dbmp->reginfo);
	++dbmfp->pinref;
	R_UNLOCK(dbenv, dbmp->reginfo);

	/*
	 * We want to switch threads as often as possible, and at awkward
	 * times.  Yield every time we get a new page to ensure contention.
	 */
	if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
		__os_yield(dbenv, 1);
#endif

	*(void **)addrp = bhp->buf;
	return (0);

err:	/*
	 * Discard our reference.  If we're the only reference, discard the
	 * buffer entirely.  If we held a reference to a buffer, we are
	 * also still holding the hash bucket mutex.
	 */
	if (b_incr) {
		if (bhp->ref == 1)
			(void)__memp_bhfree(dbmp, hp, bhp, 1);
		else {
			--bhp->ref;
			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
		}
	}

	/* If alloc_bhp is set, free the memory. */
	if (alloc_bhp != NULL)
		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);

	return (ret);
}

#ifdef HAVE_FILESYSTEM_NOTZERO
/*
 * __memp_fs_notzero --
 *	Initialize the underlying allocated pages in the file.
 */
static int
__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
	DB_ENV *dbenv;
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	db_pgno_t *pgnoaddr;
{
	DB_IO db_io;
	u_int32_t i, npages;
	size_t nw;
	int ret;
	u_int8_t *page;
	char *fail;

	/*
	 * Pages allocated by writing pages past end-of-file are not zeroed
	 * on some systems.  Recovery could theoretically be fooled by a page
	 * showing up that contained garbage.  In order to avoid this, we
	 * have to write the pages out to disk, and flush them.  The reason
	 * for the flush is because if we don't sync, the allocation of another
	 * page subsequent to this one might reach the disk first, and if we
	 * crashed at the right moment, leave us with this page as the one
	 * allocated by writing a page past it in the file.
	 *
	 * Hash is the only access method that allocates groups of pages.  We
	 * know that it will use the existence of the last page in a group to
	 * signify that the entire group is OK; so, write all the pages but
	 * the last one in the group, flush them to disk, and then write the
	 * last one to disk and flush it.
	 */
	if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
		return (ret);

	db_io.fhp = dbmfp->fhp;
	db_io.mutexp = dbmfp->mutexp;
	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
	db_io.buf = page;

	npages = *pgnoaddr - mfp->last_pgno;
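	/* Write all the pages in the group except the last one. */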
	for (i = 1; i < npages; ++i) {
		db_io.pgno = mfp->last_pgno + i;
		if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
			fail = "write";
			goto err;
		}
	}
	if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
		fail = "sync";
		goto err;
	}
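
	/* Now write and flush the last page in the group. */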
	db_io.pgno = mfp->last_pgno + npages;
	if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
		fail = "write";
		goto err;
	}
	if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
		fail = "sync";
err:		__db_err(dbenv, "%s: %s failed for page %lu",
		    __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
	}

	__os_free(dbenv, page);
	return (ret);
}
#endif