/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2002
 *      Sleepycat Software.  All rights reserved.
 */
#include "db_config.h"

#ifndef lint
static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <string.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"

typedef struct {
        DB_MPOOL_HASH *bucket;
        u_int32_t priority;
} HS;

static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));

/*
 * __memp_alloc --
 *      Allocate some space from a cache region.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
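/*
 * Illustrative (hypothetical) usage, following the public prototype above;
 * the variable names are examples only:
 *
 *      void *addr;
 *      roff_t offset;
 *      int ret;
 *
 *      if ((ret = __memp_alloc(dbmp,
 *          memreg, mfp, 0, &offset, &addr)) != 0)
 *              return (ret);
 *
 * When mfp is non-NULL, len is recomputed below from the file's page size,
 * and offsetp may be passed as NULL if the region offset isn't needed.
 */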
int
__memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
        DB_MPOOL *dbmp;
        REGINFO *memreg;
        MPOOLFILE *mfp;
        size_t len;
        roff_t *offsetp;
        void *retp;
{
        BH *bhp;
        DB_ENV *dbenv;
        DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
        DB_MUTEX *mutexp;
        MPOOL *c_mp;
        MPOOLFILE *bh_mfp;
        size_t freed_space;
        u_int32_t buckets, buffers, high_priority, max_na, priority;
        int aggressive, ret;
        void *p;

        dbenv = dbmp->dbenv;
        c_mp = memreg->primary;
        dbht = R_ADDR(memreg, c_mp->htab);
        hp_end = &dbht[c_mp->htab_buckets];

        buckets = buffers = 0;
        aggressive = 0;

        c_mp->stat.st_alloc++;

        /*
         * Get aggressive if we've tried to flush as many pages as there are
         * in the system without finding space.
         */
        max_na = 5 * c_mp->htab_buckets;

        /*
         * If we're allocating a buffer, and the one we're discarding is the
         * same size, we don't want to waste the time to re-integrate it into
         * the shared memory free list.  If the DB_MPOOLFILE argument isn't
         * NULL, we'll compare the underlying page sizes of the two buffers
         * before free-ing and re-allocating buffers.
         */
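        /*
         * The size computed below is sizeof(BH) less one byte -- the
         * sizeof(u_int8_t) term, presumably accounting for the one-byte
         * variable-length data placeholder that ends the buffer header --
         * plus the underlying page size, so the header and the page it
         * describes share a single contiguous allocation.
         */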
        if (mfp != NULL)
                len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;

        R_LOCK(dbenv, memreg);

        /*
         * On every buffer allocation we update the buffer generation number
         * and check for wraparound.
         */
        if (++c_mp->lru_count == UINT32_T_MAX)
                __memp_reset_lru(dbenv, memreg, c_mp);

        /*
         * Anything newer than 1/10th of the buffer pool is ignored during
         * allocation (unless allocation starts failing).
         */
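        /*
         * For example (purely illustrative numbers): with lru_count at
         * 100,000 and 10,000 pages in the pool, high_priority is 99,000,
         * so buffers referenced within the last 1,000 generations -- roughly
         * the newest tenth of the pool -- are skipped on a non-aggressive
         * pass.
         */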
        DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
        high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;

        /*
         * First we try to allocate from free memory.  If that fails, scan the
         * buffer pool to find buffers with low priorities.  We consider small
         * sets of hash buckets each time to limit the amount of work needing
         * to be done.  This approximates LRU, but not very well.  We either
         * find a buffer of the same size to use, or we will free 3 times what
         * we need in the hopes it will coalesce into a contiguous chunk of the
         * right size.  In the latter case we branch back here and try again.
         */
alloc:  if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
                if (mfp != NULL)
                        c_mp->stat.st_pages++;
                R_UNLOCK(dbenv, memreg);

found:          if (offsetp != NULL)
                        *offsetp = R_OFFSET(memreg, p);
                *(void **)retp = p;

                /*
                 * Update the search statistics.
                 *
                 * We're not holding the region locked here, so these
                 * statistics can't be trusted.
                 */
                if (buckets != 0) {
                        if (buckets > c_mp->stat.st_alloc_max_buckets)
                                c_mp->stat.st_alloc_max_buckets = buckets;
                        c_mp->stat.st_alloc_buckets += buckets;
                }
                if (buffers != 0) {
                        if (buffers > c_mp->stat.st_alloc_max_pages)
                                c_mp->stat.st_alloc_max_pages = buffers;
                        c_mp->stat.st_alloc_pages += buffers;
                }
                return (0);
        }

        /*
         * We re-attempt the allocation every time we've freed 3 times what
         * we need.  Reset our free-space counter.
         */
        freed_space = 0;

        /*
         * Walk the hash buckets and find the next two with potentially useful
         * buffers.  Free the buffer with the lowest priority from the buckets'
         * chains.
         */
        for (hp_tmp = NULL;;) {
                /* Check for wrap around. */
                hp = &dbht[c_mp->last_checked++];
                if (hp >= hp_end) {
                        c_mp->last_checked = 0;

                        /*
                         * If we've gone through all of the hash buckets, try
                         * an allocation.  If the cache is small, the old page
                         * size is small, and the new page size is large, we
                         * might have freed enough memory (but not 3 times the
                         * memory).
                         */
                        goto alloc;
                }

                /*
                 * Skip empty buckets.
                 *
                 * We can check for empty buckets before locking as we
                 * only care if the pointer is zero or non-zero.
                 */
                if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
                        continue;

                /*
                 * The failure mode is when there are too many buffers we can't
                 * write or there's not enough memory in the system.  We don't
                 * have a metric for deciding if allocation has no possible way
                 * to succeed, so we never fail; we assume memory will be
                 * available if we wait long enough.
                 *
                 * Get aggressive if we've tried to flush 5 times as many hash
                 * buckets as there are in the system -- it's possible we have
                 * been repeatedly trying to flush the same buffers, although
                 * it's unlikely.  Aggressive means:
                 *
                 * a: set a flag to attempt to flush high priority buffers as
                 *    well as other buffers.
                 * b: sync the mpool to force out queue extent pages.  While we
                 *    might not have enough space for what we want and flushing
                 *    is expensive, why not?
                 * c: sleep for a second -- hopefully someone else will run and
                 *    free up some memory.  Try to allocate memory too, in case
                 *    the other thread returns its memory to the region.
                 * d: look at a buffer in every hash bucket rather than choose
                 *    the more preferable of two.
                 *
                 * !!!
                 * This test ignores pathological cases like no buffers in the
                 * system -- that shouldn't be possible.
                 */
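                /*
                 * Since max_na was set to 5 * htab_buckets above, this check
                 * fires each time another 5 * htab_buckets non-empty buckets
                 * have been examined -- roughly every five sweeps of the
                 * hash table.
                 */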
                if ((++buckets % max_na) == 0) {
                        aggressive = 1;

                        R_UNLOCK(dbenv, memreg);

                        (void)__memp_sync_int(
                            dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);

                        (void)__os_sleep(dbenv, 1, 0);

                        R_LOCK(dbenv, memreg);
                        goto alloc;
                }

                if (!aggressive) {
                        /* Skip high priority buckets. */
                        if (hp->hash_priority > high_priority)
                                continue;

                        /*
                         * Find two buckets and select the one with the lowest
                         * priority.  Performance testing shows that looking
                         * at two improves the LRUness and looking at more only
                         * does a little better.
                         */
                        if (hp_tmp == NULL) {
                                hp_tmp = hp;
                                continue;
                        }
                        if (hp->hash_priority > hp_tmp->hash_priority)
                                hp = hp_tmp;
                        hp_tmp = NULL;
                }

                /* Remember the priority of the buffer we're looking for. */
                priority = hp->hash_priority;

                /* Unlock the region and lock the hash bucket. */
                R_UNLOCK(dbenv, memreg);
                mutexp = &hp->hash_mutex;
                MUTEX_LOCK(dbenv, mutexp);

#ifdef DIAGNOSTIC
                __memp_check_order(hp);
#endif
                /*
                 * The lowest priority page is first in the bucket, as they are
                 * maintained in sorted order.
                 *
                 * The buffer may have been freed or its priority changed while
                 * we switched from the region lock to the hash lock.  If so,
                 * we have to restart.  We will still take the first buffer on
                 * the bucket's list, though, if it has a low enough priority.
                 */
                if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
                    bhp->ref != 0 || bhp->priority > priority)
                        goto next_hb;

                buffers++;

                /* Find the associated MPOOLFILE. */
                bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

                /* If the page is dirty, pin it and write it. */
                ret = 0;
                if (F_ISSET(bhp, BH_DIRTY)) {
                        ++bhp->ref;
                        ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
                        --bhp->ref;
                        if (ret == 0)
                                ++c_mp->stat.st_rw_evict;
                } else
                        ++c_mp->stat.st_ro_evict;

                /*
                 * If a write fails for any reason, we can't proceed.
                 *
                 * We released the hash bucket lock while doing I/O, so another
                 * thread may have acquired this buffer and incremented the ref
                 * count after we wrote it, in which case we can't have it.
                 *
                 * If there's a write error, avoid selecting this buffer again
                 * by making it the bucket's least-desirable buffer.
                 */
                if (ret != 0 || bhp->ref != 0) {
                        if (ret != 0 && aggressive)
                                __memp_bad_buffer(hp);
                        goto next_hb;
                }

                /*
                 * Check to see if the buffer is the size we're looking for.
                 * If so, we can simply reuse it.  Else, free the buffer and
                 * its space and keep looking.
                 */
                if (mfp != NULL &&
                    mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
                        __memp_bhfree(dbmp, hp, bhp, 0);

                        p = bhp;
                        goto found;
                }

                freed_space += __db_shsizeof(bhp);
                __memp_bhfree(dbmp, hp, bhp, 1);

                /*
                 * Unlock this hash bucket and re-acquire the region lock.  If
                 * we're reaching here as a result of calling memp_bhfree, the
                 * hash bucket lock has already been discarded.
                 */
                if (0) {
next_hb:                MUTEX_UNLOCK(dbenv, mutexp);
                }
                R_LOCK(dbenv, memreg);

                /*
                 * Retry the allocation as soon as we've freed up sufficient
                 * space.  We're likely to have to coalesce the chunks of
                 * memory we've freed to satisfy the request, so don't try
                 * until it's likely (possible?) we'll succeed.
                 */
                if (freed_space >= 3 * len)
                        goto alloc;
        }
        /* NOTREACHED */
}

/*
 * __memp_bad_buffer --
 *      Make the first buffer in a hash bucket the least desirable buffer.
 */
static void
__memp_bad_buffer(hp)
        DB_MPOOL_HASH *hp;
{
        BH *bhp, *t_bhp;
        u_int32_t priority;

        /* Remove the first buffer from the bucket. */
        bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
        SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);

        /*
         * Find the highest priority buffer in the bucket.  Buffers are
         * sorted by priority, so it's the last one in the bucket.
         *
         * XXX
         * Should use SH_TAILQ_LAST, but I think that macro is broken.
         */
        priority = bhp->priority;
        for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
            t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh))
                priority = t_bhp->priority;

        /*
         * Set our buffer's priority to be just as bad, and append it to
         * the bucket.
         */
        bhp->priority = priority;
        SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);

        /* Reset the hash bucket's priority. */
        hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}

/*
 * __memp_reset_lru --
 *      Reset the cache LRU counter.
 */
static void
__memp_reset_lru(dbenv, memreg, c_mp)
        DB_ENV *dbenv;
        REGINFO *memreg;
        MPOOL *c_mp;
{
        BH *bhp;
        DB_MPOOL_HASH *hp;
        int bucket;

        /*
         * Update the counter so all future allocations will start at the
         * bottom.
         */
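        /*
         * The global counter here and (below) each buffer's priority drop
         * by the same MPOOL_BASE_DECREMENT, so the relative ordering of
         * buffer priorities is preserved across the reset; buffers whose
         * priority is UINT32_T_MAX or already at or below the decrement
         * are left unchanged by the loop below.
         */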
        c_mp->lru_count -= MPOOL_BASE_DECREMENT;

        /* Release the region lock. */
        R_UNLOCK(dbenv, memreg);

        /* Adjust the priority of every buffer in the system. */
        for (hp = R_ADDR(memreg, c_mp->htab),
            bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
                /*
                 * Skip empty buckets.
                 *
                 * We can check for empty buckets before locking as we
                 * only care if the pointer is zero or non-zero.
                 */
                if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
                        continue;

                MUTEX_LOCK(dbenv, &hp->hash_mutex);
                for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
                    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
                        if (bhp->priority != UINT32_T_MAX &&
                            bhp->priority > MPOOL_BASE_DECREMENT)
                                bhp->priority -= MPOOL_BASE_DECREMENT;
                MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
        }

        /* Reacquire the region lock. */
        R_LOCK(dbenv, memreg);
}

#ifdef DIAGNOSTIC
/*
 * __memp_check_order --
 *      Verify the priority ordering of a hash bucket chain.
 *
 * PUBLIC: #ifdef DIAGNOSTIC
 * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
 * PUBLIC: #endif
 */
void
__memp_check_order(hp)
        DB_MPOOL_HASH *hp;
{
        BH *bhp;
        u_int32_t priority;

        /*
         * Assumes the hash bucket is locked.
         */
        if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
                return;

        DB_ASSERT(bhp->priority == hp->hash_priority);

        for (priority = bhp->priority;
            (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL;
            priority = bhp->priority)
                DB_ASSERT(priority <= bhp->priority);
}
#endif