/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include #include #endif #include "db_int.h" #include "dbinc/db_shash.h" #include "dbinc/mp.h" typedef struct { DB_MPOOL_HASH *bucket; u_int32_t priority; } HS; static void __memp_bad_buffer __P((DB_MPOOL_HASH *)); static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *)); /* * __memp_alloc -- * Allocate some space from a cache region. * * PUBLIC: int __memp_alloc __P((DB_MPOOL *, * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); */ int __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) DB_MPOOL *dbmp; REGINFO *memreg; MPOOLFILE *mfp; size_t len; roff_t *offsetp; void *retp; { BH *bhp; DB_ENV *dbenv; DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp; DB_MUTEX *mutexp; MPOOL *c_mp; MPOOLFILE *bh_mfp; size_t freed_space; u_int32_t buckets, buffers, high_priority, max_na, priority; int aggressive, ret; void *p; dbenv = dbmp->dbenv; c_mp = memreg->primary; dbht = R_ADDR(memreg, c_mp->htab); hp_end = &dbht[c_mp->htab_buckets]; buckets = buffers = 0; aggressive = 0; c_mp->stat.st_alloc++; /* * Get aggressive if we've tried to flush the number of pages as are * in the system without finding space. */ max_na = 5 * c_mp->htab_buckets; /* * If we're allocating a buffer, and the one we're discarding is the * same size, we don't want to waste the time to re-integrate it into * the shared memory free list. If the DB_MPOOLFILE argument isn't * NULL, we'll compare the underlying page sizes of the two buffers * before free-ing and re-allocating buffers. */ if (mfp != NULL) len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; R_LOCK(dbenv, memreg); /* * On every buffer allocation we update the buffer generation number * and check for wraparound. */ if (++c_mp->lru_count == UINT32_T_MAX) __memp_reset_lru(dbenv, memreg, c_mp); /* * Anything newer than 1/10th of the buffer pool is ignored during * allocation (unless allocation starts failing). */ DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10); high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10; /* * First we try to allocate from free memory. If that fails, scan the * buffer pool to find buffers with low priorities. We consider small * sets of hash buckets each time to limit the amount of work needing * to be done. This approximates LRU, but not very well. We either * find a buffer of the same size to use, or we will free 3 times what * we need in the hopes it will coalesce into a contiguous chunk of the * right size. In the latter case we branch back here and try again. */ alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) { if (mfp != NULL) c_mp->stat.st_pages++; R_UNLOCK(dbenv, memreg); found: if (offsetp != NULL) *offsetp = R_OFFSET(memreg, p); *(void **)retp = p; /* * Update the search statistics. * * We're not holding the region locked here, these statistics * can't be trusted. */ if (buckets != 0) { if (buckets > c_mp->stat.st_alloc_max_buckets) c_mp->stat.st_alloc_max_buckets = buckets; c_mp->stat.st_alloc_buckets += buckets; } if (buffers != 0) { if (buffers > c_mp->stat.st_alloc_max_pages) c_mp->stat.st_alloc_max_pages = buffers; c_mp->stat.st_alloc_pages += buffers; } return (0); } /* * We re-attempt the allocation every time we've freed 3 times what * we need. Reset our free-space counter. */ freed_space = 0; /* * Walk the hash buckets and find the next two with potentially useful * buffers. Free the buffer with the lowest priority from the buckets' * chains. */ for (hp_tmp = NULL;;) { /* Check for wrap around. */ hp = &dbht[c_mp->last_checked++]; if (hp >= hp_end) { c_mp->last_checked = 0; /* * If we've gone through all of the hash buckets, try * an allocation. If the cache is small, the old page * size is small, and the new page size is large, we * might have freed enough memory (but not 3 times the * memory). */ goto alloc; } /* * Skip empty buckets. * * We can check for empty buckets before locking as we * only care if the pointer is zero or non-zero. */ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) continue; /* * The failure mode is when there are too many buffers we can't * write or there's not enough memory in the system. We don't * have a metric for deciding if allocation has no possible way * to succeed, so we don't ever fail, we assume memory will be * available if we wait long enough. * * Get aggressive if we've tried to flush 5 times the number of * hash buckets as are in the system -- it's possible we have * been repeatedly trying to flush the same buffers, although * it's unlikely. Aggressive means: * * a: set a flag to attempt to flush high priority buffers as * well as other buffers. * b: sync the mpool to force out queue extent pages. While we * might not have enough space for what we want and flushing * is expensive, why not? * c: sleep for a second -- hopefully someone else will run and * free up some memory. Try to allocate memory too, in case * the other thread returns its memory to the region. * d: look at a buffer in every hash bucket rather than choose * the more preferable of two. * * !!! * This test ignores pathological cases like no buffers in the * system -- that shouldn't be possible. */ if ((++buckets % max_na) == 0) { aggressive = 1; R_UNLOCK(dbenv, memreg); (void)__memp_sync_int( dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); (void)__os_sleep(dbenv, 1, 0); R_LOCK(dbenv, memreg); goto alloc; } if (!aggressive) { /* Skip high priority buckets. */ if (hp->hash_priority > high_priority) continue; /* * Find two buckets and select the one with the lowest * priority. Performance testing shows that looking * at two improves the LRUness and looking at more only * does a little better. */ if (hp_tmp == NULL) { hp_tmp = hp; continue; } if (hp->hash_priority > hp_tmp->hash_priority) hp = hp_tmp; hp_tmp = NULL; } /* Remember the priority of the buffer we're looking for. */ priority = hp->hash_priority; /* Unlock the region and lock the hash bucket. */ R_UNLOCK(dbenv, memreg); mutexp = &hp->hash_mutex; MUTEX_LOCK(dbenv, mutexp); #ifdef DIAGNOSTIC __memp_check_order(hp); #endif /* * The lowest priority page is first in the bucket, as they are * maintained in sorted order. * * The buffer may have been freed or its priority changed while * we switched from the region lock to the hash lock. If so, * we have to restart. We will still take the first buffer on * the bucket's list, though, if it has a low enough priority. */ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL || bhp->ref != 0 || bhp->priority > priority) goto next_hb; buffers++; /* Find the associated MPOOLFILE. */ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); /* If the page is dirty, pin it and write it. */ ret = 0; if (F_ISSET(bhp, BH_DIRTY)) { ++bhp->ref; ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0); --bhp->ref; if (ret == 0) ++c_mp->stat.st_rw_evict; } else ++c_mp->stat.st_ro_evict; /* * If a write fails for any reason, we can't proceed. * * We released the hash bucket lock while doing I/O, so another * thread may have acquired this buffer and incremented the ref * count after we wrote it, in which case we can't have it. * * If there's a write error, avoid selecting this buffer again * by making it the bucket's least-desirable buffer. */ if (ret != 0 || bhp->ref != 0) { if (ret != 0 && aggressive) __memp_bad_buffer(hp); goto next_hb; } /* * Check to see if the buffer is the size we're looking for. * If so, we can simply reuse it. Else, free the buffer and * its space and keep looking. */ if (mfp != NULL && mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { __memp_bhfree(dbmp, hp, bhp, 0); p = bhp; goto found; } freed_space += __db_shsizeof(bhp); __memp_bhfree(dbmp, hp, bhp, 1); /* * Unlock this hash bucket and re-acquire the region lock. If * we're reaching here as a result of calling memp_bhfree, the * hash bucket lock has already been discarded. */ if (0) { next_hb: MUTEX_UNLOCK(dbenv, mutexp); } R_LOCK(dbenv, memreg); /* * Retry the allocation as soon as we've freed up sufficient * space. We're likely to have to coalesce of memory to * satisfy the request, don't try until it's likely (possible?) * we'll succeed. */ if (freed_space >= 3 * len) goto alloc; } /* NOTREACHED */ } /* * __memp_bad_buffer -- * Make the first buffer in a hash bucket the least desirable buffer. */ static void __memp_bad_buffer(hp) DB_MPOOL_HASH *hp; { BH *bhp, *t_bhp; u_int32_t priority; /* Remove the first buffer from the bucket. */ bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); /* * Find the highest priority buffer in the bucket. Buffers are * sorted by priority, so it's the last one in the bucket. * * XXX * Should use SH_TAILQ_LAST, but I think that macro is broken. */ priority = bhp->priority; for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh)) priority = t_bhp->priority; /* * Set our buffer's priority to be just as bad, and append it to * the bucket. */ bhp->priority = priority; SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); /* Reset the hash bucket's priority. */ hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; } /* * __memp_reset_lru -- * Reset the cache LRU counter. */ static void __memp_reset_lru(dbenv, memreg, c_mp) DB_ENV *dbenv; REGINFO *memreg; MPOOL *c_mp; { BH *bhp; DB_MPOOL_HASH *hp; int bucket; /* * Update the counter so all future allocations will start at the * bottom. */ c_mp->lru_count -= MPOOL_BASE_DECREMENT; /* Release the region lock. */ R_UNLOCK(dbenv, memreg); /* Adjust the priority of every buffer in the system. */ for (hp = R_ADDR(memreg, c_mp->htab), bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { /* * Skip empty buckets. * * We can check for empty buckets before locking as we * only care if the pointer is zero or non-zero. */ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) continue; MUTEX_LOCK(dbenv, &hp->hash_mutex); for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) if (bhp->priority != UINT32_T_MAX && bhp->priority > MPOOL_BASE_DECREMENT) bhp->priority -= MPOOL_BASE_DECREMENT; MUTEX_UNLOCK(dbenv, &hp->hash_mutex); } /* Reacquire the region lock. */ R_LOCK(dbenv, memreg); } #ifdef DIAGNOSTIC /* * __memp_check_order -- * Verify the priority ordering of a hash bucket chain. * * PUBLIC: #ifdef DIAGNOSTIC * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *)); * PUBLIC: #endif */ void __memp_check_order(hp) DB_MPOOL_HASH *hp; { BH *bhp; u_int32_t priority; /* * Assumes the hash bucket is locked. */ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) return; DB_ASSERT(bhp->priority == hp->hash_priority); for (priority = bhp->priority; (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL; priority = bhp->priority) DB_ASSERT(priority <= bhp->priority); } #endif