BDB 4.1.24

BitKeeper/deleted/.del-ex_access.wpj~3df6ae8c99bf7c5f: Delete: bdb/build_vxworks/ex_access/ex_access.wpj BitKeeper/deleted/.del-ex_btrec.wpj~a7622f1c6f432dc6: Delete: bdb/build_vxworks/ex_btrec/ex_btrec.wpj BitKeeper/deleted/.del-ex_dbclient.wpj~7345440f3b204cdd: Delete: bdb/build_vxworks/ex_dbclient/ex_dbclient.wpj BitKeeper/deleted/.del-ex_env.wpj~fbe1ab10b04e8b74: Delete: bdb/build_vxworks/ex_env/ex_env.wpj BitKeeper/deleted/.del-ex_mpool.wpj~4479cfd5c45f327d: Delete: bdb/build_vxworks/ex_mpool/ex_mpool.wpj BitKeeper/deleted/.del-ex_tpcb.wpj~f78093006e14bf41: Delete: bdb/build_vxworks/ex_tpcb/ex_tpcb.wpj BitKeeper/deleted/.del-db_buildall.dsp~bd749ff6da11682: Delete: bdb/build_win32/db_buildall.dsp BitKeeper/deleted/.del-cxx_app.cpp~ad8df8e0791011ed: Delete: bdb/cxx/cxx_app.cpp BitKeeper/deleted/.del-cxx_log.cpp~a50ff3118fe06952: Delete: bdb/cxx/cxx_log.cpp BitKeeper/deleted/.del-cxx_table.cpp~ecd751e79b055556: Delete: bdb/cxx/cxx_table.cpp BitKeeper/deleted/.del-namemap.txt~796a3acd3885d8fd: Delete: bdb/cxx/namemap.txt BitKeeper/deleted/.del-Design.fileop~3ca4da68f1727373: Delete: bdb/db/Design.fileop BitKeeper/deleted/.del-db185_int.h~61bee3736e7959ef: Delete: bdb/db185/db185_int.h BitKeeper/deleted/.del-acconfig.h~411e8854d67ad8b5: Delete: bdb/dist/acconfig.h BitKeeper/deleted/.del-mutex.m4~a13383cde18a64e1: Delete: bdb/dist/aclocal/mutex.m4 BitKeeper/deleted/.del-options.m4~b9d0ca637213750a: Delete: bdb/dist/aclocal/options.m4 BitKeeper/deleted/.del-programs.m4~3ce7890b47732b30: Delete: bdb/dist/aclocal/programs.m4 BitKeeper/deleted/.del-tcl.m4~f944e2db93c3b6db: Delete: bdb/dist/aclocal/tcl.m4 BitKeeper/deleted/.del-types.m4~59cae158c9a32cff: Delete: bdb/dist/aclocal/types.m4 BitKeeper/deleted/.del-script~d38f6d3a4f159cb4: Delete: bdb/dist/build/script BitKeeper/deleted/.del-configure.in~ac795a92c8fe049c: Delete: bdb/dist/configure.in BitKeeper/deleted/.del-ltconfig~66bbd007d8024af: Delete: bdb/dist/ltconfig BitKeeper/deleted/.del-rec_ctemp~a28554362534f00a: 
Delete: bdb/dist/rec_ctemp BitKeeper/deleted/.del-s_tcl~2ffe4326459fcd9f: Delete: bdb/dist/s_tcl BitKeeper/deleted/.del-.IGNORE_ME~d8148b08fa7d5d15: Delete: bdb/dist/template/.IGNORE_ME BitKeeper/deleted/.del-btree.h~179f2aefec1753d: Delete: bdb/include/btree.h BitKeeper/deleted/.del-cxx_int.h~6b649c04766508f8: Delete: bdb/include/cxx_int.h BitKeeper/deleted/.del-db.src~6b433ae615b16a8d: Delete: bdb/include/db.src BitKeeper/deleted/.del-db_185.h~ad8b373d9391d35c: Delete: bdb/include/db_185.h BitKeeper/deleted/.del-db_am.h~a714912b6b75932f: Delete: bdb/include/db_am.h BitKeeper/deleted/.del-db_cxx.h~fcafadf45f5d19e9: Delete: bdb/include/db_cxx.h BitKeeper/deleted/.del-db_dispatch.h~6844f20f7eb46904: Delete: bdb/include/db_dispatch.h BitKeeper/deleted/.del-db_int.src~419a3f48b6a01da7: Delete: bdb/include/db_int.src BitKeeper/deleted/.del-db_join.h~76f9747a42c3399a: Delete: bdb/include/db_join.h BitKeeper/deleted/.del-db_page.h~e302ca3a4db3abdc: Delete: bdb/include/db_page.h BitKeeper/deleted/.del-db_server_int.h~e1d20b6ba3bca1ab: Delete: bdb/include/db_server_int.h BitKeeper/deleted/.del-db_shash.h~5fbf2d696fac90f3: Delete: bdb/include/db_shash.h BitKeeper/deleted/.del-db_swap.h~1e60887550864a59: Delete: bdb/include/db_swap.h BitKeeper/deleted/.del-db_upgrade.h~c644eee73701fc8d: Delete: bdb/include/db_upgrade.h BitKeeper/deleted/.del-db_verify.h~b8d6c297c61f342e: Delete: bdb/include/db_verify.h BitKeeper/deleted/.del-debug.h~dc2b4f2cf27ccebc: Delete: bdb/include/debug.h BitKeeper/deleted/.del-hash.h~2aaa548b28882dfb: Delete: bdb/include/hash.h BitKeeper/deleted/.del-lock.h~a761c1b7de57b77f: Delete: bdb/include/lock.h BitKeeper/deleted/.del-log.h~ff20184238e35e4d: Delete: bdb/include/log.h BitKeeper/deleted/.del-mp.h~7e317597622f3411: Delete: bdb/include/mp.h BitKeeper/deleted/.del-mutex.h~d3ae7a2977a68137: Delete: bdb/include/mutex.h BitKeeper/deleted/.del-os.h~91867cc8757cd0e3: Delete: bdb/include/os.h BitKeeper/deleted/.del-os_jump.h~e1b939fa5151d4be: Delete: 
bdb/include/os_jump.h BitKeeper/deleted/.del-qam.h~6fad0c1b5723d597: Delete: bdb/include/qam.h BitKeeper/deleted/.del-queue.h~4c72c0826c123d5: Delete: bdb/include/queue.h BitKeeper/deleted/.del-region.h~513fe04d977ca0fc: Delete: bdb/include/region.h BitKeeper/deleted/.del-shqueue.h~525fc3e6c2025c36: Delete: bdb/include/shqueue.h BitKeeper/deleted/.del-tcl_db.h~c536fd61a844f23f: Delete: bdb/include/tcl_db.h BitKeeper/deleted/.del-txn.h~c8d94b221ec147e4: Delete: bdb/include/txn.h BitKeeper/deleted/.del-xa.h~ecc466493aae9d9a: Delete: bdb/include/xa.h BitKeeper/deleted/.del-DbRecoveryInit.java~756b52601a0b9023: Delete: bdb/java/src/com/sleepycat/db/DbRecoveryInit.java BitKeeper/deleted/.del-DbTxnRecover.java~74607cba7ab89d6d: Delete: bdb/java/src/com/sleepycat/db/DbTxnRecover.java BitKeeper/deleted/.del-lock_conflict.c~fc5e0f14cf597a2b: Delete: bdb/lock/lock_conflict.c BitKeeper/deleted/.del-log.src~53ac9e7b5cb023f2: Delete: bdb/log/log.src BitKeeper/deleted/.del-log_findckp.c~24287f008916e81f: Delete: bdb/log/log_findckp.c BitKeeper/deleted/.del-log_rec.c~d51711f2cac09297: Delete: bdb/log/log_rec.c BitKeeper/deleted/.del-log_register.c~b40bb4efac75ca15: Delete: bdb/log/log_register.c BitKeeper/deleted/.del-Design~b3d0f179f2767b: Delete: bdb/mp/Design BitKeeper/deleted/.del-os_finit.c~95dbefc6fe79b26c: Delete: bdb/os/os_finit.c BitKeeper/deleted/.del-os_abs.c~df95d1e7db81924: Delete: bdb/os_vxworks/os_abs.c BitKeeper/deleted/.del-os_finit.c~803b484bdb9d0122: Delete: bdb/os_vxworks/os_finit.c BitKeeper/deleted/.del-os_map.c~3a6d7926398b76d3: Delete: bdb/os_vxworks/os_map.c BitKeeper/deleted/.del-os_finit.c~19a227c6d3c78ad: Delete: bdb/os_win32/os_finit.c BitKeeper/deleted/.del-log-corruption.patch~1cf2ecc7c6408d5d: Delete: bdb/patches/log-corruption.patch BitKeeper/deleted/.del-Btree.pm~af6d0c5eaed4a98e: Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Btree.pm BitKeeper/deleted/.del-BerkeleyDB.pm~7244036d4482643: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pm 
BitKeeper/deleted/.del-BerkeleyDB.pod~e7b18fd6132448e3: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod BitKeeper/deleted/.del-Hash.pm~10292a26c06a5c95: Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Hash.pm BitKeeper/deleted/.del-BerkeleyDB.pod.P~79f76a1495eda203: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod.P BitKeeper/deleted/.del-BerkeleyDB.xs~80c99afbd98e392c: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.xs BitKeeper/deleted/.del-Changes~729c1891efa60de9: Delete: bdb/perl.BerkeleyDB/Changes BitKeeper/deleted/.del-MANIFEST~63a1e34aecf157a0: Delete: bdb/perl.BerkeleyDB/MANIFEST BitKeeper/deleted/.del-Makefile.PL~c68797707d8df87a: Delete: bdb/perl.BerkeleyDB/Makefile.PL BitKeeper/deleted/.del-README~5f2f579b1a241407: Delete: bdb/perl.BerkeleyDB/README BitKeeper/deleted/.del-Todo~dca3c66c193adda9: Delete: bdb/perl.BerkeleyDB/Todo BitKeeper/deleted/.del-config.in~ae81681e450e0999: Delete: bdb/perl.BerkeleyDB/config.in BitKeeper/deleted/.del-dbinfo~28ad67d83be4f68e: Delete: bdb/perl.BerkeleyDB/dbinfo BitKeeper/deleted/.del-mkconsts~543ab60669c7a04e: Delete: bdb/perl.BerkeleyDB/mkconsts BitKeeper/deleted/.del-mkpod~182c0ca54e439afb: Delete: bdb/perl.BerkeleyDB/mkpod BitKeeper/deleted/.del-5.004~e008cb5a48805543: Delete: bdb/perl.BerkeleyDB/patches/5.004 BitKeeper/deleted/.del-irix_6_5.pl~61662bb08afcdec8: Delete: bdb/perl.BerkeleyDB/hints/irix_6_5.pl BitKeeper/deleted/.del-solaris.pl~6771e7182394e152: Delete: bdb/perl.BerkeleyDB/hints/solaris.pl BitKeeper/deleted/.del-typemap~783b8f5295b05f3d: Delete: bdb/perl.BerkeleyDB/typemap BitKeeper/deleted/.del-5.004_01~6081ce2fff7b0bc: Delete: bdb/perl.BerkeleyDB/patches/5.004_01 BitKeeper/deleted/.del-5.004_02~87214eac35ad9e6: Delete: bdb/perl.BerkeleyDB/patches/5.004_02 BitKeeper/deleted/.del-5.004_03~9a672becec7cb40f: Delete: bdb/perl.BerkeleyDB/patches/5.004_03 BitKeeper/deleted/.del-5.004_04~e326cb51af09d154: Delete: bdb/perl.BerkeleyDB/patches/5.004_04 BitKeeper/deleted/.del-5.004_05~7ab457a1e41a92fe: Delete: 
bdb/perl.BerkeleyDB/patches/5.004_05 BitKeeper/deleted/.del-5.005~f9e2d59b5964cd4b: Delete: bdb/perl.BerkeleyDB/patches/5.005 BitKeeper/deleted/.del-5.005_01~3eb9fb7b5842ea8e: Delete: bdb/perl.BerkeleyDB/patches/5.005_01 BitKeeper/deleted/.del-5.005_02~67477ce0bef717cb: Delete: bdb/perl.BerkeleyDB/patches/5.005_02 BitKeeper/deleted/.del-5.005_03~c4c29a1fb21e290a: Delete: bdb/perl.BerkeleyDB/patches/5.005_03 BitKeeper/deleted/.del-5.6.0~e1fb9897d124ee22: Delete: bdb/perl.BerkeleyDB/patches/5.6.0 BitKeeper/deleted/.del-btree.t~e4a1a3c675ddc406: Delete: bdb/perl.BerkeleyDB/t/btree.t BitKeeper/deleted/.del-db-3.0.t~d2c60991d84558f2: Delete: bdb/perl.BerkeleyDB/t/db-3.0.t BitKeeper/deleted/.del-db-3.1.t~6ee88cd13f55e018: Delete: bdb/perl.BerkeleyDB/t/db-3.1.t BitKeeper/deleted/.del-db-3.2.t~f73b6461f98fd1cf: Delete: bdb/perl.BerkeleyDB/t/db-3.2.t BitKeeper/deleted/.del-destroy.t~cc6a2ae1980a2ecd: Delete: bdb/perl.BerkeleyDB/t/destroy.t BitKeeper/deleted/.del-env.t~a8604a4499c4bd07: Delete: bdb/perl.BerkeleyDB/t/env.t BitKeeper/deleted/.del-examples.t~2571b77c3cc75574: Delete: bdb/perl.BerkeleyDB/t/examples.t BitKeeper/deleted/.del-examples.t.T~8228bdd75ac78b88: Delete: bdb/perl.BerkeleyDB/t/examples.t.T BitKeeper/deleted/.del-examples3.t.T~66a186897a87026d: Delete: bdb/perl.BerkeleyDB/t/examples3.t.T BitKeeper/deleted/.del-examples3.t~fe3822ba2f2d7f83: Delete: bdb/perl.BerkeleyDB/t/examples3.t BitKeeper/deleted/.del-filter.t~f87b045c1b708637: Delete: bdb/perl.BerkeleyDB/t/filter.t BitKeeper/deleted/.del-hash.t~616bfb4d644de3a3: Delete: bdb/perl.BerkeleyDB/t/hash.t BitKeeper/deleted/.del-join.t~29fc39f74a83ca22: Delete: bdb/perl.BerkeleyDB/t/join.t BitKeeper/deleted/.del-mldbm.t~31f5015341eea040: Delete: bdb/perl.BerkeleyDB/t/mldbm.t BitKeeper/deleted/.del-queue.t~8f338034ce44a641: Delete: bdb/perl.BerkeleyDB/t/queue.t BitKeeper/deleted/.del-recno.t~d4ddbd3743add63e: Delete: bdb/perl.BerkeleyDB/t/recno.t BitKeeper/deleted/.del-strict.t~6885cdd2ea71ca2d: Delete: 
bdb/perl.BerkeleyDB/t/strict.t BitKeeper/deleted/.del-subdb.t~aab62a5d5864c603: Delete: bdb/perl.BerkeleyDB/t/subdb.t BitKeeper/deleted/.del-txn.t~65033b8558ae1216: Delete: bdb/perl.BerkeleyDB/t/txn.t BitKeeper/deleted/.del-unknown.t~f3710458682665e1: Delete: bdb/perl.BerkeleyDB/t/unknown.t BitKeeper/deleted/.del-Changes~436f74a5c414c65b: Delete: bdb/perl.DB_File/Changes BitKeeper/deleted/.del-DB_File.pm~ae0951c6c7665a82: Delete: bdb/perl.DB_File/DB_File.pm BitKeeper/deleted/.del-DB_File.xs~89e49a0b5556f1d8: Delete: bdb/perl.DB_File/DB_File.xs BitKeeper/deleted/.del-DB_File_BS~290fad5dbbb87069: Delete: bdb/perl.DB_File/DB_File_BS BitKeeper/deleted/.del-MANIFEST~90ee581572bdd4ac: Delete: bdb/perl.DB_File/MANIFEST BitKeeper/deleted/.del-Makefile.PL~ac0567bb5a377e38: Delete: bdb/perl.DB_File/Makefile.PL BitKeeper/deleted/.del-README~77e924a5a9bae6b3: Delete: bdb/perl.DB_File/README BitKeeper/deleted/.del-config.in~ab4c2792b86a810b: Delete: bdb/perl.DB_File/config.in BitKeeper/deleted/.del-dbinfo~461c43b30fab2cb: Delete: bdb/perl.DB_File/dbinfo BitKeeper/deleted/.del-dynixptx.pl~50dcddfae25d17e9: Delete: bdb/perl.DB_File/hints/dynixptx.pl BitKeeper/deleted/.del-typemap~55cffb3288a9e587: Delete: bdb/perl.DB_File/typemap BitKeeper/deleted/.del-version.c~a4df0e646f8b3975: Delete: bdb/perl.DB_File/version.c BitKeeper/deleted/.del-5.004_01~d6830d0082702af7: Delete: bdb/perl.DB_File/patches/5.004_01 BitKeeper/deleted/.del-5.004_02~78b082dc80c91031: Delete: bdb/perl.DB_File/patches/5.004_02 BitKeeper/deleted/.del-5.004~4411ec2e3c9e008b: Delete: bdb/perl.DB_File/patches/5.004 BitKeeper/deleted/.del-sco.pl~1e795fe14fe4dcfe: Delete: bdb/perl.DB_File/hints/sco.pl BitKeeper/deleted/.del-5.004_03~33f274648b160d95: Delete: bdb/perl.DB_File/patches/5.004_03 BitKeeper/deleted/.del-5.004_04~8f3d1b3cf18bb20a: Delete: bdb/perl.DB_File/patches/5.004_04 BitKeeper/deleted/.del-5.004_05~9c0f02e7331e142: Delete: bdb/perl.DB_File/patches/5.004_05 BitKeeper/deleted/.del-5.005~c2108cb2e3c8d951: 
Delete: bdb/perl.DB_File/patches/5.005 BitKeeper/deleted/.del-5.005_01~3b45e9673afc4cfa: Delete: bdb/perl.DB_File/patches/5.005_01 BitKeeper/deleted/.del-5.005_02~9fe5766bb02a4522: Delete: bdb/perl.DB_File/patches/5.005_02 BitKeeper/deleted/.del-5.005_03~ffa1c38c19ae72ea: Delete: bdb/perl.DB_File/patches/5.005_03 BitKeeper/deleted/.del-5.6.0~373be3a5ce47be85: Delete: bdb/perl.DB_File/patches/5.6.0 BitKeeper/deleted/.del-db-btree.t~3231595a1c241eb3: Delete: bdb/perl.DB_File/t/db-btree.t BitKeeper/deleted/.del-db-hash.t~7c4ad0c795c7fad2: Delete: bdb/perl.DB_File/t/db-hash.t BitKeeper/deleted/.del-db-recno.t~6c2d3d80b9ba4a50: Delete: bdb/perl.DB_File/t/db-recno.t BitKeeper/deleted/.del-db_server.sed~cdb00ebcd48a64e2: Delete: bdb/rpc_server/db_server.sed BitKeeper/deleted/.del-db_server_proc.c~d46c8f409c3747f4: Delete: bdb/rpc_server/db_server_proc.c BitKeeper/deleted/.del-db_server_svc.sed~3f5e59f334fa4607: Delete: bdb/rpc_server/db_server_svc.sed BitKeeper/deleted/.del-db_server_util.c~a809f3a4629acda: Delete: bdb/rpc_server/db_server_util.c BitKeeper/deleted/.del-log.tcl~ff1b41f1355b97d7: Delete: bdb/test/log.tcl BitKeeper/deleted/.del-mpool.tcl~b0df4dc1b04db26c: Delete: bdb/test/mpool.tcl BitKeeper/deleted/.del-mutex.tcl~52fd5c73a150565: Delete: bdb/test/mutex.tcl BitKeeper/deleted/.del-txn.tcl~c4ff071550b5446e: Delete: bdb/test/txn.tcl BitKeeper/deleted/.del-README~e800a12a5392010a: Delete: bdb/test/upgrade/README BitKeeper/deleted/.del-pack-2.6.6.pl~89d5076d758d3e98: Delete: bdb/test/upgrade/generate-2.X/pack-2.6.6.pl BitKeeper/deleted/.del-test-2.6.patch~4a52dc83d447547b: Delete: bdb/test/upgrade/generate-2.X/test-2.6.patch
2026-05-14 19:07:15 +02:00 · 2002-10-30 15:57:05 +04:00 · 2002-10-30 15:57:05 +04:00 · 155e78f014
commit 155e78f014
parent b8798d25ab
1191 changed files with 170446 additions and 57453 deletions
--- a/bdb/mp/Design
+++ b/bdb/mp/Design
@ -1,52 +0,0 @@
-$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
-
-There are three ways we do locking in the mpool code:
-
-Locking a handle mutex to provide concurrency for DB_THREAD operations.
-Locking the region mutex to provide mutual exclusion while reading and
-    writing structures in the shared region.
-Locking buffer header mutexes during I/O.
-
-The first will not be further described here.  We use the shared mpool
-region lock to provide mutual exclusion while reading/modifying all of
-the data structures, including the buffer headers.  We use a per-buffer
-header lock to wait on buffer I/O.  The order of locking is as follows:
-
-Searching for a buffer:
-    Acquire the region lock.
-    Find the buffer header.
-    Increment the reference count (guarantee the buffer stays).
-    While the BH_LOCKED flag is set (I/O is going on) {
-	Release the region lock.
-	    Explicitly yield the processor if it's not the first pass
-	    through this loop, otherwise, we can simply spin because
-	    we'll be simply switching between the two locks.
-	Request the buffer lock.
-	The I/O will complete...
-	Acquire the buffer lock.
-	Release the buffer lock.
-	Acquire the region lock.
-    }
-    Return the buffer.
-
-Reading/writing a buffer:
-    Acquire the region lock.
-    Find/create the buffer header.
-    If reading, increment the reference count (guarantee the buffer stays).
-    Set the BH_LOCKED flag.
-    Acquire the buffer lock (guaranteed not to block).
-    Release the region lock.
-    Do the I/O and/or initialize the buffer contents.
-    Release the buffer lock.
-	At this point, the buffer lock is available, but the logical
-	operation (flagged by BH_LOCKED) is not yet completed.  For
-	this reason, among others, threads checking the BH_LOCKED flag
-	must loop around their test.
-    Acquire the region lock.
-    Clear the BH_LOCKED flag.
-    Release the region lock.
-    Return/discard the buffer.
-
-Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
-not reacquired when a region lock is reacquired because they couldn't
-have been closed/discarded and because they never move in memory.
--- a/bdb/mp/mp_alloc.c
+++ b/bdb/mp/mp_alloc.c
@ -1,22 +1,31 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $";
+static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
+#include <string.h>
 #endif

 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+
+typedef struct {
+	DB_MPOOL_HASH *bucket;	/* Pointer to a cache hash bucket. */
+	u_int32_t priority;	/* Priority value paired with that bucket. */
+} HS;	/* NOTE(review): not referenced in the visible hunks -- confirm use. */
+
+static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
+static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));

 /*
 * __memp_alloc --
@ -34,14 +43,32 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
 	roff_t *offsetp;
 	void *retp;
 {
-	BH *bhp, *nbhp;
+	BH *bhp;
+	DB_ENV *dbenv;
+	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
+	DB_MUTEX *mutexp;
 	MPOOL *c_mp;
 	MPOOLFILE *bh_mfp;
-	size_t total;
-	int nomore, restart, ret, wrote;
+	size_t freed_space;
+	u_int32_t buckets, buffers, high_priority, max_na, priority;
+	int aggressive, ret;
 	void *p;

+	dbenv = dbmp->dbenv;
 	c_mp = memreg->primary;
+	dbht = R_ADDR(memreg, c_mp->htab);
+	hp_end = &dbht[c_mp->htab_buckets];
+
+	buckets = buffers = 0;
+	aggressive = 0;
+
+	c_mp->stat.st_alloc++;
+
+	/*
+	 * Get aggressive if we've looked at 5 times the number of hash
+	 * buckets as are in the system without finding space.
+	 */
+	max_na = 5 * c_mp->htab_buckets;

 	/*
 	 * If we're allocating a buffer, and the one we're discarding is the
@ -53,100 +80,363 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
 	if (mfp != NULL)
 		len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;

-	nomore = 0;
+	R_LOCK(dbenv, memreg);
+
+	/*
+	 * On every buffer allocation we update the buffer generation number
+	 * and check for wraparound.
+	 */
+	if (++c_mp->lru_count == UINT32_T_MAX)
+		__memp_reset_lru(dbenv, memreg, c_mp);
+
+	/*
+	 * Anything newer than 1/10th of the buffer pool is ignored during
+	 * allocation (unless allocation starts failing).
+	 */
+	DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
+	high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
+
+	/*
+	 * First we try to allocate from free memory.  If that fails, scan the
+	 * buffer pool to find buffers with low priorities.  We consider small
+	 * sets of hash buckets each time to limit the amount of work needing
+	 * to be done.  This approximates LRU, but not very well.  We either
+	 * find a buffer of the same size to use, or we will free 3 times what
+	 * we need in the hopes it will coalesce into a contiguous chunk of the
+	 * right size.  In the latter case we branch back here and try again.
+	 */
 alloc:	if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
-		if (offsetp != NULL)
+		if (mfp != NULL)
+			c_mp->stat.st_pages++;
+		R_UNLOCK(dbenv, memreg);
+
+found:		if (offsetp != NULL)
 			*offsetp = R_OFFSET(memreg, p);
 		*(void **)retp = p;
+
+		/*
+		 * Update the search statistics.
+		 *
+		 * We're not holding the region locked here, these statistics
+		 * can't be trusted.
+		 */
+		if (buckets != 0) {
+			if (buckets > c_mp->stat.st_alloc_max_buckets)
+				c_mp->stat.st_alloc_max_buckets = buckets;
+			c_mp->stat.st_alloc_buckets += buckets;
+		}
+		if (buffers != 0) {
+			if (buffers > c_mp->stat.st_alloc_max_pages)
+				c_mp->stat.st_alloc_max_pages = buffers;
+			c_mp->stat.st_alloc_pages += buffers;
+		}
 		return (0);
 	}
-	if (nomore) {
-		__db_err(dbmp->dbenv,
-	    "Unable to allocate %lu bytes from mpool shared region: %s\n",
-		    (u_long)len, db_strerror(ret));
-		return (ret);
-	}

-retry:	/* Find a buffer we can flush; pure LRU. */
-	restart = total = 0;
-	for (bhp =
-	    SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
-		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+	/*
+	 * We re-attempt the allocation every time we've freed 3 times what
+	 * we need.  Reset our free-space counter.
+	 */
+	freed_space = 0;

-		/* Ignore pinned or locked (I/O in progress) buffers. */
-		if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+	/*
+	 * Walk the hash buckets and find the next two with potentially useful
+	 * buffers.  Free the buffer with the lowest priority from the buckets'
+	 * chains.
+	 */
+	for (hp_tmp = NULL;;) {
+		/* Check for wrap around. */
+		hp = &dbht[c_mp->last_checked++];
+		if (hp >= hp_end) {
+			c_mp->last_checked = 0;
+
+			/*
+			 * If we've gone through all of the hash buckets, try
+			 * an allocation.  If the cache is small, the old page
+			 * size is small, and the new page size is large, we
+			 * might have freed enough memory (but not 3 times the
+			 * memory).
+			 */
+			goto alloc;
+		}
+
+		/*
+		 * Skip empty buckets.
+		 *
+		 * We can check for empty buckets before locking as we
+		 * only care if the pointer is zero or non-zero.
+		 */
+		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
 			continue;

+		/*
+		 * The failure mode is when there are too many buffers we can't
+		 * write or there's not enough memory in the system.  We don't
+		 * have a metric for deciding if allocation has no possible way
+		 * to succeed, so we don't ever fail, we assume memory will be
+		 * available if we wait long enough.
+		 *
+		 * Get aggressive if we've tried to flush 5 times the number of
+		 * hash buckets as are in the system -- it's possible we have
+		 * been repeatedly trying to flush the same buffers, although
+		 * it's unlikely.  Aggressive means:
+		 *
+		 * a: set a flag to attempt to flush high priority buffers as
+		 *    well as other buffers.
+		 * b: sync the mpool to force out queue extent pages.  While we
+		 *    might not have enough space for what we want and flushing
+		 *    is expensive, why not?
+		 * c: sleep for a second -- hopefully someone else will run and
+		 *    free up some memory.  Try to allocate memory too, in case
+		 *    the other thread returns its memory to the region.
+		 * d: look at a buffer in every hash bucket rather than choose
+		 *    the more preferable of two.
+		 *
+		 * !!!
+		 * This test ignores pathological cases like no buffers in the
+		 * system -- that shouldn't be possible.
+		 */
+		if ((++buckets % max_na) == 0) {
+			aggressive = 1;
+
+			R_UNLOCK(dbenv, memreg);
+
+			(void)__memp_sync_int(
+			    dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+
+			(void)__os_sleep(dbenv, 1, 0);
+
+			R_LOCK(dbenv, memreg);
+			goto alloc;
+		}
+
+		if (!aggressive) {
+			/* Skip high priority buckets. */
+			if (hp->hash_priority > high_priority)
+				continue;
+
+			/*
+			 * Find two buckets and select the one with the lowest
+			 * priority.  Performance testing shows that looking
+			 * at two improves the LRUness and looking at more only
+			 * does a little better.
+			 */
+			if (hp_tmp == NULL) {
+				hp_tmp = hp;
+				continue;
+			}
+			if (hp->hash_priority > hp_tmp->hash_priority)
+				hp = hp_tmp;
+			hp_tmp = NULL;
+		}
+
+		/* Remember the priority of the buffer we're looking for. */
+		priority = hp->hash_priority;
+
+		/* Unlock the region and lock the hash bucket. */
+		R_UNLOCK(dbenv, memreg);
+		mutexp = &hp->hash_mutex;
+		MUTEX_LOCK(dbenv, mutexp);
+
+#ifdef DIAGNOSTIC
+		__memp_check_order(hp);
+#endif
+		/*
+		 * The lowest priority page is first in the bucket, as they are
+		 * maintained in sorted order.
+		 *
+		 * The buffer may have been freed or its priority changed while
+		 * we switched from the region lock to the hash lock.  If so,
+		 * we have to restart.  We will still take the first buffer on
+		 * the bucket's list, though, if it has a low enough priority.
+		 */
+		if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
+		    bhp->ref != 0 || bhp->priority > priority)
+			goto next_hb;
+
+		buffers++;
+
 		/* Find the associated MPOOLFILE. */
 		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

-		/* Write the page if it's dirty. */
+		/* If the page is dirty, pin it and write it. */
+		ret = 0;
 		if (F_ISSET(bhp, BH_DIRTY)) {
 			++bhp->ref;
-			if ((ret = __memp_bhwrite(dbmp,
-			    bh_mfp, bhp, &restart, &wrote)) != 0)
-				return (ret);
+			ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
 			--bhp->ref;
-
-			/*
-			 * Another process may have acquired this buffer and
-			 * incremented the ref count after we wrote it.
-			 */
-			if (bhp->ref != 0)
-				goto retry;
-
-			/*
-			 * If we wrote the page, continue and free the buffer.
-			 * We don't have to rewalk the list to acquire the
-			 * buffer because it was never available for any other
-			 * process to modify it.
-			 *
-			 * If we didn't write the page, but we discarded and
-			 * reacquired the region lock, restart the list walk.
-			 *
-			 * If we neither wrote the buffer nor discarded the
-			 * region lock, continue down the buffer list.
-			 */
-			if (wrote)
+			if (ret == 0)
 				++c_mp->stat.st_rw_evict;
-			else {
-				if (restart)
-					goto retry;
-				continue;
-			}
 		} else
 			++c_mp->stat.st_ro_evict;

+		/*
+		 * If a write fails for any reason, we can't proceed.
+		 *
+		 * We released the hash bucket lock while doing I/O, so another
+		 * thread may have acquired this buffer and incremented the ref
+		 * count after we wrote it, in which case we can't have it.
+		 *
+		 * If there's a write error and we're being aggressive, avoid
+		 * selecting this buffer again by making it the bucket's
+		 * least-desirable buffer.
+		 */
+		if (ret != 0 || bhp->ref != 0) {
+			if (ret != 0 && aggressive)
+				__memp_bad_buffer(hp);
+			goto next_hb;
+		}
+
 		/*
 		 * Check to see if the buffer is the size we're looking for.
-		 * If it is, simply reuse it.
+		 * If so, we can simply reuse it.  Else, free the buffer and
+		 * its space and keep looking.
 		 */
 		if (mfp != NULL &&
 		    mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
-			__memp_bhfree(dbmp, bhp, 0);
+			__memp_bhfree(dbmp, hp, bhp, 0);

-			if (offsetp != NULL)
-				*offsetp = R_OFFSET(memreg, bhp);
-			*(void **)retp = bhp;
-			return (0);
+			p = bhp;
+			goto found;
 		}

-		/* Note how much space we've freed, and free the buffer. */
-		total += __db_shsizeof(bhp);
-		__memp_bhfree(dbmp, bhp, 1);
+		freed_space += __db_shsizeof(bhp);
+		__memp_bhfree(dbmp, hp, bhp, 1);

 		/*
-		 * Retry as soon as we've freed up sufficient space.  If we
-		 * have to coalesce of memory to satisfy the request, don't
-		 * try until it's likely (possible?) that we'll succeed.
+		 * Unlock this hash bucket and re-acquire the region lock. If
+		 * we're reaching here as a result of calling memp_bhfree, the
+		 * hash bucket lock has already been discarded.
 		 */
-		if (total >= 3 * len)
-			goto alloc;
+		if (0) {
+next_hb:		MUTEX_UNLOCK(dbenv, mutexp);
+		}
+		R_LOCK(dbenv, memreg);

-		/* Restart the walk if we discarded the region lock. */
-		if (restart)
-			goto retry;
+		/*
+		 * Retry the allocation as soon as we've freed up sufficient
+		 * space.  We're likely to have to coalesce memory to satisfy
+		 * the request, so don't try until it's likely (possible?)
+		 * we'll succeed.
+		 */
+		if (freed_space >= 3 * len)
+			goto alloc;
 	}
-	nomore = 1;
-	goto alloc;
+	/* NOTREACHED */
 }
+
+/*
+ * __memp_bad_buffer --
+ *	Make the first buffer in a hash bucket the least desirable buffer.
+ *
+ *	The head buffer is unlinked, assigned the priority of the last buffer
+ *	left on the chain, appended at the tail, and hp->hash_priority is
+ *	then reloaded from whichever buffer is now first.
+ *
+ *	NOTE(review): nothing here checks for an empty bucket or takes a
+ *	lock; this assumes the caller holds the hash bucket mutex and that
+ *	the bucket is non-empty -- confirm against the call in __memp_alloc.
+ */
+static void
+__memp_bad_buffer(hp)
+	DB_MPOOL_HASH *hp;
+{
+	BH *bhp, *t_bhp;
+	u_int32_t priority;
+
+	/* Remove the first buffer from the bucket. */
+	bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+	SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+
+	/*
+	 * Find the highest priority buffer in the bucket.  Buffers are
+	 * sorted by priority, so it's the last one in the bucket.  If the
+	 * bucket is now empty the loop body never runs and the buffer simply
+	 * keeps its own priority.
+	 *
+	 * XXX
+	 * Should use SH_TAILQ_LAST, but I think that macro is broken.
+	 */
+	priority = bhp->priority;
+	for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+	    t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh))
+		priority = t_bhp->priority;
+
+	/*
+	 * Set our buffer's priority to be just as bad, and append it to
+	 * the bucket.
+	 */
+	bhp->priority = priority;
+	SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+
+	/* Reset the hash bucket's priority. */
+	hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+}
+
+/*
+ * __memp_reset_lru --
+ *	Reset the cache LRU counter.
+ *
+ *	Subtracts MPOOL_BASE_DECREMENT from c_mp->lru_count, then walks every
+ *	hash bucket and subtracts the same amount from each buffer priority
+ *	that is greater than MPOOL_BASE_DECREMENT and not UINT32_T_MAX.
+ *
+ *	The region lock is released before the walk and reacquired before
+ *	returning, so it is expected to be held on entry (__memp_alloc calls
+ *	this right after taking it).  Each non-empty bucket is adjusted while
+ *	holding that bucket's hash_mutex.
+ *
+ *	NOTE(review): hp->hash_priority is not adjusted here, although
+ *	__memp_check_order asserts it equals the head buffer's priority --
+ *	confirm it is resynchronized elsewhere.  Buffers at or below
+ *	MPOOL_BASE_DECREMENT are left unchanged while larger ones shrink,
+ *	which could disturb a chain's sorted order -- verify.
+ */
+static void
+__memp_reset_lru(dbenv, memreg, c_mp)
+	DB_ENV *dbenv;
+	REGINFO *memreg;
+	MPOOL *c_mp;
+{
+	BH *bhp;
+	DB_MPOOL_HASH *hp;
+	int bucket;
+
+	/*
+	 * Update the counter so all future allocations will start at the
+	 * bottom.
+	 */
+	c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+	/* Release the region lock. */
+	R_UNLOCK(dbenv, memreg);
+
+	/* Adjust the priority of every buffer in the system. */
+	for (hp = R_ADDR(memreg, c_mp->htab),
+	    bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+		/*
+		 * Skip empty buckets.
+		 *
+		 * We can check for empty buckets before locking as we
+		 * only care if the pointer is zero or non-zero.
+		 */
+		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+			continue;
+
+		MUTEX_LOCK(dbenv, &hp->hash_mutex);
+		for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+			if (bhp->priority != UINT32_T_MAX &&
+			    bhp->priority > MPOOL_BASE_DECREMENT)
+				bhp->priority -= MPOOL_BASE_DECREMENT;
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+	}
+
+	/* Reacquire the region lock. */
+	R_LOCK(dbenv, memreg);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __memp_check_order --
+ *	Verify the priority ordering of a hash bucket chain.
+ *
+ *	Asserts (via DB_ASSERT) that the first buffer's priority equals
+ *	hp->hash_priority and that priorities never decrease from one buffer
+ *	to the next along the chain.  An empty bucket returns without any
+ *	checks.  Only compiled when DIAGNOSTIC is defined.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
+ * PUBLIC: #endif
+ */
+void
+__memp_check_order(hp)
+	DB_MPOOL_HASH *hp;
+{
+	BH *bhp;
+	u_int32_t priority;
+
+	/*
+	 * Assumes the hash bucket is locked.
+	 */
+	if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
+		return;
+
+	DB_ASSERT(bhp->priority == hp->hash_priority);
+
+	for (priority = bhp->priority;
+	    (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL;
+	    priority = bhp->priority)
+		DB_ASSERT(priority <= bhp->priority);
+}
+#endif
--- a/bdb/mp/mp_bh.c
+++ b/bdb/mp/mp_bh.c
@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_bh.c,v 11.71 2002/09/04 19:06:45 margo Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
@ -18,40 +18,41 @@ static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp
 #endif

 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-#include "log.h"
-#include "db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+#include "dbinc/log.h"
+#include "dbinc/db_page.h"

+static int __memp_pgwrite
+	   __P((DB_MPOOL *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
 static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));

 /*
 * __memp_bhwrite --
- *	Write the page associated with a given bucket header.
+ *	Write the page associated with a given buffer header.
 *
- * PUBLIC: int __memp_bhwrite
- * PUBLIC:     __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
+ * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
+ * PUBLIC:      DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
 */
 int
-__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
+__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
 	DB_MPOOL *dbmp;
+	DB_MPOOL_HASH *hp;
 	MPOOLFILE *mfp;
 	BH *bhp;
-	int *restartp, *wrotep;
+	int open_extents;
 {
+	DB_ENV *dbenv;
 	DB_MPOOLFILE *dbmfp;
 	DB_MPREG *mpreg;
-	int incremented, ret;
+	int local_open, incremented, ret;

-	if (restartp != NULL)
-		*restartp = 0;
-	if (wrotep != NULL)
-		*wrotep = 0;
-	incremented = 0;
+	dbenv = dbmp->dbenv;
+	local_open = incremented = 0;

 	/*
-	 * If the file has been removed or is a closed temporary file, Jump
-	 * right ahead and pretend that we've found the file we want-- the
+	 * If the file has been removed or is a closed temporary file, jump
+	 * right ahead and pretend that we've found the file we want -- the
 	 * page-write function knows how to handle the fact that we don't have
 	 * (or need!) any real file descriptor information.
 	 */
@ -66,52 +67,60 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
 	 * If we find a descriptor on the file that's not open for writing, we
 	 * try and upgrade it to make it writeable.  If that fails, we're done.
 	 */
-	MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
 	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
 	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
 		if (dbmfp->mfp == mfp) {
 			if (F_ISSET(dbmfp, MP_READONLY) &&
-			    __memp_upgrade(dbmp, dbmfp, mfp)) {
-				MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
-				return (0);
+			    !F_ISSET(dbmfp, MP_UPGRADE) &&
+			    (F_ISSET(dbmfp, MP_UPGRADE_FAIL) ||
+			    __memp_upgrade(dbmp, dbmfp, mfp))) {
+				MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+				return (EPERM);
 			}

 			/*
 			 * Increment the reference count -- see the comment in
-			 * memp_fclose().
+			 * __memp_fclose_int().
 			 */
 			++dbmfp->ref;
 			incremented = 1;
 			break;
 		}
-	MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
 	if (dbmfp != NULL)
 		goto found;

+	/*
+	 * !!!
+	 * It's the caller's choice if we're going to open extent files.
+	 */
+	if (!open_extents && F_ISSET(mfp, MP_EXTENT))
+		return (EPERM);
+
 	/*
 	 * !!!
 	 * Don't try to attach to temporary files.  There are two problems in
 	 * trying to do that.  First, if we have different privileges than the
 	 * process that "owns" the temporary file, we might create the backing
 	 * disk file such that the owning process couldn't read/write its own
-	 * buffers, e.g., memp_trickle() running as root creating a file owned
+	 * buffers, e.g., memp_trickle running as root creating a file owned
 	 * as root, mode 600.  Second, if the temporary file has already been
 	 * created, we don't have any way of finding out what its real name is,
 	 * and, even if we did, it was already unlinked (so that it won't be
 	 * left if the process dies horribly).  This decision causes a problem,
 	 * however: if the temporary file consumes the entire buffer cache,
 	 * and the owner doesn't flush the buffers to disk, we could end up
-	 * with resource starvation, and the memp_trickle() thread couldn't do
+	 * with resource starvation, and the memp_trickle thread couldn't do
 	 * anything about it.  That's a pretty unlikely scenario, though.
 	 *
-	 * Note that we should never get here when the temporary file
-	 * in question has already been closed in another process, in which
-	 * case it should be marked MP_DEADFILE.
+	 * Note we should never get here when the temporary file in question
+	 * has already been closed in another process, in which case it should
+	 * be marked MP_DEADFILE.
 	 */
-	if (F_ISSET(mfp, MP_TEMP)) {
-		DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE));
-		return (0);
-	}
+	if (F_ISSET(mfp, MP_TEMP))
+		return (EPERM);

 	/*
 	 * It's not a page from a file we've opened.  If the file requires
@ -120,14 +129,14 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
 	 * nothing we can do.
 	 */
 	if (mfp->ftype != 0) {
-		MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+		MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
 		for (mpreg = LIST_FIRST(&dbmp->dbregq);
 		    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
 			if (mpreg->ftype == mfp->ftype)
 				break;
-		MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+		MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
 		if (mpreg == NULL)
-			return (0);
+			return (EPERM);
 	}

 	/*
@ -138,17 +147,24 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
 	 * There's no negative cache, so we may repeatedly try and open files
 	 * that we have previously tried (and failed) to open.
 	 */
-	if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
-	    0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
-		return (0);
-
-found:	ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
-
-	if (incremented) {
-		MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
-		--dbmfp->ref;
-		MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+	if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
+		return (ret);
+	if ((ret = __memp_fopen_int(dbmfp, mfp,
+	    R_ADDR(dbmp->reginfo, mfp->path_off),
+	    0, 0, mfp->stat.st_pagesize)) != 0) {
+		(void)dbmfp->close(dbmfp, 0);
+		return (ret);
 	}
+	local_open = 1;
+
+found:	ret = __memp_pgwrite(dbmp, dbmfp, hp, bhp);
+
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+	if (incremented)
+		--dbmfp->ref;
+	else if (local_open)
+		F_SET(dbmfp, MP_FLUSH);
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);

 	return (ret);
 }
@ -157,11 +173,12 @@ found:	ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
 * __memp_pgread --
 *	Read a page from a file.
 *
- * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MUTEX *, BH *, int));
 */
 int
-__memp_pgread(dbmfp, bhp, can_create)
+__memp_pgread(dbmfp, mutexp, bhp, can_create)
 	DB_MPOOLFILE *dbmfp;
+	DB_MUTEX *mutexp;
 	BH *bhp;
 	int can_create;
 {
@ -169,171 +186,129 @@ __memp_pgread(dbmfp, bhp, can_create)
 	DB_ENV *dbenv;
 	DB_MPOOL *dbmp;
 	MPOOLFILE *mfp;
-	size_t len, pagesize;
-	size_t nr;
-	int created, ret;
+	size_t len, nr, pagesize;
+	int ret;

 	dbmp = dbmfp->dbmp;
 	dbenv = dbmp->dbenv;
 	mfp = dbmfp->mfp;
 	pagesize = mfp->stat.st_pagesize;

+	/* We should never be called with a dirty or a locked buffer. */
+	DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED));
+
+	/* Lock the buffer and swap the hash bucket lock for the buffer lock. */
 	F_SET(bhp, BH_LOCKED | BH_TRASH);
-	MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
-	R_UNLOCK(dbenv, dbmp->reginfo);
+	MUTEX_LOCK(dbenv, &bhp->mutex);
+	MUTEX_UNLOCK(dbenv, mutexp);

 	/*
 	 * Temporary files may not yet have been created.  We don't create
 	 * them now, we create them when the pages have to be flushed.
 	 */
 	nr = 0;
-	if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
-		/*
-		 * Ignore read errors if we have permission to create the page.
-		 * Assume that the page doesn't exist, and that we'll create it
-		 * when we write it out.
-		 *
-		 * XXX
-		 * Theoretically, we could overwrite a page of data if it were
-		 * possible for a file to be successfully opened for reading
-		 * and then for the read to fail.  Shouldn't ever happen, but
-		 * it might be worth checking to see if the offset is past the
-		 * known end-of-file.
-		 */
-		db_io.fhp = &dbmfp->fh;
+	if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
+		db_io.fhp = dbmfp->fhp;
 		db_io.mutexp = dbmfp->mutexp;
 		db_io.pagesize = db_io.bytes = pagesize;
 		db_io.pgno = bhp->pgno;
 		db_io.buf = bhp->buf;

-		ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr);
-	} else
-		ret = 0;
-
-	created = 0;
-	if (nr < pagesize) {
-		if (can_create)
-			created = 1;
-		else {
-			/*
-			 * If we had a short read, ret may be 0.  This may not
-			 * be an error -- in particular DB recovery processing
-			 * may request pages that have never been written to
-			 * disk, in which case we won't find the page.  So, the
-			 * caller must know how to handle the error.
-			 */
-			if (ret == 0)
-				ret = EIO;
+		/*
+		 * The page may not exist; if it doesn't, nr may well be 0,
+		 * but we expect the underlying OS calls not to return an
+		 * error code in this case.
+		 */
+		if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0)
 			goto err;
-		}
 	}

-	/*
-	 * Clear any bytes we didn't read that need to be cleared.  If we're
-	 * running in diagnostic mode, smash any bytes on the page that are
-	 * unknown quantities for the caller.
-	 */
-	if (nr != pagesize) {
+	if (nr < pagesize) {
+		/*
+		 * Don't output error messages for short reads.  In particular,
+		 * DB recovery processing may request pages never written to
+		 * disk or for which only some part has been written to disk,
+		 * in which case we won't find the page.  The caller must know
+		 * how to handle the error.
+		 */
+		if (can_create == 0) {
+			ret = DB_PAGE_NOTFOUND;
+			goto err;
+		}
+
+		/* Clear any bytes that need to be cleared. */
 		len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
-		if (nr < len)
-			memset(bhp->buf + nr, 0, len - nr);
-#ifdef DIAGNOSTIC
-		if (nr > len)
-			len = nr;
+		memset(bhp->buf, 0, len);
+
+#if defined(DIAGNOSTIC) || defined(UMRW)
+		/*
+		 * If we're running in diagnostic mode, corrupt any bytes on
+		 * the page that are unknown quantities for the caller.
+		 */
 		if (len < pagesize)
 			memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
 #endif
-	}
+		++mfp->stat.st_page_create;
+	} else
+		++mfp->stat.st_page_in;

 	/* Call any pgin function. */
 	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);

-	/* Unlock the buffer and reacquire the region lock. */
+	/* Unlock the buffer and reacquire the hash bucket lock. */
 err:	MUTEX_UNLOCK(dbenv, &bhp->mutex);
-	R_LOCK(dbenv, dbmp->reginfo);
+	MUTEX_LOCK(dbenv, mutexp);

 	/*
 	 * If no errors occurred, the data is now valid, clear the BH_TRASH
 	 * flag; regardless, clear the lock bit and let other threads proceed.
 	 */
 	F_CLR(bhp, BH_LOCKED);
-	if (ret == 0) {
+	if (ret == 0)
 		F_CLR(bhp, BH_TRASH);

-		/* Update the statistics. */
-		if (created)
-			++mfp->stat.st_page_create;
-		else
-			++mfp->stat.st_page_in;
-	}
-
 	return (ret);
 }

 /*
 * __memp_pgwrite --
 *	Write a page to a file.
- *
- * PUBLIC: int __memp_pgwrite
- * PUBLIC:     __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *));
 */
-int
-__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
+static int
+__memp_pgwrite(dbmp, dbmfp, hp, bhp)
 	DB_MPOOL *dbmp;
 	DB_MPOOLFILE *dbmfp;
+	DB_MPOOL_HASH *hp;
 	BH *bhp;
-	int *restartp, *wrotep;
 {
 	DB_ENV *dbenv;
 	DB_IO db_io;
 	DB_LSN lsn;
-	MPOOL *c_mp, *mp;
 	MPOOLFILE *mfp;
 	size_t nw;
-	int callpgin, dosync, ret, syncfail;
-	const char *fail;
+	int callpgin, ret;

 	dbenv = dbmp->dbenv;
-	mp = dbmp->reginfo[0].primary;
 	mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
-
-	if (restartp != NULL)
-		*restartp = 0;
-	if (wrotep != NULL)
-		*wrotep = 0;
-	callpgin = 0;
+	callpgin = ret = 0;

 	/*
-	 * Check the dirty bit -- this buffer may have been written since we
-	 * decided to write it.
+	 * We should never be called with a clean or trash buffer.
+	 * The sync code does call us with already locked buffers.
 	 */
-	if (!F_ISSET(bhp, BH_DIRTY)) {
-		if (wrotep != NULL)
-			*wrotep = 1;
-		return (0);
-	}
-
-	MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+	DB_ASSERT(F_ISSET(bhp, BH_DIRTY));
+	DB_ASSERT(!F_ISSET(bhp, BH_TRASH));

 	/*
-	 * If there were two writers, we may have just been waiting while the
-	 * other writer completed I/O on this buffer.  Check the dirty bit one
-	 * more time.
+	 * If we have not already traded the hash bucket lock for the buffer
+	 * lock, do so now.
 	 */
-	if (!F_ISSET(bhp, BH_DIRTY)) {
-		MUTEX_UNLOCK(dbenv, &bhp->mutex);
-
-		if (wrotep != NULL)
-			*wrotep = 1;
-		return (0);
+	if (!F_ISSET(bhp, BH_LOCKED)) {
+		F_SET(bhp, BH_LOCKED);
+		MUTEX_LOCK(dbenv, &bhp->mutex);
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 	}

-	F_SET(bhp, BH_LOCKED);
-	R_UNLOCK(dbenv, dbmp->reginfo);
-
-	if (restartp != NULL)
-		*restartp = 1;
-
 	/*
 	 * It's possible that the underlying file doesn't exist, either
 	 * because of an outright removal or because it was a temporary
@ -347,155 +322,122 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
 		goto file_dead;

 	/*
-	 * Ensure the appropriate log records are on disk.  If the page is
-	 * being written as part of a sync operation, the flush has already
-	 * been done, unless it was written by the application *after* the
-	 * sync was scheduled.
+	 * If the page is in a file for which we have LSN information, we have
+	 * to ensure the appropriate log records are on disk.
 	 */
-	if (LOGGING_ON(dbenv) &&
-	    (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) {
+	if (LOGGING_ON(dbenv) && mfp->lsn_off != -1) {
 		memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
-		if ((ret = log_flush(dbenv, &lsn)) != 0)
+		if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0)
 			goto err;
 	}
-	DB_ASSERT(!LOGGING_ON(dbenv) ||
-	   log_compare(&((LOG *)((DB_LOG *)
-	   dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0);
+
+#ifdef DIAGNOSTIC
+	/*
+	 * Verify write-ahead logging semantics.
+	 *
+	 * !!!
+	 * One special case.  There is a single field on the meta-data page,
+	 * the last-page-number-in-the-file field, for which we do not log
+	 * changes.  If the page was originally created in a database that
+	 * didn't have logging turned on, we can see a page marked dirty but
+	 * for which no corresponding log record has been written.  However,
+	 * the only way that a page can be created for which there isn't a
+	 * previous log record and valid LSN is when the page was created
+	 * without logging turned on, and so we check for that special-case
+	 * LSN value.
+	 */
+	if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf))) {
+		/*
+		 * There is a potential race here.  If we are in the midst of
+		 * switching log files, it's possible we could test against the
+		 * old file and the new offset in the log region's LSN.  If we
+		 * fail the first test, acquire the log mutex and check again.
+		 */
+		DB_LOG *dblp;
+		LOG *lp;
+
+		dblp = dbenv->lg_handle;
+		lp = dblp->reginfo.primary;
+		if (!IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
+		    log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
+			R_LOCK(dbenv, &dblp->reginfo);
+			DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0);
+			R_UNLOCK(dbenv, &dblp->reginfo);
+		}
+	}
+#endif

 	/*
 	 * Call any pgout function.  We set the callpgin flag so that we flag
 	 * that the contents of the buffer will need to be passed through pgin
 	 * before they are reused.
 	 */
-	if (mfp->ftype == 0)
-		ret = 0;
-	else {
+	if (mfp->ftype != 0) {
 		callpgin = 1;
 		if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
 			goto err;
 	}

 	/* Temporary files may not yet have been created. */
-	if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
+	if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
 		MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
-		if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) &&
-		    ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
-		    DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
-		    &dbmfp->fh, NULL)) != 0 ||
-		    !F_ISSET(&dbmfp->fh, DB_FH_VALID))) {
-			MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+		ret = F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 :
+		    __db_appname(dbenv, DB_APP_TMP, NULL,
+		    F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? DB_OSO_DIRECT : 0,
+		    dbmfp->fhp, NULL);
+		MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+		if (ret != 0) {
 			__db_err(dbenv,
 			    "unable to create temporary backing file");
 			goto err;
 		}
-		MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
 	}

 	/* Write the page. */
-	db_io.fhp = &dbmfp->fh;
+	db_io.fhp = dbmfp->fhp;
 	db_io.mutexp = dbmfp->mutexp;
 	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
 	db_io.pgno = bhp->pgno;
 	db_io.buf = bhp->buf;
 	if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
-		ret = __db_panic(dbenv, ret);
-		fail = "write";
-		goto syserr;
-	}
-	if (nw != mfp->stat.st_pagesize) {
-		ret = EIO;
-		fail = "write";
-		goto syserr;
+		__db_err(dbenv, "%s: write failed for page %lu",
+		    __memp_fn(dbmfp), (u_long)bhp->pgno);
+		goto err;
 	}
+	++mfp->stat.st_page_out;

+err:
 file_dead:
 	/*
 	 * !!!
 	 * Once we pass this point, dbmfp and mfp may be NULL, we may not have
 	 * a valid file reference.
 	 *
-	 * Unlock the buffer and reacquire the region lock.
+	 * Unlock the buffer and reacquire the hash bucket lock.
 	 */
 	MUTEX_UNLOCK(dbenv, &bhp->mutex);
-	R_LOCK(dbenv, dbmp->reginfo);
+	MUTEX_LOCK(dbenv, &hp->hash_mutex);

 	/*
-	 * Clean up the flags based on a successful write.
-	 *
 	 * If we rewrote the page, it will need processing by the pgin
 	 * routine before reuse.
 	 */
 	if (callpgin)
 		F_SET(bhp, BH_CALLPGIN);
-	F_CLR(bhp, BH_DIRTY | BH_LOCKED);

 	/*
-	 * If we write a buffer for which a checkpoint is waiting, update
-	 * the count of pending buffers (both in the mpool as a whole and
-	 * for this file).  If the count for this file goes to zero, set a
-	 * flag so we flush the writes.
+	 * Update the hash bucket statistics, reset the flags.
+	 * If we were successful, the page is no longer dirty.
 	 */
-	dosync = 0;
-	if (F_ISSET(bhp, BH_SYNC)) {
-		F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+	if (ret == 0) {
+		DB_ASSERT(hp->hash_page_dirty != 0);
+		--hp->hash_page_dirty;

-		--mp->lsn_cnt;
-		if (mfp != NULL)
-			dosync = --mfp->lsn_cnt == 0 ? 1 : 0;
+		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
 	}

-	/* Update the page clean/dirty statistics. */
-	c_mp = BH_TO_CACHE(dbmp, bhp);
-	++c_mp->stat.st_page_clean;
-	--c_mp->stat.st_page_dirty;
-
-	/* Update I/O statistics. */
-	if (mfp != NULL)
-		++mfp->stat.st_page_out;
-
-	/*
-	 * Do the sync after everything else has been updated, so any incoming
-	 * checkpoint doesn't see inconsistent information.
-	 *
-	 * XXX:
-	 * Don't lock the region around the sync, fsync(2) has no atomicity
-	 * issues.
-	 *
-	 * XXX:
-	 * We ignore errors from the sync -- it makes no sense to return an
-	 * error to the calling process, so set a flag causing the checkpoint
-	 * to be retried later.  There is a possibility, of course, that a
-	 * subsequent checkpoint was started and that we're going to force it
-	 * to fail.  That should be unlikely, and fixing it would be difficult.
-	 */
-	if (dosync) {
-		R_UNLOCK(dbenv, dbmp->reginfo);
-		syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0;
-		R_LOCK(dbenv, dbmp->reginfo);
-		if (syncfail)
-			F_SET(mp, MP_LSN_RETRY);
-	}
-
-	if (wrotep != NULL)
-		*wrotep = 1;
-
-	return (0);
-
-syserr:	__db_err(dbenv, "%s: %s failed for page %lu",
-	    __memp_fn(dbmfp), fail, (u_long)bhp->pgno);
-
-err:	/* Unlock the buffer and reacquire the region lock. */
-	MUTEX_UNLOCK(dbenv, &bhp->mutex);
-	R_LOCK(dbenv, dbmp->reginfo);
-
-	/*
-	 * Clean up the flags based on a failure.
-	 *
-	 * The page remains dirty but we remove our lock.  If we rewrote the
-	 * page, it will need processing by the pgin routine before reuse.
-	 */
-	if (callpgin)
-		F_SET(bhp, BH_CALLPGIN);
+	/* Regardless, clear any sync wait-for count and remove our lock. */
+	bhp->ref_sync = 0;
 	F_CLR(bhp, BH_LOCKED);

 	return (ret);
@ -514,15 +456,17 @@ __memp_pg(dbmfp, bhp, is_pgin)
 	int is_pgin;
 {
 	DBT dbt, *dbtp;
+	DB_ENV *dbenv;
 	DB_MPOOL *dbmp;
 	DB_MPREG *mpreg;
 	MPOOLFILE *mfp;
 	int ftype, ret;

 	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
 	mfp = dbmfp->mfp;

-	MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);

 	ftype = mfp->ftype;
 	for (mpreg = LIST_FIRST(&dbmp->dbregq);
@ -536,28 +480,28 @@ __memp_pg(dbmfp, bhp, is_pgin)
 			dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off);
 			dbtp = &dbt;
 		}
-		MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+		MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);

 		if (is_pgin) {
 			if (mpreg->pgin != NULL &&
-			    (ret = mpreg->pgin(dbmp->dbenv,
+			    (ret = mpreg->pgin(dbenv,
 			    bhp->pgno, bhp->buf, dbtp)) != 0)
 				goto err;
 		} else
 			if (mpreg->pgout != NULL &&
-			    (ret = mpreg->pgout(dbmp->dbenv,
+			    (ret = mpreg->pgout(dbenv,
 			    bhp->pgno, bhp->buf, dbtp)) != 0)
 				goto err;
 		break;
 	}

 	if (mpreg == NULL)
-		MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+		MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);

 	return (0);

-err:	MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
-	__db_err(dbmp->dbenv, "%s: %s failed for page %lu",
+err:	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+	__db_err(dbenv, "%s: %s failed for page %lu",
 	    __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
 	return (ret);
 }
@ -566,55 +510,78 @@ err:	MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
 * __memp_bhfree --
 *	Free a bucket header and its referenced data.
 *
- * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int));
+ * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, int));
 */
 void
-__memp_bhfree(dbmp, bhp, free_mem)
+__memp_bhfree(dbmp, hp, bhp, free_mem)
 	DB_MPOOL *dbmp;
+	DB_MPOOL_HASH *hp;
 	BH *bhp;
 	int free_mem;
 {
-	DB_HASHTAB *dbht;
+	DB_ENV *dbenv;
 	MPOOL *c_mp, *mp;
 	MPOOLFILE *mfp;
-	int n_bucket, n_cache;
+	u_int32_t n_cache;

+	/*
+	 * Assumes the hash bucket is locked and the MPOOL is not.
+	 */
+	dbenv = dbmp->dbenv;
 	mp = dbmp->reginfo[0].primary;
-	c_mp = BH_TO_CACHE(dbmp, bhp);
-	n_cache = NCACHE(mp, bhp->pgno);
-	n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno);
-	dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+	n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno);

-	/* Delete the buffer header from the hash bucket queue. */
-	SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh);
+	/*
+	 * Delete the buffer header from the hash bucket queue and reset
+	 * the hash bucket's priority, if necessary.
+	 */
+	SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+	if (bhp->priority == hp->hash_priority)
+		hp->hash_priority =
+		    SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ?
+		    0 : SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;

-	/* Delete the buffer header from the LRU queue. */
-	SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
+	/*
+	 * Discard the hash bucket's mutex, it's no longer needed, and
+	 * we don't want to be holding it when acquiring other locks.
+	 */
+	MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

-	/* Clear the mutex this buffer recorded */
-	__db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
-	    (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
 	/*
 	 * Find the underlying MPOOLFILE and decrement its reference count.
 	 * If this is its last reference, remove it.
 	 */
 	mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+	MUTEX_LOCK(dbenv, &mfp->mutex);
 	if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
 		__memp_mf_discard(dbmp, mfp);
+	else
+		MUTEX_UNLOCK(dbenv, &mfp->mutex);
+
+	R_LOCK(dbenv, &dbmp->reginfo[n_cache]);

 	/*
-	 * If we're not reusing it immediately, free the buffer header
+	 * Clear the mutex this buffer recorded; requires the region lock
+	 * be held.
+	 */
+	__db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
+	    (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
+
+	/*
+	 * If we're not reusing the buffer immediately, free the buffer header
 	 * and data for real.
 	 */
 	if (free_mem) {
-		--c_mp->stat.st_page_clean;
 		__db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp);
+		c_mp = dbmp->reginfo[n_cache].primary;
+		c_mp->stat.st_pages--;
 	}
+	R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
 }

 /*
 * __memp_upgrade --
- *	Upgrade a file descriptor from readonly to readwrite.
+ *	Upgrade a file descriptor from read-only to read-write.
 */
 static int
 __memp_upgrade(dbmp, dbmfp, mfp)
@ -622,41 +589,58 @@ __memp_upgrade(dbmp, dbmfp, mfp)
 	DB_MPOOLFILE *dbmfp;
 	MPOOLFILE *mfp;
 {
-	DB_FH fh;
+	DB_ENV *dbenv;
+	DB_FH *fhp, *tfhp;
 	int ret;
 	char *rpath;

-	/*
-	 * !!!
-	 * We expect the handle to already be locked.
-	 */
-
-	/* Check to see if we've already upgraded. */
-	if (F_ISSET(dbmfp, MP_UPGRADE))
-		return (0);
-
-	/* Check to see if we've already failed. */
-	if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
-		return (1);
+	dbenv = dbmp->dbenv;
+	fhp = NULL;
+	rpath = NULL;

 	/*
 	 * Calculate the real name for this file and try to open it read/write.
 	 * We know we have a valid pathname for the file because it's the only
 	 * way we could have gotten a file descriptor of any kind.
 	 */
-	if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA,
-	    NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
-		return (ret);
-	if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) {
+	if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0)
+		goto err;
+
+	if ((ret = __db_appname(dbenv, DB_APP_DATA,
+	    R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
+		goto err;
+
+	if (__os_open(dbenv, rpath,
+	    F_ISSET(mfp, MP_DIRECT) ? DB_OSO_DIRECT : 0, 0, fhp) != 0) {
 		F_SET(dbmfp, MP_UPGRADE_FAIL);
-		ret = 1;
-	} else {
-		/* Swap the descriptors and set the upgrade flag. */
-		(void)__os_closehandle(&dbmfp->fh);
-		dbmfp->fh = fh;
-		F_SET(dbmfp, MP_UPGRADE);
-		ret = 0;
+		goto err;
 	}
-	__os_freestr(rpath);
+
+	/*
+	 * Swap the descriptors and set the upgrade flag.
+	 *
+	 * XXX
+	 * There is a race here.  If another process schedules a read using the
+	 * existing file descriptor and is swapped out before making the system
+	 * call, this code could theoretically close the file descriptor out
+	 * from under it.  While it's very unlikely, this code should still be
+	 * rewritten.
+	 */
+	tfhp = dbmfp->fhp;
+	dbmfp->fhp = fhp;
+	fhp = tfhp;
+
+	(void)__os_closehandle(dbenv, fhp);
+	F_SET(dbmfp, MP_UPGRADE);
+
+	ret = 0;
+	if (0) {
+err:		ret = 1;
+	}
+	if (fhp != NULL)
+		__os_free(dbenv, fhp);
+	if (rpath != NULL)
+		__os_free(dbenv, rpath);
+
 	return (ret);
 }
--- a/bdb/mp/mp_fget.c
+++ b/bdb/mp/mp_fget.c
@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex
 #include <string.h>
 #endif

-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"

-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#ifdef HAVE_FILESYSTEM_NOTZERO
+static int __memp_fs_notzero
+    __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
 #endif

 /*
- * memp_fget --
+ * __memp_fget --
 *	Get a page from the file.
+ *
+ * PUBLIC: int __memp_fget
+ * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
 */
 int
-memp_fget(dbmfp, pgnoaddr, flags, addrp)
+__memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	DB_MPOOLFILE *dbmfp;
 	db_pgno_t *pgnoaddr;
 	u_int32_t flags;
 	void *addrp;
 {
-	BH *bhp;
+	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
+	BH *alloc_bhp, *bhp;
 	DB_ENV *dbenv;
 	DB_MPOOL *dbmp;
-	DB_HASHTAB *dbht;
+	DB_MPOOL_HASH *hp;
 	MPOOL *c_mp, *mp;
 	MPOOLFILE *mfp;
-	size_t n_bucket, n_cache, mf_offset;
-	u_int32_t st_hsearch;
-	int b_incr, first, ret;
+	roff_t mf_offset;
+	u_int32_t n_cache, st_hsearch;
+	int b_incr, extending, first, ret;
+
+	*(void **)addrp = NULL;

 	dbmp = dbmfp->dbmp;
 	dbenv = dbmp->dbenv;
-	mp = dbmp->reginfo[0].primary;
-	mfp = dbmfp->mfp;
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp));
-#endif

 	PANIC_CHECK(dbenv);

+	mp = dbmp->reginfo[0].primary;
+	mfp = dbmfp->mfp;
+	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+	alloc_bhp = bhp = NULL;
+	hp = NULL;
+	b_incr = extending = ret = 0;
+
 	/*
 	 * Validate arguments.
 	 *
@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	 * is to keep database files small.  It's sleazy as hell, but we catch
 	 * any attempt to actually write the file in memp_fput().
 	 */
-#define	OKFLAGS	\
-    (DB_MPOOL_CREATE | DB_MPOOL_LAST | \
-    DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT)
+#define	OKFLAGS		(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
 	if (flags != 0) {
 		if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
 			return (ret);

-		switch (flags & ~DB_MPOOL_EXTENT) {
+		switch (flags) {
 		case DB_MPOOL_CREATE:
+			break;
 		case DB_MPOOL_LAST:
+			/* Get the last page number in the file. */
+			if (flags == DB_MPOOL_LAST) {
+				R_LOCK(dbenv, dbmp->reginfo);
+				*pgnoaddr = mfp->last_pgno;
+				R_UNLOCK(dbenv, dbmp->reginfo);
+			}
+			break;
 		case DB_MPOOL_NEW:
-		case DB_MPOOL_NEW_GROUP:
-		case 0:
+			/*
+			 * If always creating a page, skip the first search
+			 * of the hash bucket.
+			 */
+			if (flags == DB_MPOOL_NEW)
+				goto alloc;
 			break;
 		default:
 			return (__db_ferr(dbenv, "memp_fget", 1));
 		}
 	}

-#ifdef DIAGNOSTIC
-	/*
-	 * XXX
-	 * We want to switch threads as often as possible.  Yield every time
-	 * we get a new page to ensure contention.
-	 */
-	if (DB_GLOBAL(db_pageyield))
-		__os_yield(dbenv, 1);
-#endif
-
-	/* Initialize remaining local variables. */
-	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
-	bhp = NULL;
-	st_hsearch = 0;
-	b_incr = ret = 0;
-
-	R_LOCK(dbenv, dbmp->reginfo);
-
-	/*
-	 * Check for the new, last or last + 1 page requests.
-	 *
-	 * Examine and update the file's last_pgno value.  We don't care if
-	 * the last_pgno value immediately changes due to another thread --
-	 * at this instant in time, the value is correct.  We do increment the
-	 * current last_pgno value if the thread is asking for a new page,
-	 * however, to ensure that two threads creating pages don't get the
-	 * same one.
-	 *
-	 * If we create a page, there is the potential that a page after it
-	 * in the file will be written before it will be written.  Recovery
-	 * depends on pages that are "created" in the file by subsequent pages
-	 * being written be zeroed out, not have random garbage.  Ensure that
-	 * the OS agrees.
-	 *
-	 * !!!
-	 * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs
-	 * to allocate contiguous groups of pages in order to do subdatabases.
-	 * We return the first page in the group, but the caller must put an
-	 * LSN on the *last* page and write it, otherwise after a crash we may
-	 * not create all of the pages we need to create.
-	 */
-	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
-		if (LF_ISSET(DB_MPOOL_NEW)) {
-			if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
-			    __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
-			    1, mfp->stat.st_pagesize)) != 0) {
-				R_UNLOCK(dbenv, dbmp->reginfo);
-				return (ret);
-			}
-			++mfp->last_pgno;
-		}
-		if (LF_ISSET(DB_MPOOL_NEW_GROUP)) {
-			if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
-			    __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
-			    (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) {
-				R_UNLOCK(dbenv, dbmp->reginfo);
-				return (ret);
-			}
-			mfp->last_pgno += *pgnoaddr;
-		}
-		*pgnoaddr = mfp->last_pgno;
-	}
-
-	/*
-	 * Determine the hash bucket where this page will live, and get local
-	 * pointers to the cache and its hash table.
-	 */
-	n_cache = NCACHE(mp, *pgnoaddr);
-	c_mp = dbmp->reginfo[n_cache].primary;
-	n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr);
-	dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
-
-	if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP))
-		goto alloc;
-
 	/*
 	 * If mmap'ing the file and the page is not past the end of the file,
 	 * just return a pointer.
@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	 * goes through the cache.  All pages previously returned will be safe,
 	 * as long as the correct locking protocol was observed.
 	 *
-	 * XXX
 	 * We don't discard the map because we don't know when all of the
 	 * pages will have been discarded from the process' address space.
 	 * It would be possible to do so by reference counting the open
 	 * pages from the mmap, but it's unclear to me that it's worth it.
 	 */
-	if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
-		if (*pgnoaddr > mfp->orig_last_pgno) {
-			/*
-			 * !!!
-			 * See the comment above about non-existent pages and
-			 * the hash access method.
-			 */
-			if (!LF_ISSET(DB_MPOOL_CREATE)) {
-				if (!LF_ISSET(DB_MPOOL_EXTENT))
-					__db_err(dbenv,
-					    "%s: page %lu doesn't exist",
-					    __memp_fn(dbmfp), (u_long)*pgnoaddr);
-				ret = EINVAL;
-				goto err;
-			}
-		} else {
-			*(void **)addrp =
-			    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
-			++mfp->stat.st_map;
-			goto done;
-		}
+	if (dbmfp->addr != NULL &&
+	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
+		*(void **)addrp =
+		    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+		++mfp->stat.st_map;
+		return (0);
 	}

+hb_search:
+	/*
+	 * Determine the cache and hash bucket where this page lives and get
+	 * local pointers to them.  Reset on each pass through this code, the
+	 * page number can change.
+	 */
+	n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+	c_mp = dbmp->reginfo[n_cache].primary;
+	hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+	hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
+
 	/* Search the hash chain for the page. */
-	for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh);
+retry:	st_hsearch = 0;
+	MUTEX_LOCK(dbenv, &hp->hash_mutex);
+	for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
 	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
 		++st_hsearch;
 		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
 			continue;

-		/* Increment the reference count. */
+		/*
+		 * Increment the reference count.  We may discard the hash
+		 * bucket lock as we evaluate and/or read the buffer, so we
+		 * need to ensure it doesn't move and its contents remain
+		 * unchanged.
+		 */
 		if (bhp->ref == UINT16_T_MAX) {
 			__db_err(dbenv,
 			    "%s: page %lu: reference count overflow",
 			    __memp_fn(dbmfp), (u_long)bhp->pgno);
 			ret = EINVAL;
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 			goto err;
 		}
-
-		/*
-		 * Increment the reference count.  We may discard the region
-		 * lock as we evaluate and/or read the buffer, so we need to
-		 * ensure that it doesn't move and that its contents remain
-		 * unchanged.
-		 */
 		++bhp->ref;
 		b_incr = 1;

 		/*
-		 * Any buffer we find might be trouble.
-		 *
 		 * BH_LOCKED --
-		 * I/O is in progress.  Because we've incremented the buffer
-		 * reference count, we know the buffer can't move.  Unlock
-		 * the region lock, wait for the I/O to complete, and reacquire
-		 * the region.
+		 * I/O is in progress or sync is waiting on the buffer to write
+		 * it.  Because we've incremented the buffer reference count,
+		 * we know the buffer can't move.  Unlock the bucket lock, wait
+		 * for the buffer to become available, reacquire the bucket.
 		 */
-		for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
-			R_UNLOCK(dbenv, dbmp->reginfo);
-
+		for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
+		    !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
 			/*
-			 * Explicitly yield the processor if it's not the first
-			 * pass through this loop -- if we don't, we might end
-			 * up running to the end of our CPU quantum as we will
-			 * simply be swapping between the two locks.
+			 * If someone is trying to sync this buffer and the
+			 * buffer is hot, they may never get in.  Give up
+			 * and try again.
+			 */
+			if (!first && bhp->ref_sync != 0) {
+				--bhp->ref;
+				b_incr = 0;
+				MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+				__os_yield(dbenv, 1);
+				goto retry;
+			}
+
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+			/*
+			 * Explicitly yield the processor if not the first pass
+			 * through this loop -- if we don't, we might run to the
+			 * end of our CPU quantum as we will simply be swapping
+			 * between the two locks.
 			 */
 			if (!first)
 				__os_yield(dbenv, 1);

-			MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+			MUTEX_LOCK(dbenv, &bhp->mutex);
 			/* Wait for I/O to finish... */
 			MUTEX_UNLOCK(dbenv, &bhp->mutex);
-			R_LOCK(dbenv, dbmp->reginfo);
-		}
-
-		/*
-		 * BH_TRASH --
-		 * The contents of the buffer are garbage.  Shouldn't happen,
-		 * and this read is likely to fail, but might as well try.
-		 */
-		if (F_ISSET(bhp, BH_TRASH))
-			goto reread;
-
-		/*
-		 * BH_CALLPGIN --
-		 * The buffer was converted so it could be written, and the
-		 * contents need to be converted again.
-		 */
-		if (F_ISSET(bhp, BH_CALLPGIN)) {
-			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
-				goto err;
-			F_CLR(bhp, BH_CALLPGIN);
+			MUTEX_LOCK(dbenv, &hp->hash_mutex);
 		}

 		++mfp->stat.st_cache_hit;
-		*(void **)addrp = bhp->buf;
-		goto done;
-	}
-
-alloc:	/* Allocate new buffer header and data space. */
-	if ((ret = __memp_alloc(dbmp,
-	    &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0)
-		goto err;
-
-	++c_mp->stat.st_page_clean;
-
-	/*
-	 * Initialize the BH fields so that we can call the __memp_bhfree
-	 * routine if an error occurs.
-	 */
-	memset(bhp, 0, sizeof(BH));
-	bhp->ref = 1;
-	bhp->pgno = *pgnoaddr;
-	bhp->mf_offset = mf_offset;
-
-	/* Increment the count of buffers referenced by this MPOOLFILE. */
-	++mfp->block_cnt;
-
-	/*
-	 * Prepend the bucket header to the head of the appropriate MPOOL
-	 * bucket hash list.  Append the bucket header to the tail of the
-	 * MPOOL LRU chain.
-	 */
-	SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh);
-	SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
-
-#ifdef DIAGNOSTIC
-	if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) {
-		__db_err(dbenv, "Internal error: BH data NOT size_t aligned.");
-		ret = EINVAL;
-		__memp_bhfree(dbmp, bhp, 1);
-		goto err;
-	}
-#endif
-
-	if ((ret = __db_shmutex_init(dbenv, &bhp->mutex,
-	    R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL,
-	    0, &dbmp->reginfo[n_cache],
-	    (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off)))
-	    != 0) {
-		__memp_bhfree(dbmp, bhp, 1);
-		goto err;
+		break;
 	}

 	/*
-	 * If we created the page, zero it out and continue.
-	 *
-	 * !!!
-	 * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
-	 * If DB_MPOOL_CREATE is used, then the application's pgin function
-	 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
-	 * it can detect all of its page creates, and not bother.
-	 *
-	 * If we're running in diagnostic mode, smash any bytes on the
-	 * page that are unknown quantities for the caller.
-	 *
-	 * Otherwise, read the page into memory, optionally creating it if
-	 * DB_MPOOL_CREATE is set.
+	 * Update the hash bucket search statistics -- do now because our next
+	 * search may be for a different bucket.
 	 */
-	if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
-		if (mfp->clear_len == 0)
-			memset(bhp->buf, 0, mfp->stat.st_pagesize);
-		else {
-			memset(bhp->buf, 0, mfp->clear_len);
-#ifdef DIAGNOSTIC
-			memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
-			    mfp->stat.st_pagesize - mfp->clear_len);
-#endif
-		}
+	++c_mp->stat.st_hash_searches;
+	if (st_hsearch > c_mp->stat.st_hash_longest)
+		c_mp->stat.st_hash_longest = st_hsearch;
+	c_mp->stat.st_hash_examined += st_hsearch;

-		++mfp->stat.st_page_create;
-	} else {
+	/*
+	 * There are 4 possible paths to this location:
+	 *
+	 * FIRST_MISS:
+	 *	Didn't find the page in the hash bucket on our first pass:
+	 *	bhp == NULL, alloc_bhp == NULL
+	 *
+	 * FIRST_FOUND:
+	 *	Found the page in the hash bucket on our first pass:
+	 *	bhp != NULL, alloc_bhp == NULL
+	 *
+	 * SECOND_FOUND:
+	 *	Didn't find the page in the hash bucket on the first pass,
+	 *	allocated space, and found the page in the hash bucket on
+	 *	our second pass:
+	 *	bhp != NULL, alloc_bhp != NULL
+	 *
+	 * SECOND_MISS:
+	 *	Didn't find the page in the hash bucket on the first pass,
+	 *	allocated space, and didn't find the page in the hash bucket
+	 *	on our second pass:
+	 *	bhp == NULL, alloc_bhp != NULL
+	 */
+	state = bhp == NULL ?
+	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
+	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
+	switch (state) {
+	case FIRST_FOUND:
+		/* We found the buffer in our first check -- we're done. */
+		break;
+	case FIRST_MISS:
 		/*
-		 * It's possible for the read function to fail, which means
-		 * that we fail as well.  Note, the __memp_pgread() function
-		 * discards the region lock, so the buffer must be pinned
-		 * down so that it cannot move and its contents are unchanged.
+		 * We didn't find the buffer in our first check.  Figure out
+		 * if the page exists, and allocate structures so we can add
+		 * the page to the buffer pool.
 		 */
-reread:		if ((ret = __memp_pgread(dbmfp,
-		    bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) {
-			/*
-			 * !!!
-			 * Discard the buffer unless another thread is waiting
-			 * on our I/O to complete.  Regardless, the header has
-			 * the BH_TRASH flag set.
-			 */
-			if (bhp->ref == 1)
-				__memp_bhfree(dbmp, bhp, 1);
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+alloc:		/*
+		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
+		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
+		 * it's an error to try and get a page past the end of file.
+		 */
+		COMPQUIET(n_cache, 0);
+
+		extending = ret = 0;
+		R_LOCK(dbenv, dbmp->reginfo);
+		switch (flags) {
+		case DB_MPOOL_NEW:
+			extending = 1;
+			*pgnoaddr = mfp->last_pgno + 1;
+			break;
+		case DB_MPOOL_CREATE:
+			extending = *pgnoaddr > mfp->last_pgno;
+			break;
+		default:
+			ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
+			break;
+		}
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * !!!
+		 * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
+		 * not yet been initialized.
+		 */
+		mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+		n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+
+		/* Allocate a new buffer header and data space. */
+		if ((ret = __memp_alloc(dbmp,
+		    &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
+			goto err;
+#ifdef DIAGNOSTIC
+		if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
+			__db_err(dbenv,
+			    "Error: buffer data is NOT size_t aligned");
+			ret = EINVAL;
 			goto err;
 		}
+#endif
+		/*
+		 * If we are extending the file, we'll need the region lock
+		 * again.
+		 */
+		if (extending)
+			R_LOCK(dbenv, dbmp->reginfo);

-		++mfp->stat.st_cache_miss;
+		/*
+		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
+		 * any other thread of control.  (That guarantee is interesting
+		 * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
+		 * did not specify the page number, and so, may reasonably not
+		 * have any way to lock the page outside of mpool.) Regardless,
+		 * if we allocate the page, and some other thread of control
+		 * requests the page by number, we will not detect that and the
+		 * thread of control that allocated using DB_MPOOL_NEW may not
+		 * have a chance to initialize the page.  (Note: we *could*
+		 * detect this case if we set a flag in the buffer header which
+		 * guaranteed that no gets of the page would succeed until the
+		 * reference count went to 0, that is, until the creating thread
+		 * put the page.)  What we do guarantee is that if two threads
+		 * of control are both doing DB_MPOOL_NEW calls, they won't
+		 * collide, that is, they won't both get the same page.
+		 *
+		 * There's a possibility that another thread allocated the page
+		 * we were planning to allocate while we were off doing buffer
+		 * allocation.  We detect that by making sure the page number
+		 * we were going to use is still available.  If it's not, then
+		 * we check to see if the next available page number hashes to
+		 * the same mpool region as the old one -- if it does, we can
+		 * continue, otherwise, we have to start over.
+		 */
+		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
+			*pgnoaddr = mfp->last_pgno + 1;
+			if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
+				__db_shalloc_free(
+				    dbmp->reginfo[n_cache].addr, alloc_bhp);
+				/*
+				 * flags == DB_MPOOL_NEW, so extending is set
+				 * and we're holding the region locked.
+				 */
+				R_UNLOCK(dbenv, dbmp->reginfo);
+
+				alloc_bhp = NULL;
+				goto alloc;
+			}
+		}
+
+		/*
+		 * We released the region lock, so another thread might have
+		 * extended the file.  Update the last_pgno and initialize
+		 * the file, as necessary, if we extended the file.
+		 */
+		if (extending) {
+#ifdef HAVE_FILESYSTEM_NOTZERO
+			if (*pgnoaddr > mfp->last_pgno &&
+			    __os_fs_notzero() &&
+			    F_ISSET(dbmfp->fhp, DB_FH_VALID))
+				ret = __memp_fs_notzero(
+				    dbenv, dbmfp, mfp, pgnoaddr);
+			else
+				ret = 0;
+#endif
+			if (ret == 0 && *pgnoaddr > mfp->last_pgno)
+				mfp->last_pgno = *pgnoaddr;
+
+			R_UNLOCK(dbenv, dbmp->reginfo);
+			if (ret != 0)
+				goto err;
+		}
+		goto hb_search;
+	case SECOND_FOUND:
+		/*
+		 * We allocated buffer space for the requested page, but then
+		 * found the page in the buffer cache on our second check.
+		 * That's OK -- we can use the page we found in the pool,
+		 * unless DB_MPOOL_NEW is set.
+		 *
+		 * Free the allocated memory, we no longer need it.  Since we
+		 * can't acquire the region lock while holding the hash bucket
+		 * lock, we have to release the hash bucket and re-acquire it.
+		 * That's OK, because we have the buffer pinned down.
+		 */
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+		R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
+		alloc_bhp = NULL;
+		R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
+		MUTEX_LOCK(dbenv, &hp->hash_mutex);
+
+		/*
+		 * We can't use the page we found in the pool if DB_MPOOL_NEW
+		 * was set.  (For details, see the above comment beginning
+		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
+		 * any other thread of control".)  If DB_MPOOL_NEW is set, we
+		 * release our pin on this particular buffer, and try to get
+		 * another one.
+		 */
+		if (flags == DB_MPOOL_NEW) {
+			--bhp->ref;
+			b_incr = 0;
+			goto alloc;
+		}
+		break;
+	case SECOND_MISS:
+		/*
+		 * We allocated buffer space for the requested page, and found
+		 * the page still missing on our second pass through the buffer
+		 * cache.  Instantiate the page.
+		 */
+		bhp = alloc_bhp;
+		alloc_bhp = NULL;
+
+		/*
+		 * Initialize all the BH and hash bucket fields so we can call
+		 * __memp_bhfree if an error occurs.
+		 *
+		 * Append the buffer to the tail of the bucket list and update
+		 * the hash bucket's priority.
+		 */
+		b_incr = 1;
+
+		memset(bhp, 0, sizeof(BH));
+		bhp->ref = 1;
+		bhp->priority = UINT32_T_MAX;
+		bhp->pgno = *pgnoaddr;
+		bhp->mf_offset = mf_offset;
+		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+		hp->hash_priority =
+		    SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+		/* If we extended the file, make sure the page is never lost. */
+		if (extending) {
+			++hp->hash_page_dirty;
+			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+		}
+
+		/*
+		 * If we created the page, zero it out.  If we didn't create
+		 * the page, read from the backing file.
+		 *
+		 * !!!
+		 * DB_MPOOL_NEW doesn't call the pgin function.
+		 *
+		 * If DB_MPOOL_CREATE is used, then the application's pgin
+		 * function has to be able to handle pages of 0's -- if it
+		 * uses DB_MPOOL_NEW, it can detect all of its page creates,
+		 * and not bother.
+		 *
+		 * If we're running in diagnostic mode, smash any bytes on the
+		 * page that are unknown quantities for the caller.
+		 *
+		 * Otherwise, read the page into memory, optionally creating it
+		 * if DB_MPOOL_CREATE is set.
+		 */
+		if (extending) {
+			if (mfp->clear_len == 0)
+				memset(bhp->buf, 0, mfp->stat.st_pagesize);
+			else {
+				memset(bhp->buf, 0, mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
+				    mfp->stat.st_pagesize - mfp->clear_len);
+#endif
+			}
+
+			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
+				F_SET(bhp, BH_CALLPGIN);
+
+			++mfp->stat.st_page_create;
+		} else {
+			F_SET(bhp, BH_TRASH);
+			++mfp->stat.st_cache_miss;
+		}
+
+		/* Increment buffer count referenced by MPOOLFILE. */
+		MUTEX_LOCK(dbenv, &mfp->mutex);
+		++mfp->block_cnt;
+		MUTEX_UNLOCK(dbenv, &mfp->mutex);
+
+		/*
+		 * Initialize the mutex.  This is the last initialization step,
+		 * because it's the only one that can fail, and everything else
+		 * must be set up or we can't jump to the err label because it
+		 * will call __memp_bhfree.
+		 */
+		if ((ret = __db_mutex_setup(dbenv,
+		    &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
+			goto err;
+	}
+
+	DB_ASSERT(bhp->ref != 0);
+
+	/*
+	 * If we're the only reference, update buffer and bucket priorities.
+	 * We may be about to release the hash bucket lock, and everything
+	 * should be correct, first.  (We've already done this if we created
+	 * the buffer, so there is no need to do it again.)
+	 */
+	if (state != SECOND_MISS && bhp->ref == 1) {
+		bhp->priority = UINT32_T_MAX;
+		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+		hp->hash_priority =
+		    SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
 	}

 	/*
-	 * If we're returning a page after our current notion of the last-page,
-	 * update our information.  Note, there's no way to un-instantiate this
-	 * page, it's going to exist whether it's returned to us dirty or not.
+	 * BH_TRASH --
+	 * The buffer we found may need to be filled from the disk.
+	 *
+	 * It's possible for the read function to fail, which means we fail as
+	 * well.  Note, the __memp_pgread() function discards and reacquires
+	 * the hash lock, so the buffer must be pinned down so that it cannot
+	 * move and its contents are unchanged.  Discard the buffer on failure
+	 * unless another thread is waiting on our I/O to complete.  It's OK to
+	 * leave the buffer around, as the waiting thread will see the BH_TRASH
+	 * flag set, and will also attempt to discard it.  If there's a waiter,
+	 * we need to decrement our reference count.
 	 */
-	if (bhp->pgno > mfp->last_pgno)
-		mfp->last_pgno = bhp->pgno;
+	if (F_ISSET(bhp, BH_TRASH) &&
+	    (ret = __memp_pgread(dbmfp,
+	    &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
+		goto err;

-	*(void **)addrp = bhp->buf;
-
-done:	/* Update the chain search statistics. */
-	if (st_hsearch) {
-		++c_mp->stat.st_hash_searches;
-		if (st_hsearch > c_mp->stat.st_hash_longest)
-			c_mp->stat.st_hash_longest = st_hsearch;
-		c_mp->stat.st_hash_examined += st_hsearch;
+	/*
+	 * BH_CALLPGIN --
+	 * The buffer was processed for being written to disk, and now has
+	 * to be re-converted for use.
+	 */
+	if (F_ISSET(bhp, BH_CALLPGIN)) {
+		if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+			goto err;
+		F_CLR(bhp, BH_CALLPGIN);
 	}

-	++dbmfp->pinref;
+	MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

+#ifdef DIAGNOSTIC
+	/* Update the file's pinned reference count. */
+	R_LOCK(dbenv, dbmp->reginfo);
+	++dbmfp->pinref;
 	R_UNLOCK(dbenv, dbmp->reginfo);

+	/*
+	 * We want to switch threads as often as possible, and at awkward
+	 * times.  Yield every time we get a new page to ensure contention.
+	 */
+	if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+		__os_yield(dbenv, 1);
+#endif
+
+	*(void **)addrp = bhp->buf;
 	return (0);

-err:	/* Discard our reference. */
-	if (b_incr)
-		--bhp->ref;
-	R_UNLOCK(dbenv, dbmp->reginfo);
+err:	/*
+	 * Discard our reference.  If we're the only reference, discard the
+	 * buffer entirely.  If we held a reference to a buffer, we are
+	 * also still holding the hash bucket mutex.
+	 */
+	if (b_incr) {
+		if (bhp->ref == 1)
+			(void)__memp_bhfree(dbmp, hp, bhp, 1);
+		else {
+			--bhp->ref;
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+		}
+	}
+
+	/* If alloc_bhp is set, free the memory. */
+	if (alloc_bhp != NULL)
+		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);

-	*(void **)addrp = NULL;
 	return (ret);
 }
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+/*
+ * __memp_fs_notzero --
+ *	Write and sync the file's pages last_pgno + 1 through *pgnoaddr.
+ */
+static int
+__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
+	DB_ENV *dbenv;
+	DB_MPOOLFILE *dbmfp;
+	MPOOLFILE *mfp;
+	db_pgno_t *pgnoaddr;
+{
+	DB_IO db_io;
+	u_int32_t i, npages;
+	size_t nw;
+	int ret;
+	u_int8_t *page;
+	char *fail;
+
+	/*
+	 * Pages allocated by writing pages past end-of-file are not zeroed,
+	 * on some systems.  Recovery could theoretically be fooled by a page
+	 * showing up that contained garbage.  In order to avoid this, we
+	 * have to write the pages out to disk, and flush them.  We flush
+	 * because, if we don't sync, the allocation of another page
+	 * subsequent to this one might reach the disk first and, if we
+	 * crashed at the right moment, leave us with this page as the one
+	 * allocated by writing a page past it in the file.
+	 *
+	 * Hash is the only access method that allocates groups of pages.  We
+	 * know that it will use the existence of the last page in a group to
+	 * signify that the entire group is OK; so, write all the pages but
+	 * the last one in the group, flush them to disk, and then write the
+	 * last one to disk and flush it.
+	 */
+	if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
+		return (ret);
+
+	db_io.fhp = dbmfp->fhp;
+	db_io.mutexp = dbmfp->mutexp;
+	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
+	db_io.buf = page;
+
+	npages = *pgnoaddr - mfp->last_pgno;
+	for (i = 1; i < npages; ++i) {		/* All but the last page. */
+		db_io.pgno = mfp->last_pgno + i;
+		if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+			fail = "write";
+			goto err;
+		}
+	}
+	if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+		fail = "sync";
+		goto err;
+	}
+
+	db_io.pgno = mfp->last_pgno + npages;	/* The last page, *pgnoaddr. */
+	if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+		fail = "write";
+		goto err;
+	}
+	if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+		fail = "sync";
+err:		__db_err(dbenv, "%s: %s failed for page %lu",
+		    __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
+	}
+
+	__os_free(dbenv, page);
+	return (ret);
+}
+#endif
--- a/bdb/mp/mp_fopen.c
+++ b/bdb/mp/mp_fopen.c
--- a/bdb/mp/mp_fput.c
+++ b/bdb/mp/mp_fput.c
@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_fput.c,v 11.36 2002/08/09 19:04:11 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
@ -15,43 +15,32 @@ static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Ex

 #endif

-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"

 /*
- * memp_fput --
+ * __memp_fput --
 *	Mpool file put function.
+ *
+ * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t));
 */
 int
-memp_fput(dbmfp, pgaddr, flags)
+__memp_fput(dbmfp, pgaddr, flags)
 	DB_MPOOLFILE *dbmfp;
 	void *pgaddr;
 	u_int32_t flags;
 {
-	BH *bhp;
+	BH *argbhp, *bhp, *prev;
 	DB_ENV *dbenv;
 	DB_MPOOL *dbmp;
-	MPOOL *c_mp, *mp;
-	int ret, wrote;
+	DB_MPOOL_HASH *hp;
+	MPOOL *c_mp;
+	u_int32_t n_cache;
+	int adjust, ret;

 	dbmp = dbmfp->dbmp;
 	dbenv = dbmp->dbenv;
-	mp = dbmp->reginfo[0].primary;
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_memp_fput(dbmfp, pgaddr, flags));
-#endif

 	PANIC_CHECK(dbenv);

@ -72,17 +61,6 @@ memp_fput(dbmfp, pgaddr, flags)
 		}
 	}

-	R_LOCK(dbenv, dbmp->reginfo);
-
-	/* Decrement the pinned reference count. */
-	if (dbmfp->pinref == 0) {
-		__db_err(dbenv,
-		    "%s: more pages returned than retrieved", __memp_fn(dbmfp));
-		R_UNLOCK(dbenv, dbmp->reginfo);
-		return (EINVAL);
-	} else
-		--dbmfp->pinref;
-
 	/*
 	 * If we're mapping the file, there's nothing to do.  Because we can
 	 * stop mapping the file at any time, we have to check on each buffer
@ -90,40 +68,51 @@ memp_fput(dbmfp, pgaddr, flags)
 	 * region.
 	 */
 	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
-	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) {
-		R_UNLOCK(dbenv, dbmp->reginfo);
+	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
 		return (0);
+
+#ifdef DIAGNOSTIC
+	/*
+	 * Decrement the per-file pinned buffer count (mapped pages aren't
+	 * counted).
+	 */
+	R_LOCK(dbenv, dbmp->reginfo);
+	if (dbmfp->pinref == 0) {
+		ret = EINVAL;
+		__db_err(dbenv,
+		    "%s: more pages returned than retrieved", __memp_fn(dbmfp));
+	} else {
+		ret = 0;
+		--dbmfp->pinref;
 	}
+	R_UNLOCK(dbenv, dbmp->reginfo);
+	if (ret != 0)
+		return (ret);
+#endif

-	/* Convert the page address to a buffer header. */
+	/* Convert a page address to a buffer header and hash bucket. */
 	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+	n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
+	c_mp = dbmp->reginfo[n_cache].primary;
+	hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+	hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];

-	/* Convert the buffer header to a cache. */
-	c_mp = BH_TO_CACHE(dbmp, bhp);
-
-/* UNLOCK THE REGION, LOCK THE CACHE. */
+	MUTEX_LOCK(dbenv, &hp->hash_mutex);

 	/* Set/clear the page bits. */
-	if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
-		++c_mp->stat.st_page_clean;
-		--c_mp->stat.st_page_dirty;
+	if (LF_ISSET(DB_MPOOL_CLEAN) &&
+	    F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) {
+		DB_ASSERT(hp->hash_page_dirty != 0);
+		--hp->hash_page_dirty;
 		F_CLR(bhp, BH_DIRTY);
 	}
 	if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
-		--c_mp->stat.st_page_clean;
-		++c_mp->stat.st_page_dirty;
+		++hp->hash_page_dirty;
 		F_SET(bhp, BH_DIRTY);
 	}
 	if (LF_ISSET(DB_MPOOL_DISCARD))
 		F_SET(bhp, BH_DISCARD);

-	/*
-	 * If the page is dirty and being scheduled to be written as part of
-	 * a checkpoint, we no longer know that the log is up-to-date.
-	 */
-	if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC))
-		F_SET(bhp, BH_SYNC_LOGFLSH);
-
 	/*
 	 * Check for a reference count going to zero.  This can happen if the
 	 * application returns a page twice.
@ -131,56 +120,83 @@ memp_fput(dbmfp, pgaddr, flags)
 	if (bhp->ref == 0) {
 		__db_err(dbenv, "%s: page %lu: unpinned page returned",
 		    __memp_fn(dbmfp), (u_long)bhp->pgno);
-		R_UNLOCK(dbenv, dbmp->reginfo);
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 		return (EINVAL);
 	}

 	/*
-	 * If more than one reference to the page, we're done.  Ignore the
-	 * discard flags (for now) and leave it at its position in the LRU
-	 * chain.  The rest gets done at last reference close.
+	 * If more than one reference to the page or a reference other than a
+	 * thread waiting to flush the buffer to disk, we're done.  Ignore the
+	 * discard flags (for now) and leave the buffer's priority alone.
 	 */
-	if (--bhp->ref > 0) {
-		R_UNLOCK(dbenv, dbmp->reginfo);
+	if (--bhp->ref > 1 || (bhp->ref == 1 && !F_ISSET(bhp, BH_LOCKED))) {
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 		return (0);
 	}

-	/*
-	 * Move the buffer to the head/tail of the LRU chain.  We do this
-	 * before writing the buffer for checkpoint purposes, as the write
-	 * can discard the region lock and allow another process to acquire
-	 * buffer.  We could keep that from happening, but there seems no
-	 * reason to do so.
-	 */
-	SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
-	if (F_ISSET(bhp, BH_DISCARD))
-		SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh);
-	else
-		SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+	/* Update priority values. */
+	if (F_ISSET(bhp, BH_DISCARD) ||
+	    dbmfp->mfp->priority == MPOOL_PRI_VERY_LOW)
+		bhp->priority = 0;
+	else {
+		/*
+		 * We don't lock the LRU counter or the stat.st_pages field; if
+		 * we get garbage (which won't happen on a 32-bit machine), it
+		 * only means a buffer has the wrong priority.
+		 */
+		bhp->priority = c_mp->lru_count;

-	/*
-	 * If this buffer is scheduled for writing because of a checkpoint, we
-	 * need to write it (if it's dirty), or update the checkpoint counters
-	 * (if it's not dirty).  If we try to write it and can't, that's not
-	 * necessarily an error as it's not completely unreasonable that the
-	 * application have permission to write the underlying file, but set a
-	 * flag so that the next time the memp_sync function is called we try
-	 * writing it there, as the checkpoint thread of control better be able
-	 * to write all of the files.
-	 */
-	if (F_ISSET(bhp, BH_SYNC)) {
-		if (F_ISSET(bhp, BH_DIRTY)) {
-			if (__memp_bhwrite(dbmp,
-			    dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote)
-				F_SET(mp, MP_LSN_RETRY);
-		} else {
-			F_CLR(bhp, BH_SYNC);
+		adjust = 0;
+		if (dbmfp->mfp->priority != 0)
+			adjust =
+			    (int)c_mp->stat.st_pages / dbmfp->mfp->priority;
+		if (F_ISSET(bhp, BH_DIRTY))
+			adjust += c_mp->stat.st_pages / MPOOL_PRI_DIRTY;

-			--mp->lsn_cnt;
-			--dbmfp->mfp->lsn_cnt;
-		}
+		if (adjust > 0) {
+			if (UINT32_T_MAX - bhp->priority <= (u_int32_t)adjust)
+				bhp->priority += adjust;
+		} else if (adjust < 0)
+			if (bhp->priority > (u_int32_t)-adjust)
+				bhp->priority += adjust;
 	}

-	R_UNLOCK(dbenv, dbmp->reginfo);
+	/*
+	 * Buffers on hash buckets are sorted by priority -- move the buffer
+	 * to the correct position in the list.
+	 */
+	argbhp = bhp;
+	SH_TAILQ_REMOVE(&hp->hash_bucket, argbhp, hq, __bh);
+
+	prev = NULL;
+	for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+	    bhp != NULL; prev = bhp, bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+		if (bhp->priority > argbhp->priority)
+			break;
+	if (prev == NULL)
+		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, argbhp, hq, __bh);
+	else
+		SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, prev, argbhp, hq, __bh);
+
+	/* Reset the hash bucket's priority. */
+	hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+#ifdef DIAGNOSTIC
+	__memp_check_order(hp);
+#endif
+
+	/*
+	 * The sync code has a separate counter for buffers on which it waits.
+	 * It reads that value without holding a lock so we update it as the
+	 * last thing we do.  Once that value goes to 0, we won't see another
+	 * reference to that buffer being returned to the cache until the sync
+	 * code has finished, so we're safe as long as we don't let the value
+	 * go to 0 before we finish with the buffer.
+	 */
+	if (F_ISSET(argbhp, BH_LOCKED) && argbhp->ref_sync != 0)
+		--argbhp->ref_sync;
+
+	MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
 	return (0);
 }
--- a/bdb/mp/mp_fset.c
+++ b/bdb/mp/mp_fset.c
@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_fset.c,v 11.25 2002/05/03 15:21:17 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
@ -15,25 +15,18 @@ static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Ex

 #endif

-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"

 /*
- * memp_fset --
+ * __memp_fset --
 *	Mpool page set-flag routine.
+ *
+ * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t));
 */
 int
-memp_fset(dbmfp, pgaddr, flags)
+__memp_fset(dbmfp, pgaddr, flags)
 	DB_MPOOLFILE *dbmfp;
 	void *pgaddr;
 	u_int32_t flags;
@ -41,17 +34,13 @@ memp_fset(dbmfp, pgaddr, flags)
 	BH *bhp;
 	DB_ENV *dbenv;
 	DB_MPOOL *dbmp;
-	MPOOL *c_mp, *mp;
+	DB_MPOOL_HASH *hp;
+	MPOOL *c_mp;
+	u_int32_t n_cache;
 	int ret;

 	dbmp = dbmfp->dbmp;
 	dbenv = dbmp->dbenv;
-	mp = dbmp->reginfo[0].primary;
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_memp_fset(dbmfp, pgaddr, flags));
-#endif

 	PANIC_CHECK(dbenv);

@ -60,7 +49,7 @@ memp_fset(dbmfp, pgaddr, flags)
 		return (__db_ferr(dbenv, "memp_fset", 1));

 	if ((ret = __db_fchk(dbenv, "memp_fset", flags,
-	    DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0)
+	    DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0)
 		return (ret);
 	if ((ret = __db_fcchk(dbenv, "memp_fset",
 	    flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0)
@ -72,27 +61,29 @@ memp_fset(dbmfp, pgaddr, flags)
 		return (EACCES);
 	}

-	/* Convert the page address to a buffer header. */
+	/* Convert the page address to a buffer header and hash bucket. */
 	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+	n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
+	c_mp = dbmp->reginfo[n_cache].primary;
+	hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+	hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];

-	/* Convert the buffer header to a cache. */
-	c_mp = BH_TO_CACHE(dbmp, bhp);
+	MUTEX_LOCK(dbenv, &hp->hash_mutex);

-	R_LOCK(dbenv, dbmp->reginfo);
-
-	if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
-		++c_mp->stat.st_page_clean;
-		--c_mp->stat.st_page_dirty;
+	/* Set/clear the page bits. */
+	if (LF_ISSET(DB_MPOOL_CLEAN) &&
+	    F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) {
+		DB_ASSERT(hp->hash_page_dirty != 0);
+		--hp->hash_page_dirty;
 		F_CLR(bhp, BH_DIRTY);
 	}
 	if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
-		--c_mp->stat.st_page_clean;
-		++c_mp->stat.st_page_dirty;
+		++hp->hash_page_dirty;
 		F_SET(bhp, BH_DIRTY);
 	}
 	if (LF_ISSET(DB_MPOOL_DISCARD))
 		F_SET(bhp, BH_DISCARD);

-	R_UNLOCK(dbenv, dbmp->reginfo);
+	MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 	return (0);
 }
--- a/bdb/mp/mp_method.c
+++ b/bdb/mp/mp_method.c
@ -1,30 +1,30 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $";
+static const char revid[] = "$Id: mp_method.c,v 11.29 2002/03/27 04:32:27 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
-#endif

-#ifdef  HAVE_RPC
-#include "db_server.h"
+#ifdef HAVE_RPC
+#include <rpc/rpc.h>
+#endif
 #endif

 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"

 #ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#include "dbinc_auto/db_server.h"
+#include "dbinc_auto/rpc_client_ext.h"
 #endif

 static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
@ -41,29 +41,46 @@ __memp_dbenv_create(dbenv)
 	DB_ENV *dbenv;
 {
 	/*
+	 * !!!
+	 * Our caller has not yet had the opportunity to reset the panic
+	 * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
+	 *
 	 * We default to 32 8K pages.  We don't default to a flat 256K, because
 	 * some systems require significantly more memory to hold 32 pages than
 	 * others.  For example, HP-UX with POSIX pthreads needs 88 bytes for
 	 * a POSIX pthread mutex and almost 200 bytes per buffer header, while
-	 * Solaris needs 24 and 52 bytes for the same structures.
+	 * Solaris needs 24 and 52 bytes for the same structures.  The minimum
+	 * number of hash buckets is 37, and each bucket also contains a mutex.
 	 */
-	dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH));
+	dbenv->mp_bytes =
+	    32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
 	dbenv->mp_ncache = 1;

-	dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
-	dbenv->set_cachesize = __memp_set_cachesize;
-
-#ifdef	HAVE_RPC
-	/*
-	 * If we have a client, overwrite what we just setup to
-	 * point to client functions.
-	 */
+#ifdef HAVE_RPC
 	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
 		dbenv->set_cachesize = __dbcl_env_cachesize;
 		dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize;
-	}
+		dbenv->memp_dump_region = NULL;
+		dbenv->memp_fcreate = __dbcl_memp_fcreate;
+		dbenv->memp_nameop = NULL;
+		dbenv->memp_register = __dbcl_memp_register;
+		dbenv->memp_stat = __dbcl_memp_stat;
+		dbenv->memp_sync = __dbcl_memp_sync;
+		dbenv->memp_trickle = __dbcl_memp_trickle;
+	} else
 #endif
-
+	{
+		dbenv->set_cachesize = __memp_set_cachesize;
+		dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
+		dbenv->memp_dump_region = __memp_dump_region;
+		dbenv->memp_fcreate = __memp_fcreate;
+		dbenv->memp_nameop = __memp_nameop;
+		dbenv->memp_register = __memp_register;
+		dbenv->memp_stat = __memp_stat;
+		dbenv->memp_sync = __memp_sync;
+		dbenv->memp_trickle = __memp_trickle;
+	}
 }

 /*
@ -78,26 +95,50 @@ __memp_set_cachesize(dbenv, gbytes, bytes, ncache)
 {
 	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize");

-	dbenv->mp_gbytes = gbytes + bytes / GIGABYTE;
-	dbenv->mp_bytes = bytes % GIGABYTE;
-	dbenv->mp_ncache = ncache == 0 ? 1 : ncache;
+	/* Normalize the values. */
+	if (ncache == 0)
+		ncache = 1;

 	/*
-	 * If the application requested less than 500Mb, increase the
-	 * cachesize by 25% to account for our overhead.  (I'm guessing
-	 * that caches over 500Mb are specifically sized, i.e., it's
-	 * a large server and the application actually knows how much
-	 * memory is available.)
+	 * You can only store 4GB-1 in an unsigned 32-bit value, so correct for
+	 * applications that specify 4GB cache sizes -- we know what they meant.
+	 */
+	if (gbytes / ncache == 4 && bytes == 0) {
+		--gbytes;
+		bytes = GIGABYTE - 1;
+	} else {
+		gbytes += bytes / GIGABYTE;
+		bytes %= GIGABYTE;
+	}
+
+	/* Avoid too-large cache sizes; they result in a region size of zero. */
+	if (gbytes / ncache > 4 || (gbytes / ncache == 4 && bytes != 0)) {
+		__db_err(dbenv, "individual cache size too large");
+		return (EINVAL);
+	}
+
+	/*
+	 * If the application requested less than 500Mb, increase the cachesize
+	 * by 25% and factor in the size of the hash buckets to account for our
+	 * overhead.  (I'm guessing caches over 500Mb are specifically sized,
+	 * that is, it's a large server and the application actually knows how
+	 * much memory is available.  We only document the 25% overhead number,
+	 * not the hash buckets, but I don't see a reason to confuse the issue,
+	 * it shouldn't matter to an application.)
 	 *
 	 * There is a minimum cache size, regardless.
 	 */
-	if (dbenv->mp_gbytes == 0) {
-		if (dbenv->mp_bytes < 500 * MEGABYTE)
-			dbenv->mp_bytes += dbenv->mp_bytes / 4;
-		if (dbenv->mp_bytes < DB_CACHESIZE_MIN)
-			dbenv->mp_bytes = DB_CACHESIZE_MIN;
+	if (gbytes == 0) {
+		if (bytes < 500 * MEGABYTE)
+			bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH);
+		if (bytes / ncache < DB_CACHESIZE_MIN)
+			bytes = ncache * DB_CACHESIZE_MIN;
 	}

+	dbenv->mp_gbytes = gbytes;
+	dbenv->mp_bytes = bytes;
+	dbenv->mp_ncache = ncache;
+
 	return (0);
 }

--- a/bdb/mp/mp_region.c
+++ b/bdb/mp/mp_region.c
@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_region.c,v 11.49 2002/05/07 18:42:20 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
@ -17,11 +17,11 @@ static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell
 #endif

 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"

 static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int));
-#ifdef MUTEX_SYSTEM_RESOURCES
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
 static size_t __mpool_region_maint __P((REGINFO *));
 #endif

@ -119,6 +119,8 @@ __memp_open(dbenv)

 			regids[i] = dbmp->reginfo[i].id;
 		}
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
 	} else {
 		/*
 		 * Determine how many regions there are going to be, allocate
@ -135,6 +137,19 @@ __memp_open(dbenv)
 			dbmp->reginfo[i].id = INVALID_REGION_ID;
 		dbmp->reginfo[0] = reginfo;

+		/*
+		 * We have to unlock the primary mpool region before we attempt
+		 * to join the additional mpool regions.  If we don't, we can
+		 * deadlock.  The scenario is that we hold the primary mpool
+		 * region lock.  We then try to attach to an additional mpool
+		 * region, which requires the acquisition/release of the main
+		 * region lock (to search the list of regions).  If another
+		 * thread of control already holds the main region lock and is
+		 * waiting on our primary mpool region lock, we'll deadlock.
+		 * See [#4696] for more information.
+		 */
+		R_UNLOCK(dbenv, dbmp->reginfo);
+
 		/* Join remaining regions. */
 		regids = R_ADDR(dbmp->reginfo, mp->regids);
 		for (i = 1; i < dbmp->nreg; ++i) {
@ -155,17 +170,10 @@ __memp_open(dbenv)
 		    R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);

 	/* If the region is threaded, allocate a mutex to lock the handles. */
-	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
-		if ((ret = __db_mutex_alloc(
-		    dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) {
-			goto err;
-		}
-		if ((ret =
-		    __db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0)
-			goto err;
-	}
-
-	R_UNLOCK(dbenv, dbmp->reginfo);
+	if (F_ISSET(dbenv, DB_ENV_THREAD) &&
+	    (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmp->mutexp,
+	    MUTEX_ALLOC | MUTEX_THREAD)) != 0)
+		goto err;

 	dbenv->mp_handle = dbmp;
 	return (0);
@ -180,12 +188,11 @@ err:	if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
 			if (dbmp->reginfo[i].id != INVALID_REGION_ID)
 				(void)__db_r_detach(
 				    dbenv, &dbmp->reginfo[i], 0);
-		__os_free(dbmp->reginfo,
-		    dbmp->nreg * sizeof(*dbmp->reginfo));
+		__os_free(dbenv, dbmp->reginfo);
 	}
 	if (dbmp->mutexp != NULL)
 		__db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp);
-	__os_free(dbmp, sizeof(*dbmp));
+	__os_free(dbenv, dbmp);
 	return (ret);
 }

@ -199,13 +206,13 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
 	DB_MPOOL *dbmp;
 	int reginfo_off, htab_buckets;
 {
-	DB_HASHTAB *htab;
+	DB_MPOOL_HASH *htab;
 	MPOOL *mp;
 	REGINFO *reginfo;
-#ifdef MUTEX_SYSTEM_RESOURCES
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
 	size_t maint_size;
 #endif
-	int ret;
+	int i, ret;
 	void *p;

 	mp = NULL;
@ -218,7 +225,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
 	mp = reginfo->primary;
 	memset(mp, 0, sizeof(*mp));

-#ifdef	MUTEX_SYSTEM_RESOURCES
+#ifdef	HAVE_MUTEX_SYSTEM_RESOURCES
 	maint_size = __mpool_region_maint(reginfo);
 	/* Allocate room for the maintenance info and initialize it. */
 	if ((ret = __db_shalloc(reginfo->addr,
@ -231,14 +238,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
 	if (reginfo_off == 0) {
 		SH_TAILQ_INIT(&mp->mpfq);

-		if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex,
-		    R_OFFSET(dbmp->reginfo, &mp->sync_mutex) +
-		    DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo,
-		    (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0)
-			goto err;
-
 		ZERO_LSN(mp->lsn);
-		mp->lsn_cnt = 0;

 		mp->nreg = dbmp->nreg;
 		if ((ret = __db_shalloc(dbmp->reginfo[0].addr,
@ -247,32 +247,41 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
 		mp->regids = R_OFFSET(dbmp->reginfo, p);
 	}

-	SH_TAILQ_INIT(&mp->bhq);
-
 	/* Allocate hash table space and initialize it. */
 	if ((ret = __db_shalloc(reginfo->addr,
-	    htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0)
+	    htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0)
 		goto mem_err;
-	__db_hashinit(htab, htab_buckets);
 	mp->htab = R_OFFSET(reginfo, htab);
-	mp->htab_buckets = htab_buckets;
+	for (i = 0; i < htab_buckets; i++) {
+		if ((ret = __db_mutex_setup(dbenv,
+		    reginfo, &htab[i].hash_mutex,
+		    MUTEX_NO_RLOCK)) != 0)
+			return (ret);
+		SH_TAILQ_INIT(&htab[i].hash_bucket);
+		htab[i].hash_page_dirty = htab[i].hash_priority = 0;
+	}
+	mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets;

+	/*
+	 * Only the environment creator knows the total cache size; fill in
+	 * those statistics now.
+	 */
+	mp->stat.st_gbytes = dbenv->mp_gbytes;
+	mp->stat.st_bytes = dbenv->mp_bytes;
 	return (0);

 mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region");
-err:	if (reginfo->primary != NULL)
-		__db_shalloc_free(reginfo->addr, reginfo->primary);
 	return (ret);
 }

 /*
- * __memp_close --
- *	Internal version of memp_close: only called from DB_ENV->close.
+ * __memp_dbenv_refresh --
+ *	Clean up after the mpool system on a close or failed open.
 *
- * PUBLIC: int __memp_close __P((DB_ENV *));
+ * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *));
 */
 int
-__memp_close(dbenv)
+__memp_dbenv_refresh(dbenv)
 	DB_ENV *dbenv;
 {
 	DB_MPOOL *dbmp;
@ -287,12 +296,12 @@ __memp_close(dbenv)
 	/* Discard DB_MPREGs. */
 	while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
 		LIST_REMOVE(mpreg, q);
-		__os_free(mpreg, sizeof(DB_MPREG));
+		__os_free(dbenv, mpreg);
 	}

 	/* Discard DB_MPOOLFILEs. */
 	while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
-		if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0)
+		if ((t_ret = __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
 			ret = t_ret;

 	/* Discard the thread mutex. */
@ -305,14 +314,14 @@ __memp_close(dbenv)
 		    dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0)
 			ret = t_ret;

-	__os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo));
-	__os_free(dbmp, sizeof(*dbmp));
+	__os_free(dbenv, dbmp->reginfo);
+	__os_free(dbenv, dbmp);

 	dbenv->mp_handle = NULL;
 	return (ret);
 }

-#ifdef MUTEX_SYSTEM_RESOURCES
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
 /*
 * __mpool_region_maint --
 *	Return the amount of space needed for region maintenance info.
@ -328,9 +337,11 @@ __mpool_region_maint(infop)
 	/*
 	 * For mutex maintenance we need one mutex per possible page.
 	 * Compute the maximum number of pages this cache can have.
-	 * Also add in an mpool mutex.
+	 * Also add in an mpool mutex and mutexes for all dbenv and db
+	 * handles.
 	 */
 	numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1);
+	numlocks += DB_MAX_HANDLES;
 	s = sizeof(roff_t) * numlocks;
 	return (s);
 }
@ -347,11 +358,109 @@ __mpool_region_destroy(dbenv, infop)
 	DB_ENV *dbenv;
 	REGINFO *infop;
 {
-	MPOOL *mp;
+	__db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop,
+	    ((MPOOL *)R_ADDR(infop, infop->rp->primary))->maint_off));

 	COMPQUIET(dbenv, NULL);
-	mp = R_ADDR(infop, infop->rp->primary);
-
-	__db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off));
-	return;
+	COMPQUIET(infop, NULL);
+}
+
+/*
+ * __memp_nameop --
+ *	Remove or rename a file in the pool.
+ *
+ * PUBLIC: int  __memp_nameop __P((DB_ENV *,
+ * PUBLIC:     u_int8_t *, const char *, const char *, const char *));
+ *
+ * XXX
+ * Undocumented interface: DB private.
+ */
+int
+__memp_nameop(dbenv, fileid, newname, fullold, fullnew)
+	DB_ENV *dbenv;
+	u_int8_t *fileid;
+	const char *newname, *fullold, *fullnew;
+{
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	roff_t newname_off;
+	int locked, ret;
+	void *p;
+
+	locked = 0;
+	dbmp = NULL;
+
+	if (!MPOOL_ON(dbenv))
+		goto fsop;
+
+	dbmp = dbenv->mp_handle;
+	mp = dbmp->reginfo[0].primary;
+
+	/*
+	 * Remove or rename a file that the mpool might know about.  We assume
+	 * that the fop layer has the file locked for exclusive access, so we
+	 * don't worry about locking except for the mpool mutexes.  Checkpoint
+	 * can happen at any time, independent of file locking, so we have to
+	 * do the actual unlink or rename system call to avoid any race.
+	 *
+	 * If this is a rename, allocate first, because we can't recursively
+	 * grab the region lock.
+	 */
+	if (newname == NULL)
+		p = NULL;
+	else {
+		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+		    NULL, strlen(newname) + 1, &newname_off, &p)) != 0)
+			return (ret);
+		memcpy(p, newname, strlen(newname) + 1);
+	}
+
+	locked = 1;
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/*
+	 * Find the file -- if mpool doesn't know about this file, that's not
+	 * an error -- we may not have it open.
+	 */
+	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+		/* Ignore non-active files. */
+		if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+			continue;
+
+		/* Ignore non-matching files. */
+		if (memcmp(fileid, R_ADDR(
+		    dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+			continue;
+
+		/* If newname is NULL, we're removing the file. */
+		if (newname == NULL) {
+			MUTEX_LOCK(dbenv, &mfp->mutex);
+			MPOOLFILE_IGNORE(mfp);
+			MUTEX_UNLOCK(dbenv, &mfp->mutex);
+		} else {
+			/*
+			 * Else, it's a rename.  We've allocated memory
+			 * for the new name.  Swap it with the old one.
+			 */
+			p = R_ADDR(dbmp->reginfo, mfp->path_off);
+			mfp->path_off = newname_off;
+		}
+		break;
+	}
+
+	/* Delete the memory we no longer need. */
+	if (p != NULL)
+		__db_shalloc_free(dbmp->reginfo[0].addr, p);
+
+fsop:	if (newname == NULL)
+		(void)__os_unlink(dbenv, fullold);
+	else
+		(void)__os_rename(dbenv, fullold, fullnew, 1);
+
+	if (locked)
+		R_UNLOCK(dbenv, dbmp->reginfo);
+
+	return (0);
 }
--- a/bdb/mp/mp_register.c
+++ b/bdb/mp/mp_register.c
@ -1,38 +1,33 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $";
+static const char revid[] = "$Id: mp_register.c,v 11.21 2002/03/27 04:32:27 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 #endif

-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"

 /*
 * memp_register --
 *	Register a file type's pgin, pgout routines.
+ *
+ * PUBLIC: int __memp_register __P((DB_ENV *, int,
+ * PUBLIC:     int (*)(DB_ENV *, db_pgno_t, void *, DBT *),
+ * PUBLIC:     int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
 */
 int
-memp_register(dbenv, ftype, pgin, pgout)
+__memp_register(dbenv, ftype, pgin, pgout)
 	DB_ENV *dbenv;
 	int ftype;
 	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
@ -42,13 +37,9 @@ memp_register(dbenv, ftype, pgin, pgout)
 	DB_MPREG *mpreg;
 	int ret;

-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_memp_register(dbenv, ftype, pgin, pgout));
-#endif
-
 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL);

 	dbmp = dbenv->mp_handle;

@ -70,7 +61,7 @@ memp_register(dbenv, ftype, pgin, pgout)
 		return (0);

 	/* New entry. */
-	if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &mpreg)) != 0)
+	if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), &mpreg)) != 0)
 		return (ret);

 	mpreg->ftype = ftype;
--- a/bdb/mp/mp_stat.c
+++ b/bdb/mp/mp_stat.c
@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $";
+static const char revid[] = "$Id: mp_stat.c,v 11.51 2002/08/06 06:13:47 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
@ -18,123 +18,150 @@ static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic E
 #include <unistd.h>
 #endif

-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "db_am.h"
-#include "mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"

-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-static void __memp_dumpcache
-		__P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
+static void __memp_dumpcache __P((DB_ENV *,
+		DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
 static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *));
+static void __memp_stat_wait __P((REGINFO *, MPOOL *, DB_MPOOL_STAT *, int));

 /*
- * memp_stat --
+ * __memp_stat --
 *	Display MPOOL statistics.
+ *
+ * PUBLIC: int __memp_stat
+ * PUBLIC:     __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
 */
 int
-memp_stat(dbenv, gspp, fspp, db_malloc)
+__memp_stat(dbenv, gspp, fspp, flags)
 	DB_ENV *dbenv;
 	DB_MPOOL_STAT **gspp;
 	DB_MPOOL_FSTAT ***fspp;
-	void *(*db_malloc) __P((size_t));
+	u_int32_t flags;
 {
 	DB_MPOOL *dbmp;
 	DB_MPOOL_FSTAT **tfsp, *tstruct;
 	DB_MPOOL_STAT *sp;
 	MPOOL *c_mp, *mp;
 	MPOOLFILE *mfp;
-	char *tname;
-	size_t len, nlen;
-	u_int32_t i;
+	size_t len, nlen, pagesize;
+	u_int32_t pages, i;
 	int ret;
-	char *name;
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc));
-#endif
+	char *name, *tname;

 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->mp_handle, "memp_stat", DB_INIT_MPOOL);
+
+	if ((ret = __db_fchk(dbenv,
+	    "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0)
+		return (ret);

 	dbmp = dbenv->mp_handle;
-	sp = NULL;
+	mp = dbmp->reginfo[0].primary;

 	/* Global statistics. */
-	mp = dbmp->reginfo[0].primary;
 	if (gspp != NULL) {
 		*gspp = NULL;

-		if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0)
+		if ((ret = __os_umalloc(dbenv, sizeof(**gspp), gspp)) != 0)
 			return (ret);
+		memset(*gspp, 0, sizeof(**gspp));
 		sp = *gspp;

 		/*
 		 * Initialization and information that is not maintained on
 		 * a per-cache basis.
 		 */
-		sp->st_hash_longest = 0;
-		sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait;
-		sp->st_region_nowait =
-		    dbmp->reginfo[0].rp->mutex.mutex_set_nowait;
-		sp->st_gbytes = dbenv->mp_gbytes;
-		sp->st_bytes = dbenv->mp_bytes;
+		c_mp = dbmp->reginfo[0].primary;
+		sp->st_gbytes = c_mp->stat.st_gbytes;
+		sp->st_bytes = c_mp->stat.st_bytes;
 		sp->st_ncache = dbmp->nreg;
 		sp->st_regsize = dbmp->reginfo[0].rp->size;

-		R_LOCK(dbenv, dbmp->reginfo);
-
 		/* Walk the cache list and accumulate the global information. */
 		for (i = 0; i < mp->nreg; ++i) {
 			c_mp = dbmp->reginfo[i].primary;
+
+			sp->st_map += c_mp->stat.st_map;
 			sp->st_cache_hit += c_mp->stat.st_cache_hit;
 			sp->st_cache_miss += c_mp->stat.st_cache_miss;
-			sp->st_map += c_mp->stat.st_map;
 			sp->st_page_create += c_mp->stat.st_page_create;
 			sp->st_page_in += c_mp->stat.st_page_in;
 			sp->st_page_out += c_mp->stat.st_page_out;
 			sp->st_ro_evict += c_mp->stat.st_ro_evict;
 			sp->st_rw_evict += c_mp->stat.st_rw_evict;
+			sp->st_page_trickle += c_mp->stat.st_page_trickle;
+			sp->st_pages += c_mp->stat.st_pages;
+			/*
+			 * st_page_dirty	calculated by __memp_stat_hash
+			 * st_page_clean	calculated here
+			 */
+			__memp_stat_hash(
+			    &dbmp->reginfo[i], c_mp, &sp->st_page_dirty);
+			sp->st_page_clean = sp->st_pages - sp->st_page_dirty;
 			sp->st_hash_buckets += c_mp->stat.st_hash_buckets;
 			sp->st_hash_searches += c_mp->stat.st_hash_searches;
-			if (c_mp->stat.st_hash_longest > sp->st_hash_longest)
-				sp->st_hash_longest =
-				    c_mp->stat.st_hash_longest;
+			sp->st_hash_longest += c_mp->stat.st_hash_longest;
 			sp->st_hash_examined += c_mp->stat.st_hash_examined;
-			sp->st_page_clean += c_mp->stat.st_page_clean;
-			sp->st_page_dirty += c_mp->stat.st_page_dirty;
-			sp->st_page_trickle += c_mp->stat.st_page_trickle;
-			sp->st_region_wait += c_mp->stat.st_region_wait;
-			sp->st_region_nowait += c_mp->stat.st_region_nowait;
+			/*
+			 * st_hash_nowait	calculated by __memp_stat_wait
+			 * st_hash_wait
+			 */
+			__memp_stat_wait(&dbmp->reginfo[i], c_mp, sp, flags);
+			sp->st_region_nowait +=
+			    dbmp->reginfo[i].rp->mutex.mutex_set_nowait;
+			sp->st_region_wait +=
+			    dbmp->reginfo[i].rp->mutex.mutex_set_wait;
+			sp->st_alloc += c_mp->stat.st_alloc;
+			sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets;
+			if (sp->st_alloc_max_buckets <
+			    c_mp->stat.st_alloc_max_buckets)
+				sp->st_alloc_max_buckets =
+				    c_mp->stat.st_alloc_max_buckets;
+			sp->st_alloc_pages += c_mp->stat.st_alloc_pages;
+			if (sp->st_alloc_max_pages <
+			    c_mp->stat.st_alloc_max_pages)
+				sp->st_alloc_max_pages =
+				    c_mp->stat.st_alloc_max_pages;
+
+			if (LF_ISSET(DB_STAT_CLEAR)) {
+				dbmp->reginfo[i].rp->mutex.mutex_set_wait = 0;
+				dbmp->reginfo[i].rp->mutex.mutex_set_nowait = 0;
+				pages = c_mp->stat.st_pages;
+				memset(&c_mp->stat, 0, sizeof(c_mp->stat));
+				c_mp->stat.st_hash_buckets = c_mp->htab_buckets;
+				c_mp->stat.st_pages = pages;
+			}
 		}

 		/*
-		 * We have duplicate statistics fields in the cache and
-		 * per-file structures.  The counters are only incremented
-		 * in the per-file structures, though.  The intent is that
-		 * if we ever flush files from the pool we can save their
-		 * last known totals in the cache structure.
+		 * We have duplicate statistics fields in per-file structures
+		 * and the cache.  The counters are only incremented in the
+		 * per-file structures, except if a file is flushed from the
+		 * mpool, at which time we copy its information into the cache
+		 * statistics.  We added the cache information above, now we
+		 * add the per-file information.
 		 */
+		R_LOCK(dbenv, dbmp->reginfo);
 		for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
 		    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			sp->st_map += mfp->stat.st_map;
 			sp->st_cache_hit += mfp->stat.st_cache_hit;
 			sp->st_cache_miss += mfp->stat.st_cache_miss;
-			sp->st_map += mfp->stat.st_map;
 			sp->st_page_create += mfp->stat.st_page_create;
 			sp->st_page_in += mfp->stat.st_page_in;
 			sp->st_page_out += mfp->stat.st_page_out;
+			if (fspp == NULL && LF_ISSET(DB_STAT_CLEAR)) {
+				pagesize = mfp->stat.st_pagesize;
+				memset(&mfp->stat, 0, sizeof(mfp->stat));
+				mfp->stat.st_pagesize = pagesize;
+			}
 		}
-
 		R_UNLOCK(dbenv, dbmp->reginfo);
 	}

@ -142,9 +169,8 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
 	if (fspp != NULL) {
 		*fspp = NULL;

-		R_LOCK(dbenv, dbmp->reginfo);
-
 		/* Count the MPOOLFILE structures. */
+		R_LOCK(dbenv, dbmp->reginfo);
 		for (i = 0, len = 0,
 		    mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
 		    mfp != NULL;
@ -153,18 +179,15 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
 			    sizeof(DB_MPOOL_FSTAT) +
 			    strlen(__memp_fns(dbmp, mfp)) + 1;
 		len += sizeof(DB_MPOOL_FSTAT *);	/* Trailing NULL */
-
 		R_UNLOCK(dbenv, dbmp->reginfo);

-		if (len == 0)
+		if (i == 0)
 			return (0);

 		/* Allocate space */
-		if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0)
+		if ((ret = __os_umalloc(dbenv, len, fspp)) != 0)
 			return (ret);

-		R_LOCK(dbenv, dbmp->reginfo);
-
 		/*
 		 * Build each individual entry.  We assume that an array of
 		 * pointers are aligned correctly to be followed by an array
@ -179,20 +202,30 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
 		tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1);
 		tname = (char *)(tstruct + i);

+		/*
+		 * Files may have been opened since we counted; don't walk
+		 * off the end of the allocated space.
+		 */
+		R_LOCK(dbenv, dbmp->reginfo);
 		for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
-		    mfp != NULL;
+		    mfp != NULL && i-- > 0;
 		    ++tfsp, ++tstruct, tname += nlen,
 		    mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
 			name = __memp_fns(dbmp, mfp);
 			nlen = strlen(name) + 1;
 			*tfsp = tstruct;
 			*tstruct = mfp->stat;
+			if (LF_ISSET(DB_STAT_CLEAR)) {
+				pagesize = mfp->stat.st_pagesize;
+				memset(&mfp->stat, 0, sizeof(mfp->stat));
+				mfp->stat.st_pagesize = pagesize;
+			}
 			tstruct->file_name = tname;
 			memcpy(tname, name, nlen);
 		}
-		*tfsp = NULL;
-
 		R_UNLOCK(dbenv, dbmp->reginfo);
+
+		*tfsp = NULL;
 	}
 	return (0);
 }
@ -200,7 +233,6 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
 #define	FMAP_ENTRIES	200			/* Files we map. */

 #define	MPOOL_DUMP_HASH	0x01			/* Debug hash chains. */
-#define	MPOOL_DUMP_LRU	0x02			/* Debug LRU chains. */
 #define	MPOOL_DUMP_MEM	0x04			/* Debug region memory. */
 #define	MPOOL_DUMP_ALL	0x07			/* Debug all. */

@ -208,14 +240,23 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
 * __memp_dump_region --
 *	Display MPOOL structures.
 *
- * PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *));
+ * PUBLIC: int __memp_dump_region __P((DB_ENV *, char *, FILE *));
 */
-void
+int
 __memp_dump_region(dbenv, area, fp)
 	DB_ENV *dbenv;
 	char *area;
 	FILE *fp;
 {
+	static const FN fn[] = {
+		{ MP_CAN_MMAP,	"mmapped" },
+		{ MP_DEADFILE,	"dead" },
+		{ MP_DIRECT,	"no buffer" },
+		{ MP_EXTENT,	"extent" },
+		{ MP_TEMP,	"temporary" },
+		{ MP_UNLINK,	"unlink" },
+		{ 0,		NULL }
+	};
 	DB_MPOOL *dbmp;
 	DB_MPOOLFILE *dbmfp;
 	MPOOL *mp;
@ -225,6 +266,10 @@ __memp_dump_region(dbenv, area, fp)
 	int cnt;
 	u_int8_t *p;

+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->mp_handle, "memp_dump_region", DB_INIT_MPOOL);
+
 	dbmp = dbenv->mp_handle;

 	/* Make it easy to call from the debugger. */
@ -239,40 +284,42 @@ __memp_dump_region(dbenv, area, fp)
 		case 'h':
 			LF_SET(MPOOL_DUMP_HASH);
 			break;
-		case 'l':
-			LF_SET(MPOOL_DUMP_LRU);
-			break;
 		case 'm':
 			LF_SET(MPOOL_DUMP_MEM);
 			break;
 		}

-	R_LOCK(dbenv, dbmp->reginfo);
-
 	mp = dbmp->reginfo[0].primary;

 	/* Display MPOOL structures. */
 	(void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n",
-	    DB_LINE, (u_long)dbmp->reginfo[0].addr);
+	    DB_LINE, P_TO_ULONG(dbmp->reginfo[0].addr));

 	/* Display the MPOOLFILE structures. */
-	cnt = 0;
-	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+	R_LOCK(dbenv, dbmp->reginfo);
+	for (cnt = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
 	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) {
-		(void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ",
-		    cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype,
-		    F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write");
+		(void)fprintf(fp, "File #%d: %s: pagesize %lu\n", cnt + 1,
+		    __memp_fns(dbmp, mfp), (u_long)mfp->stat.st_pagesize);
+		(void)fprintf(fp, "\t type %ld; ref %lu; blocks %lu; last %lu;",
+		    (long)mfp->ftype, (u_long)mfp->mpf_cnt,
+		    (u_long)mfp->block_cnt, (u_long)mfp->last_pgno);
+		__db_prflags(mfp->flags, fn, fp);
+
+		(void)fprintf(fp, "\n\t UID: ");
 		p = R_ADDR(dbmp->reginfo, mfp->fileid_off);
-		for (i = 0; i < DB_FILE_ID_LEN; ++i) {
-			(void)fprintf(fp, "%x", *p++);
+		for (i = 0; i < DB_FILE_ID_LEN; ++i, ++p) {
+			(void)fprintf(fp, "%x", (u_int)*p);
 			if (i < DB_FILE_ID_LEN - 1)
 				(void)fprintf(fp, " ");
 		}
-		(void)fprintf(fp, "]\n");
+		(void)fprintf(fp, "\n");
 		if (cnt < FMAP_ENTRIES)
 			fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
 	}
+	R_UNLOCK(dbenv, dbmp->reginfo);

+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
 	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
 	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) {
 		(void)fprintf(fp, "File #%d: %s: per-process, %s\n",
@ -281,6 +328,7 @@ __memp_dump_region(dbenv, area, fp)
 		    if (cnt < FMAP_ENTRIES)
 			fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
 	}
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
 	if (cnt < FMAP_ENTRIES)
 		fmap[cnt] = INVALID_ROFF;
 	else
@ -289,13 +337,14 @@ __memp_dump_region(dbenv, area, fp)
 	/* Dump the memory pools. */
 	for (i = 0; i < mp->nreg; ++i) {
 		(void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1);
-		__memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags);
+		__memp_dumpcache(
+		    dbenv, dbmp, &dbmp->reginfo[i], fmap, fp, flags);
 	}

-	R_UNLOCK(dbenv, dbmp->reginfo);
-
 	/* Flush in case we're debugging. */
 	(void)fflush(fp);
+
+	return (0);
 }

 /*
@ -303,7 +352,8 @@ __memp_dump_region(dbenv, area, fp)
 *	Display statistics for a cache.
 */
 static void
-__memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
+__memp_dumpcache(dbenv, dbmp, reginfo, fmap, fp, flags)
+	DB_ENV *dbenv;
 	DB_MPOOL *dbmp;
 	REGINFO *reginfo;
 	size_t *fmap;
@ -311,7 +361,7 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
 	u_int32_t flags;
 {
 	BH *bhp;
-	DB_HASHTAB *dbht;
+	DB_MPOOL_HASH *hp;
 	MPOOL *c_mp;
 	int bucket;

@ -320,25 +370,22 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
 	/* Display the hash table list of BH's. */
 	if (LF_ISSET(MPOOL_DUMP_HASH)) {
 		(void)fprintf(fp,
-	    "%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n",
+		    "%s\nBH hash table (%lu hash slots)\nbucket (priority):\n",
 		    DB_LINE, (u_long)c_mp->htab_buckets);
-		for (dbht = R_ADDR(reginfo, c_mp->htab),
-		    bucket = 0; bucket < c_mp->htab_buckets; ++dbht, ++bucket) {
-			if (SH_TAILQ_FIRST(dbht, __bh) != NULL)
-				(void)fprintf(fp, "%lu:\n", (u_long)bucket);
-			for (bhp = SH_TAILQ_FIRST(dbht, __bh);
-			    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
-				__memp_pbh(dbmp, bhp, fmap, fp);
-		}
-	}
+		(void)fprintf(fp,
+		    "\tpageno, file, ref, address [LSN] priority\n");

-	/* Display the LRU list of BH's. */
-	if (LF_ISSET(MPOOL_DUMP_LRU)) {
-		(void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE);
-		(void)fprintf(fp, "pageno, file, ref, address\n");
-		for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
-		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
-			__memp_pbh(dbmp, bhp, fmap, fp);
+		for (hp = R_ADDR(reginfo, c_mp->htab),
+		    bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+			MUTEX_LOCK(dbenv, &hp->hash_mutex);
+			if ((bhp =
+			    SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+				(void)fprintf(fp, "%lu (%u):\n",
+				    (u_long)bucket, hp->hash_priority);
+			for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+				__memp_pbh(dbmp, bhp, fmap, fp);
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+		}
 	}

 	/* Dump the memory pool. */
@ -360,10 +407,9 @@ __memp_pbh(dbmp, bhp, fmap, fp)
 	static const FN fn[] = {
 		{ BH_CALLPGIN,		"callpgin" },
 		{ BH_DIRTY,		"dirty" },
+		{ BH_DIRTY_CREATE,	"created" },
 		{ BH_DISCARD,		"discard" },
 		{ BH_LOCKED,		"locked" },
-		{ BH_SYNC,		"sync" },
-		{ BH_SYNC_LOGFLSH,	"sync:logflush" },
 		{ BH_TRASH,		"trash" },
 		{ 0,			NULL }
 	};
@ -374,15 +420,72 @@ __memp_pbh(dbmp, bhp, fmap, fp)
 			break;

 	if (fmap[i] == INVALID_ROFF)
-		(void)fprintf(fp, "  %4lu, %lu, %2lu, %lu",
+		(void)fprintf(fp, "\t%5lu, %lu, %2lu, %8lu [%lu,%lu] %lu",
 		    (u_long)bhp->pgno, (u_long)bhp->mf_offset,
-		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp),
+		    (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset,
+		    (u_long)bhp->priority);
 	else
-		(void)fprintf(fp, "  %4lu,   #%d,  %2lu, %lu",
+		(void)fprintf(fp, "\t%5lu,   #%d,  %2lu, %8lu [%lu,%lu] %lu",
 		    (u_long)bhp->pgno, i + 1,
-		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp),
+		    (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset,
+		    (u_long)bhp->priority);

 	__db_prflags(bhp->flags, fn, fp);

 	(void)fprintf(fp, "\n");
 }
+
+/*
+ * __memp_stat_hash --
+ *	Sum hash_page_dirty (not mutex waits) over mp's buckets into *dirtyp.
+ *
+ * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *));
+ */
+void
+__memp_stat_hash(reginfo, mp, dirtyp)
+	REGINFO *reginfo;		/* Region used to resolve mp->htab. */
+	MPOOL *mp;			/* Supplies htab and htab_buckets. */
+	u_int32_t *dirtyp;		/* Out: sum of hash_page_dirty. */
+{
+	DB_MPOOL_HASH *hp;
+	u_int32_t dirty;
+	int i;
+
+	hp = R_ADDR(reginfo, mp->htab);	/* First of htab_buckets entries. */
+	for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++)
+		dirty += hp->hash_page_dirty;
+	*dirtyp = dirty;		/* Always written, even when 0. */
+}
+
+/*
+ * __memp_stat_wait --
+ *	Add each bucket mutex's wait/nowait counts to mstat; record max wait.
+ */
+static void
+__memp_stat_wait(reginfo, mp, mstat, flags)
+	REGINFO *reginfo;		/* Region used to resolve mp->htab. */
+	MPOOL *mp;			/* Supplies htab and htab_buckets. */
+	DB_MPOOL_STAT *mstat;		/* Updated: st_hash_* wait fields. */
+	int flags;			/* DB_STAT_CLEAR zeroes counters. */
+{
+	DB_MPOOL_HASH *hp;
+	DB_MUTEX *mutexp;
+	int i;
+
+	mstat->st_hash_max_wait = 0;	/* Only the max is reset here. */
+	hp = R_ADDR(reginfo, mp->htab);
+	for (i = 0; i < mp->htab_buckets; i++, hp++) {
+		mutexp = &hp->hash_mutex;
+		/* The two sums add to whatever mstat already holds. */
+		mstat->st_hash_nowait += mutexp->mutex_set_nowait;
+		mstat->st_hash_wait += mutexp->mutex_set_wait;
+		if (mutexp->mutex_set_wait > mstat->st_hash_max_wait)
+			mstat->st_hash_max_wait = mutexp->mutex_set_wait;
+
+		if (LF_ISSET(DB_STAT_CLEAR)) {	/* Zero after reading. */
+			mutexp->mutex_set_wait = 0;
+			mutexp->mutex_set_nowait = 0;
+		}
+	}
+}
--- a/bdb/mp/mp_sync.c
+++ b/bdb/mp/mp_sync.c
--- a/bdb/mp/mp_trickle.c
+++ b/bdb/mp/mp_trickle.c
@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
 #include "db_config.h"

 #ifndef lint
-static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_trickle.c,v 11.24 2002/08/06 06:13:53 bostic Exp $";
 #endif /* not lint */

 #ifndef NO_SYSTEM_INCLUDES
@ -16,42 +16,29 @@ static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell
 #include <stdlib.h>
 #endif

-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-static int __memp_trick __P((DB_ENV *, int, int, int *));
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"

 /*
- * memp_trickle --
+ * __memp_trickle --
 *	Keep a specified percentage of the buffers clean.
+ *
+ * PUBLIC: int __memp_trickle __P((DB_ENV *, int, int *));
 */
 int
-memp_trickle(dbenv, pct, nwrotep)
+__memp_trickle(dbenv, pct, nwrotep)
 	DB_ENV *dbenv;
 	int pct, *nwrotep;
 {
 	DB_MPOOL *dbmp;
-	MPOOL *mp;
-	u_int32_t i;
-	int ret;
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_memp_trickle(dbenv, pct, nwrotep));
-#endif
+	MPOOL *c_mp, *mp;
+	u_int32_t clean, dirty, i, total, dtmp;
+	int ret, wrote;

 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->mp_handle, "memp_trickle", DB_INIT_MPOOL);

 	dbmp = dbenv->mp_handle;
 	mp = dbmp->reginfo[0].primary;
@ -62,88 +49,35 @@ memp_trickle(dbenv, pct, nwrotep)
 	if (pct < 1 || pct > 100)
 		return (EINVAL);

-	R_LOCK(dbenv, dbmp->reginfo);
-
-	/* Loop through the caches... */
-	for (ret = 0, i = 0; i < mp->nreg; ++i)
-		if ((ret = __memp_trick(dbenv, i, pct, nwrotep)) != 0)
-			break;
-
-	R_UNLOCK(dbenv, dbmp->reginfo);
-	return (ret);
-}
-
-/*
- * __memp_trick --
- *	Trickle a single cache.
- */
-static int
-__memp_trick(dbenv, ncache, pct, nwrotep)
-	DB_ENV *dbenv;
-	int ncache, pct, *nwrotep;
-{
-	BH *bhp;
-	DB_MPOOL *dbmp;
-	MPOOL *c_mp;
-	MPOOLFILE *mfp;
-	db_pgno_t pgno;
-	u_long total;
-	int ret, wrote;
-
-	dbmp = dbenv->mp_handle;
-	c_mp = dbmp->reginfo[ncache].primary;
-
 	/*
-	 * If there are sufficient clean buffers, or no buffers or no dirty
+	 * If there are sufficient clean buffers, no buffers or no dirty
 	 * buffers, we're done.
 	 *
 	 * XXX
-	 * Using st_page_clean and st_page_dirty is our only choice at the
-	 * moment, but it's not as correct as we might like in the presence
-	 * of pools with more than one buffer size, as a free 512-byte buffer
-	 * isn't the same as a free 8K buffer.
+	 * Using hash_page_dirty is our only choice at the moment, but it's not
+	 * as correct as we might like in the presence of pools having more
+	 * than one page size, as a free 512B buffer isn't the same as a free
+	 * 8KB buffer.
+	 *
+	 * Loop through the caches counting total/dirty buffers.
 	 */
-loop:	total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty;
-	if (total == 0 || c_mp->stat.st_page_dirty == 0 ||
-	    (c_mp->stat.st_page_clean * 100) / total >= (u_long)pct)
-		return (0);
-
-	/* Loop until we write a buffer. */
-	for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
-	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
-		if (bhp->ref != 0 ||
-		    !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
-			continue;
-
-		mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
-
-		/*
-		 * We can't write to temporary files -- see the comment in
-		 * mp_bh.c:__memp_bhwrite().
-		 */
-		if (F_ISSET(mfp, MP_TEMP))
-			continue;
-
-		pgno = bhp->pgno;
-		if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
-			return (ret);
-
-		/*
-		 * Any process syncing the shared memory buffer pool had better
-		 * be able to write to any underlying file.  Be understanding,
-		 * but firm, on this point.
-		 */
-		if (!wrote) {
-			__db_err(dbenv, "%s: unable to flush page: %lu",
-			    __memp_fns(dbmp, mfp), (u_long)pgno);
-			return (EPERM);
-		}
-
-		++c_mp->stat.st_page_trickle;
-		if (nwrotep != NULL)
-			++*nwrotep;
-		goto loop;
+	for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
+		c_mp = dbmp->reginfo[i].primary;
+		total += c_mp->stat.st_pages;
+		__memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
+		dirty += dtmp;
 	}

-	return (0);
+	clean = total - dirty;
+	if (clean == total || (clean * 100) / total >= (u_long)pct)
+		return (0);
+
+	if (nwrotep == NULL)
+		nwrotep = &wrote;
+	ret = __memp_sync_int(dbenv, NULL,
+	    ((total * pct) / 100) - clean, DB_SYNC_TRICKLE, nwrotep);
+
+	mp->stat.st_page_trickle += *nwrotep;
+
+	return (ret);
 }