BDB 4.1.24

BitKeeper/deleted/.del-ex_access.wpj~3df6ae8c99bf7c5f:
  Delete: bdb/build_vxworks/ex_access/ex_access.wpj
BitKeeper/deleted/.del-ex_btrec.wpj~a7622f1c6f432dc6:
  Delete: bdb/build_vxworks/ex_btrec/ex_btrec.wpj
BitKeeper/deleted/.del-ex_dbclient.wpj~7345440f3b204cdd:
  Delete: bdb/build_vxworks/ex_dbclient/ex_dbclient.wpj
BitKeeper/deleted/.del-ex_env.wpj~fbe1ab10b04e8b74:
  Delete: bdb/build_vxworks/ex_env/ex_env.wpj
BitKeeper/deleted/.del-ex_mpool.wpj~4479cfd5c45f327d:
  Delete: bdb/build_vxworks/ex_mpool/ex_mpool.wpj
BitKeeper/deleted/.del-ex_tpcb.wpj~f78093006e14bf41:
  Delete: bdb/build_vxworks/ex_tpcb/ex_tpcb.wpj
BitKeeper/deleted/.del-db_buildall.dsp~bd749ff6da11682:
  Delete: bdb/build_win32/db_buildall.dsp
BitKeeper/deleted/.del-cxx_app.cpp~ad8df8e0791011ed:
  Delete: bdb/cxx/cxx_app.cpp
BitKeeper/deleted/.del-cxx_log.cpp~a50ff3118fe06952:
  Delete: bdb/cxx/cxx_log.cpp
BitKeeper/deleted/.del-cxx_table.cpp~ecd751e79b055556:
  Delete: bdb/cxx/cxx_table.cpp
BitKeeper/deleted/.del-namemap.txt~796a3acd3885d8fd:
  Delete: bdb/cxx/namemap.txt
BitKeeper/deleted/.del-Design.fileop~3ca4da68f1727373:
  Delete: bdb/db/Design.fileop
BitKeeper/deleted/.del-db185_int.h~61bee3736e7959ef:
  Delete: bdb/db185/db185_int.h
BitKeeper/deleted/.del-acconfig.h~411e8854d67ad8b5:
  Delete: bdb/dist/acconfig.h
BitKeeper/deleted/.del-mutex.m4~a13383cde18a64e1:
  Delete: bdb/dist/aclocal/mutex.m4
BitKeeper/deleted/.del-options.m4~b9d0ca637213750a:
  Delete: bdb/dist/aclocal/options.m4
BitKeeper/deleted/.del-programs.m4~3ce7890b47732b30:
  Delete: bdb/dist/aclocal/programs.m4
BitKeeper/deleted/.del-tcl.m4~f944e2db93c3b6db:
  Delete: bdb/dist/aclocal/tcl.m4
BitKeeper/deleted/.del-types.m4~59cae158c9a32cff:
  Delete: bdb/dist/aclocal/types.m4
BitKeeper/deleted/.del-script~d38f6d3a4f159cb4:
  Delete: bdb/dist/build/script
BitKeeper/deleted/.del-configure.in~ac795a92c8fe049c:
  Delete: bdb/dist/configure.in
BitKeeper/deleted/.del-ltconfig~66bbd007d8024af:
  Delete: bdb/dist/ltconfig
BitKeeper/deleted/.del-rec_ctemp~a28554362534f00a:
  Delete: bdb/dist/rec_ctemp
BitKeeper/deleted/.del-s_tcl~2ffe4326459fcd9f:
  Delete: bdb/dist/s_tcl
BitKeeper/deleted/.del-.IGNORE_ME~d8148b08fa7d5d15:
  Delete: bdb/dist/template/.IGNORE_ME
BitKeeper/deleted/.del-btree.h~179f2aefec1753d:
  Delete: bdb/include/btree.h
BitKeeper/deleted/.del-cxx_int.h~6b649c04766508f8:
  Delete: bdb/include/cxx_int.h
BitKeeper/deleted/.del-db.src~6b433ae615b16a8d:
  Delete: bdb/include/db.src
BitKeeper/deleted/.del-db_185.h~ad8b373d9391d35c:
  Delete: bdb/include/db_185.h
BitKeeper/deleted/.del-db_am.h~a714912b6b75932f:
  Delete: bdb/include/db_am.h
BitKeeper/deleted/.del-db_cxx.h~fcafadf45f5d19e9:
  Delete: bdb/include/db_cxx.h
BitKeeper/deleted/.del-db_dispatch.h~6844f20f7eb46904:
  Delete: bdb/include/db_dispatch.h
BitKeeper/deleted/.del-db_int.src~419a3f48b6a01da7:
  Delete: bdb/include/db_int.src
BitKeeper/deleted/.del-db_join.h~76f9747a42c3399a:
  Delete: bdb/include/db_join.h
BitKeeper/deleted/.del-db_page.h~e302ca3a4db3abdc:
  Delete: bdb/include/db_page.h
BitKeeper/deleted/.del-db_server_int.h~e1d20b6ba3bca1ab:
  Delete: bdb/include/db_server_int.h
BitKeeper/deleted/.del-db_shash.h~5fbf2d696fac90f3:
  Delete: bdb/include/db_shash.h
BitKeeper/deleted/.del-db_swap.h~1e60887550864a59:
  Delete: bdb/include/db_swap.h
BitKeeper/deleted/.del-db_upgrade.h~c644eee73701fc8d:
  Delete: bdb/include/db_upgrade.h
BitKeeper/deleted/.del-db_verify.h~b8d6c297c61f342e:
  Delete: bdb/include/db_verify.h
BitKeeper/deleted/.del-debug.h~dc2b4f2cf27ccebc:
  Delete: bdb/include/debug.h
BitKeeper/deleted/.del-hash.h~2aaa548b28882dfb:
  Delete: bdb/include/hash.h
BitKeeper/deleted/.del-lock.h~a761c1b7de57b77f:
  Delete: bdb/include/lock.h
BitKeeper/deleted/.del-log.h~ff20184238e35e4d:
  Delete: bdb/include/log.h
BitKeeper/deleted/.del-mp.h~7e317597622f3411:
  Delete: bdb/include/mp.h
BitKeeper/deleted/.del-mutex.h~d3ae7a2977a68137:
  Delete: bdb/include/mutex.h
BitKeeper/deleted/.del-os.h~91867cc8757cd0e3:
  Delete: bdb/include/os.h
BitKeeper/deleted/.del-os_jump.h~e1b939fa5151d4be:
  Delete: bdb/include/os_jump.h
BitKeeper/deleted/.del-qam.h~6fad0c1b5723d597:
  Delete: bdb/include/qam.h
BitKeeper/deleted/.del-queue.h~4c72c0826c123d5:
  Delete: bdb/include/queue.h
BitKeeper/deleted/.del-region.h~513fe04d977ca0fc:
  Delete: bdb/include/region.h
BitKeeper/deleted/.del-shqueue.h~525fc3e6c2025c36:
  Delete: bdb/include/shqueue.h
BitKeeper/deleted/.del-tcl_db.h~c536fd61a844f23f:
  Delete: bdb/include/tcl_db.h
BitKeeper/deleted/.del-txn.h~c8d94b221ec147e4:
  Delete: bdb/include/txn.h
BitKeeper/deleted/.del-xa.h~ecc466493aae9d9a:
  Delete: bdb/include/xa.h
BitKeeper/deleted/.del-DbRecoveryInit.java~756b52601a0b9023:
  Delete: bdb/java/src/com/sleepycat/db/DbRecoveryInit.java
BitKeeper/deleted/.del-DbTxnRecover.java~74607cba7ab89d6d:
  Delete: bdb/java/src/com/sleepycat/db/DbTxnRecover.java
BitKeeper/deleted/.del-lock_conflict.c~fc5e0f14cf597a2b:
  Delete: bdb/lock/lock_conflict.c
BitKeeper/deleted/.del-log.src~53ac9e7b5cb023f2:
  Delete: bdb/log/log.src
BitKeeper/deleted/.del-log_findckp.c~24287f008916e81f:
  Delete: bdb/log/log_findckp.c
BitKeeper/deleted/.del-log_rec.c~d51711f2cac09297:
  Delete: bdb/log/log_rec.c
BitKeeper/deleted/.del-log_register.c~b40bb4efac75ca15:
  Delete: bdb/log/log_register.c
BitKeeper/deleted/.del-Design~b3d0f179f2767b:
  Delete: bdb/mp/Design
BitKeeper/deleted/.del-os_finit.c~95dbefc6fe79b26c:
  Delete: bdb/os/os_finit.c
BitKeeper/deleted/.del-os_abs.c~df95d1e7db81924:
  Delete: bdb/os_vxworks/os_abs.c
BitKeeper/deleted/.del-os_finit.c~803b484bdb9d0122:
  Delete: bdb/os_vxworks/os_finit.c
BitKeeper/deleted/.del-os_map.c~3a6d7926398b76d3:
  Delete: bdb/os_vxworks/os_map.c
BitKeeper/deleted/.del-os_finit.c~19a227c6d3c78ad:
  Delete: bdb/os_win32/os_finit.c
BitKeeper/deleted/.del-log-corruption.patch~1cf2ecc7c6408d5d:
  Delete: bdb/patches/log-corruption.patch
BitKeeper/deleted/.del-Btree.pm~af6d0c5eaed4a98e:
  Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Btree.pm
BitKeeper/deleted/.del-BerkeleyDB.pm~7244036d4482643:
  Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pm
BitKeeper/deleted/.del-BerkeleyDB.pod~e7b18fd6132448e3:
  Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod
BitKeeper/deleted/.del-Hash.pm~10292a26c06a5c95:
  Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Hash.pm
BitKeeper/deleted/.del-BerkeleyDB.pod.P~79f76a1495eda203:
  Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod.P
BitKeeper/deleted/.del-BerkeleyDB.xs~80c99afbd98e392c:
  Delete: bdb/perl.BerkeleyDB/BerkeleyDB.xs
BitKeeper/deleted/.del-Changes~729c1891efa60de9:
  Delete: bdb/perl.BerkeleyDB/Changes
BitKeeper/deleted/.del-MANIFEST~63a1e34aecf157a0:
  Delete: bdb/perl.BerkeleyDB/MANIFEST
BitKeeper/deleted/.del-Makefile.PL~c68797707d8df87a:
  Delete: bdb/perl.BerkeleyDB/Makefile.PL
BitKeeper/deleted/.del-README~5f2f579b1a241407:
  Delete: bdb/perl.BerkeleyDB/README
BitKeeper/deleted/.del-Todo~dca3c66c193adda9:
  Delete: bdb/perl.BerkeleyDB/Todo
BitKeeper/deleted/.del-config.in~ae81681e450e0999:
  Delete: bdb/perl.BerkeleyDB/config.in
BitKeeper/deleted/.del-dbinfo~28ad67d83be4f68e:
  Delete: bdb/perl.BerkeleyDB/dbinfo
BitKeeper/deleted/.del-mkconsts~543ab60669c7a04e:
  Delete: bdb/perl.BerkeleyDB/mkconsts
BitKeeper/deleted/.del-mkpod~182c0ca54e439afb:
  Delete: bdb/perl.BerkeleyDB/mkpod
BitKeeper/deleted/.del-5.004~e008cb5a48805543:
  Delete: bdb/perl.BerkeleyDB/patches/5.004
BitKeeper/deleted/.del-irix_6_5.pl~61662bb08afcdec8:
  Delete: bdb/perl.BerkeleyDB/hints/irix_6_5.pl
BitKeeper/deleted/.del-solaris.pl~6771e7182394e152:
  Delete: bdb/perl.BerkeleyDB/hints/solaris.pl
BitKeeper/deleted/.del-typemap~783b8f5295b05f3d:
  Delete: bdb/perl.BerkeleyDB/typemap
BitKeeper/deleted/.del-5.004_01~6081ce2fff7b0bc:
  Delete: bdb/perl.BerkeleyDB/patches/5.004_01
BitKeeper/deleted/.del-5.004_02~87214eac35ad9e6:
  Delete: bdb/perl.BerkeleyDB/patches/5.004_02
BitKeeper/deleted/.del-5.004_03~9a672becec7cb40f:
  Delete: bdb/perl.BerkeleyDB/patches/5.004_03
BitKeeper/deleted/.del-5.004_04~e326cb51af09d154:
  Delete: bdb/perl.BerkeleyDB/patches/5.004_04
BitKeeper/deleted/.del-5.004_05~7ab457a1e41a92fe:
  Delete: bdb/perl.BerkeleyDB/patches/5.004_05
BitKeeper/deleted/.del-5.005~f9e2d59b5964cd4b:
  Delete: bdb/perl.BerkeleyDB/patches/5.005
BitKeeper/deleted/.del-5.005_01~3eb9fb7b5842ea8e:
  Delete: bdb/perl.BerkeleyDB/patches/5.005_01
BitKeeper/deleted/.del-5.005_02~67477ce0bef717cb:
  Delete: bdb/perl.BerkeleyDB/patches/5.005_02
BitKeeper/deleted/.del-5.005_03~c4c29a1fb21e290a:
  Delete: bdb/perl.BerkeleyDB/patches/5.005_03
BitKeeper/deleted/.del-5.6.0~e1fb9897d124ee22:
  Delete: bdb/perl.BerkeleyDB/patches/5.6.0
BitKeeper/deleted/.del-btree.t~e4a1a3c675ddc406:
  Delete: bdb/perl.BerkeleyDB/t/btree.t
BitKeeper/deleted/.del-db-3.0.t~d2c60991d84558f2:
  Delete: bdb/perl.BerkeleyDB/t/db-3.0.t
BitKeeper/deleted/.del-db-3.1.t~6ee88cd13f55e018:
  Delete: bdb/perl.BerkeleyDB/t/db-3.1.t
BitKeeper/deleted/.del-db-3.2.t~f73b6461f98fd1cf:
  Delete: bdb/perl.BerkeleyDB/t/db-3.2.t
BitKeeper/deleted/.del-destroy.t~cc6a2ae1980a2ecd:
  Delete: bdb/perl.BerkeleyDB/t/destroy.t
BitKeeper/deleted/.del-env.t~a8604a4499c4bd07:
  Delete: bdb/perl.BerkeleyDB/t/env.t
BitKeeper/deleted/.del-examples.t~2571b77c3cc75574:
  Delete: bdb/perl.BerkeleyDB/t/examples.t
BitKeeper/deleted/.del-examples.t.T~8228bdd75ac78b88:
  Delete: bdb/perl.BerkeleyDB/t/examples.t.T
BitKeeper/deleted/.del-examples3.t.T~66a186897a87026d:
  Delete: bdb/perl.BerkeleyDB/t/examples3.t.T
BitKeeper/deleted/.del-examples3.t~fe3822ba2f2d7f83:
  Delete: bdb/perl.BerkeleyDB/t/examples3.t
BitKeeper/deleted/.del-filter.t~f87b045c1b708637:
  Delete: bdb/perl.BerkeleyDB/t/filter.t
BitKeeper/deleted/.del-hash.t~616bfb4d644de3a3:
  Delete: bdb/perl.BerkeleyDB/t/hash.t
BitKeeper/deleted/.del-join.t~29fc39f74a83ca22:
  Delete: bdb/perl.BerkeleyDB/t/join.t
BitKeeper/deleted/.del-mldbm.t~31f5015341eea040:
  Delete: bdb/perl.BerkeleyDB/t/mldbm.t
BitKeeper/deleted/.del-queue.t~8f338034ce44a641:
  Delete: bdb/perl.BerkeleyDB/t/queue.t
BitKeeper/deleted/.del-recno.t~d4ddbd3743add63e:
  Delete: bdb/perl.BerkeleyDB/t/recno.t
BitKeeper/deleted/.del-strict.t~6885cdd2ea71ca2d:
  Delete: bdb/perl.BerkeleyDB/t/strict.t
BitKeeper/deleted/.del-subdb.t~aab62a5d5864c603:
  Delete: bdb/perl.BerkeleyDB/t/subdb.t
BitKeeper/deleted/.del-txn.t~65033b8558ae1216:
  Delete: bdb/perl.BerkeleyDB/t/txn.t
BitKeeper/deleted/.del-unknown.t~f3710458682665e1:
  Delete: bdb/perl.BerkeleyDB/t/unknown.t
BitKeeper/deleted/.del-Changes~436f74a5c414c65b:
  Delete: bdb/perl.DB_File/Changes
BitKeeper/deleted/.del-DB_File.pm~ae0951c6c7665a82:
  Delete: bdb/perl.DB_File/DB_File.pm
BitKeeper/deleted/.del-DB_File.xs~89e49a0b5556f1d8:
  Delete: bdb/perl.DB_File/DB_File.xs
BitKeeper/deleted/.del-DB_File_BS~290fad5dbbb87069:
  Delete: bdb/perl.DB_File/DB_File_BS
BitKeeper/deleted/.del-MANIFEST~90ee581572bdd4ac:
  Delete: bdb/perl.DB_File/MANIFEST
BitKeeper/deleted/.del-Makefile.PL~ac0567bb5a377e38:
  Delete: bdb/perl.DB_File/Makefile.PL
BitKeeper/deleted/.del-README~77e924a5a9bae6b3:
  Delete: bdb/perl.DB_File/README
BitKeeper/deleted/.del-config.in~ab4c2792b86a810b:
  Delete: bdb/perl.DB_File/config.in
BitKeeper/deleted/.del-dbinfo~461c43b30fab2cb:
  Delete: bdb/perl.DB_File/dbinfo
BitKeeper/deleted/.del-dynixptx.pl~50dcddfae25d17e9:
  Delete: bdb/perl.DB_File/hints/dynixptx.pl
BitKeeper/deleted/.del-typemap~55cffb3288a9e587:
  Delete: bdb/perl.DB_File/typemap
BitKeeper/deleted/.del-version.c~a4df0e646f8b3975:
  Delete: bdb/perl.DB_File/version.c
BitKeeper/deleted/.del-5.004_01~d6830d0082702af7:
  Delete: bdb/perl.DB_File/patches/5.004_01
BitKeeper/deleted/.del-5.004_02~78b082dc80c91031:
  Delete: bdb/perl.DB_File/patches/5.004_02
BitKeeper/deleted/.del-5.004~4411ec2e3c9e008b:
  Delete: bdb/perl.DB_File/patches/5.004
BitKeeper/deleted/.del-sco.pl~1e795fe14fe4dcfe:
  Delete: bdb/perl.DB_File/hints/sco.pl
BitKeeper/deleted/.del-5.004_03~33f274648b160d95:
  Delete: bdb/perl.DB_File/patches/5.004_03
BitKeeper/deleted/.del-5.004_04~8f3d1b3cf18bb20a:
  Delete: bdb/perl.DB_File/patches/5.004_04
BitKeeper/deleted/.del-5.004_05~9c0f02e7331e142:
  Delete: bdb/perl.DB_File/patches/5.004_05
BitKeeper/deleted/.del-5.005~c2108cb2e3c8d951:
  Delete: bdb/perl.DB_File/patches/5.005
BitKeeper/deleted/.del-5.005_01~3b45e9673afc4cfa:
  Delete: bdb/perl.DB_File/patches/5.005_01
BitKeeper/deleted/.del-5.005_02~9fe5766bb02a4522:
  Delete: bdb/perl.DB_File/patches/5.005_02
BitKeeper/deleted/.del-5.005_03~ffa1c38c19ae72ea:
  Delete: bdb/perl.DB_File/patches/5.005_03
BitKeeper/deleted/.del-5.6.0~373be3a5ce47be85:
  Delete: bdb/perl.DB_File/patches/5.6.0
BitKeeper/deleted/.del-db-btree.t~3231595a1c241eb3:
  Delete: bdb/perl.DB_File/t/db-btree.t
BitKeeper/deleted/.del-db-hash.t~7c4ad0c795c7fad2:
  Delete: bdb/perl.DB_File/t/db-hash.t
BitKeeper/deleted/.del-db-recno.t~6c2d3d80b9ba4a50:
  Delete: bdb/perl.DB_File/t/db-recno.t
BitKeeper/deleted/.del-db_server.sed~cdb00ebcd48a64e2:
  Delete: bdb/rpc_server/db_server.sed
BitKeeper/deleted/.del-db_server_proc.c~d46c8f409c3747f4:
  Delete: bdb/rpc_server/db_server_proc.c
BitKeeper/deleted/.del-db_server_svc.sed~3f5e59f334fa4607:
  Delete: bdb/rpc_server/db_server_svc.sed
BitKeeper/deleted/.del-db_server_util.c~a809f3a4629acda:
  Delete: bdb/rpc_server/db_server_util.c
BitKeeper/deleted/.del-log.tcl~ff1b41f1355b97d7:
  Delete: bdb/test/log.tcl
BitKeeper/deleted/.del-mpool.tcl~b0df4dc1b04db26c:
  Delete: bdb/test/mpool.tcl
BitKeeper/deleted/.del-mutex.tcl~52fd5c73a150565:
  Delete: bdb/test/mutex.tcl
BitKeeper/deleted/.del-txn.tcl~c4ff071550b5446e:
  Delete: bdb/test/txn.tcl
BitKeeper/deleted/.del-README~e800a12a5392010a:
  Delete: bdb/test/upgrade/README
BitKeeper/deleted/.del-pack-2.6.6.pl~89d5076d758d3e98:
  Delete: bdb/test/upgrade/generate-2.X/pack-2.6.6.pl
BitKeeper/deleted/.del-test-2.6.patch~4a52dc83d447547b:
  Delete: bdb/test/upgrade/generate-2.X/test-2.6.patch
This commit is contained in:
unknown 2002-10-30 15:57:05 +04:00
commit 155e78f014
1191 changed files with 170446 additions and 57453 deletions

View file

@ -1,52 +0,0 @@
$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
There are three ways we do locking in the mpool code:
Locking a handle mutex to provide concurrency for DB_THREAD operations.
Locking the region mutex to provide mutual exclusion while reading and
writing structures in the shared region.
Locking buffer header mutexes during I/O.
The first will not be further described here. We use the shared mpool
region lock to provide mutual exclusion while reading/modifying all of
the data structures, including the buffer headers. We use a per-buffer
header lock to wait on buffer I/O. The order of locking is as follows:
Searching for a buffer:
Acquire the region lock.
Find the buffer header.
Increment the reference count (guarantee the buffer stays).
While the BH_LOCKED flag is set (I/O is going on) {
Release the region lock.
Explicitly yield the processor if it's not the first pass
through this loop, otherwise, we can simply spin because
we'll be simply switching between the two locks.
Request the buffer lock.
The I/O will complete...
Acquire the buffer lock.
Release the buffer lock.
Acquire the region lock.
}
Return the buffer.
Reading/writing a buffer:
Acquire the region lock.
Find/create the buffer header.
If reading, increment the reference count (guarantee the buffer stays).
Set the BH_LOCKED flag.
Acquire the buffer lock (guaranteed not to block).
Release the region lock.
Do the I/O and/or initialize the buffer contents.
Release the buffer lock.
At this point, the buffer lock is available, but the logical
operation (flagged by BH_LOCKED) is not yet completed. For
this reason, among others, threads checking the BH_LOCKED flag
must loop around their test.
Acquire the region lock.
Clear the BH_LOCKED flag.
Release the region lock.
Return/discard the buffer.
Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
not reacquired when a region lock is reacquired because they couldn't
have been closed/discarded and because they never move in memory.

View file

@ -1,22 +1,31 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $";
static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <string.h>
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
typedef struct {
DB_MPOOL_HASH *bucket;
u_int32_t priority;
} HS;
static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
/*
* __memp_alloc --
@ -34,14 +43,32 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
roff_t *offsetp;
void *retp;
{
BH *bhp, *nbhp;
BH *bhp;
DB_ENV *dbenv;
DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
DB_MUTEX *mutexp;
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t total;
int nomore, restart, ret, wrote;
size_t freed_space;
u_int32_t buckets, buffers, high_priority, max_na, priority;
int aggressive, ret;
void *p;
dbenv = dbmp->dbenv;
c_mp = memreg->primary;
dbht = R_ADDR(memreg, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];
buckets = buffers = 0;
aggressive = 0;
c_mp->stat.st_alloc++;
/*
* Get aggressive if we've tried to flush 5 times the number of hash
* buckets in the system without finding space.
*/
max_na = 5 * c_mp->htab_buckets;
/*
* If we're allocating a buffer, and the one we're discarding is the
@ -53,100 +80,363 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
if (mfp != NULL)
len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
nomore = 0;
R_LOCK(dbenv, memreg);
/*
* On every buffer allocation we update the buffer generation number
* and check for wraparound.
*/
if (++c_mp->lru_count == UINT32_T_MAX)
__memp_reset_lru(dbenv, memreg, c_mp);
/*
* Anything newer than 1/10th of the buffer pool is ignored during
* allocation (unless allocation starts failing).
*/
DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
/*
* First we try to allocate from free memory. If that fails, scan the
* buffer pool to find buffers with low priorities. We consider small
* sets of hash buckets each time to limit the amount of work needing
* to be done. This approximates LRU, but not very well. We either
* find a buffer of the same size to use, or we will free 3 times what
* we need in the hopes it will coalesce into a contiguous chunk of the
* right size. In the latter case we branch back here and try again.
*/
alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
if (offsetp != NULL)
if (mfp != NULL)
c_mp->stat.st_pages++;
R_UNLOCK(dbenv, memreg);
found: if (offsetp != NULL)
*offsetp = R_OFFSET(memreg, p);
*(void **)retp = p;
/*
* Update the search statistics.
*
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
if (buckets != 0) {
if (buckets > c_mp->stat.st_alloc_max_buckets)
c_mp->stat.st_alloc_max_buckets = buckets;
c_mp->stat.st_alloc_buckets += buckets;
}
if (buffers != 0) {
if (buffers > c_mp->stat.st_alloc_max_pages)
c_mp->stat.st_alloc_max_pages = buffers;
c_mp->stat.st_alloc_pages += buffers;
}
return (0);
}
if (nomore) {
__db_err(dbmp->dbenv,
"Unable to allocate %lu bytes from mpool shared region: %s\n",
(u_long)len, db_strerror(ret));
return (ret);
}
retry: /* Find a buffer we can flush; pure LRU. */
restart = total = 0;
for (bhp =
SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
/*
* We re-attempt the allocation every time we've freed 3 times what
* we need. Reset our free-space counter.
*/
freed_space = 0;
/* Ignore pinned or locked (I/O in progress) buffers. */
if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
/*
* Walk the hash buckets and find the next two with potentially useful
* buffers. Free the buffer with the lowest priority from the buckets'
* chains.
*/
for (hp_tmp = NULL;;) {
/* Check for wrap around. */
hp = &dbht[c_mp->last_checked++];
if (hp >= hp_end) {
c_mp->last_checked = 0;
/*
* If we've gone through all of the hash buckets, try
* an allocation. If the cache is small, the old page
* size is small, and the new page size is large, we
* might have freed enough memory (but not 3 times the
* memory).
*/
goto alloc;
}
/*
* Skip empty buckets.
*
* We can check for empty buckets before locking as we
* only care if the pointer is zero or non-zero.
*/
if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
continue;
/*
* The failure mode is when there are too many buffers we can't
* write or there's not enough memory in the system. We don't
* have a metric for deciding if allocation has no possible way
* to succeed, so we don't ever fail, we assume memory will be
* available if we wait long enough.
*
* Get aggressive if we've tried to flush 5 times the number of
* hash buckets as are in the system -- it's possible we have
* been repeatedly trying to flush the same buffers, although
* it's unlikely. Aggressive means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
* b: sync the mpool to force out queue extent pages. While we
* might not have enough space for what we want and flushing
* is expensive, why not?
* c: sleep for a second -- hopefully someone else will run and
* free up some memory. Try to allocate memory too, in case
* the other thread returns its memory to the region.
* d: look at a buffer in every hash bucket rather than choose
* the more preferable of two.
*
* !!!
* This test ignores pathological cases like no buffers in the
* system -- that shouldn't be possible.
*/
if ((++buckets % max_na) == 0) {
aggressive = 1;
R_UNLOCK(dbenv, memreg);
(void)__memp_sync_int(
dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
(void)__os_sleep(dbenv, 1, 0);
R_LOCK(dbenv, memreg);
goto alloc;
}
if (!aggressive) {
/* Skip high priority buckets. */
if (hp->hash_priority > high_priority)
continue;
/*
* Find two buckets and select the one with the lowest
* priority. Performance testing shows that looking
* at two improves the LRUness and looking at more only
* does a little better.
*/
if (hp_tmp == NULL) {
hp_tmp = hp;
continue;
}
if (hp->hash_priority > hp_tmp->hash_priority)
hp = hp_tmp;
hp_tmp = NULL;
}
/* Remember the priority of the buffer we're looking for. */
priority = hp->hash_priority;
/* Unlock the region and lock the hash bucket. */
R_UNLOCK(dbenv, memreg);
mutexp = &hp->hash_mutex;
MUTEX_LOCK(dbenv, mutexp);
#ifdef DIAGNOSTIC
__memp_check_order(hp);
#endif
/*
* The lowest priority page is first in the bucket, as they are
* maintained in sorted order.
*
* The buffer may have been freed or its priority changed while
* we switched from the region lock to the hash lock. If so,
* we have to restart. We will still take the first buffer on
* the bucket's list, though, if it has a low enough priority.
*/
if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
bhp->ref != 0 || bhp->priority > priority)
goto next_hb;
buffers++;
/* Find the associated MPOOLFILE. */
bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
/* Write the page if it's dirty. */
/* If the page is dirty, pin it and write it. */
ret = 0;
if (F_ISSET(bhp, BH_DIRTY)) {
++bhp->ref;
if ((ret = __memp_bhwrite(dbmp,
bh_mfp, bhp, &restart, &wrote)) != 0)
return (ret);
ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
--bhp->ref;
/*
* Another process may have acquired this buffer and
* incremented the ref count after we wrote it.
*/
if (bhp->ref != 0)
goto retry;
/*
* If we wrote the page, continue and free the buffer.
* We don't have to rewalk the list to acquire the
* buffer because it was never available for any other
* process to modify it.
*
* If we didn't write the page, but we discarded and
* reacquired the region lock, restart the list walk.
*
* If we neither wrote the buffer nor discarded the
* region lock, continue down the buffer list.
*/
if (wrote)
if (ret == 0)
++c_mp->stat.st_rw_evict;
else {
if (restart)
goto retry;
continue;
}
} else
++c_mp->stat.st_ro_evict;
/*
* If a write fails for any reason, we can't proceed.
*
* We released the hash bucket lock while doing I/O, so another
* thread may have acquired this buffer and incremented the ref
* count after we wrote it, in which case we can't have it.
*
* If there's a write error, avoid selecting this buffer again
* by making it the bucket's least-desirable buffer.
*/
if (ret != 0 || bhp->ref != 0) {
if (ret != 0 && aggressive)
__memp_bad_buffer(hp);
goto next_hb;
}
/*
* Check to see if the buffer is the size we're looking for.
* If it is, simply reuse it.
* If so, we can simply reuse it. Else, free the buffer and
* its space and keep looking.
*/
if (mfp != NULL &&
mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
__memp_bhfree(dbmp, bhp, 0);
__memp_bhfree(dbmp, hp, bhp, 0);
if (offsetp != NULL)
*offsetp = R_OFFSET(memreg, bhp);
*(void **)retp = bhp;
return (0);
p = bhp;
goto found;
}
/* Note how much space we've freed, and free the buffer. */
total += __db_shsizeof(bhp);
__memp_bhfree(dbmp, bhp, 1);
freed_space += __db_shsizeof(bhp);
__memp_bhfree(dbmp, hp, bhp, 1);
/*
* Retry as soon as we've freed up sufficient space. If we
* have to coalesce of memory to satisfy the request, don't
* try until it's likely (possible?) that we'll succeed.
* Unlock this hash bucket and re-acquire the region lock. If
* we're reaching here as a result of calling memp_bhfree, the
* hash bucket lock has already been discarded.
*/
if (total >= 3 * len)
goto alloc;
if (0) {
next_hb: MUTEX_UNLOCK(dbenv, mutexp);
}
R_LOCK(dbenv, memreg);
/* Restart the walk if we discarded the region lock. */
if (restart)
goto retry;
/*
* Retry the allocation as soon as we've freed up sufficient
* space. We're likely to have to coalesce memory to satisfy
* the request, so don't try until it's likely (possible?)
* we'll succeed.
*/
if (freed_space >= 3 * len)
goto alloc;
}
nomore = 1;
goto alloc;
/* NOTREACHED */
}
/*
 * __memp_bad_buffer --
 *	Demote the buffer at the head of a hash bucket so that it becomes
 *	the bucket's least desirable eviction candidate.
 *
 *	The head buffer is unlinked, given the priority of the current last
 *	buffer on the chain (or keeps its own priority if it was the only
 *	buffer), re-linked at the tail, and the bucket's cached priority is
 *	refreshed from the new head.
 */
static void
__memp_bad_buffer(hp)
	DB_MPOOL_HASH *hp;
{
	BH *victim, *scan;
	u_int32_t worst;

	/* Unlink the buffer currently at the head of the chain. */
	victim = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	SH_TAILQ_REMOVE(&hp->hash_bucket, victim, hq, __bh);

	/*
	 * The chain is kept sorted by priority, so the last element carries
	 * the highest priority.  Walk to the end by hand, remembering the
	 * most recent priority seen; if the chain is now empty, the victim's
	 * own priority is retained.
	 *
	 * XXX
	 * SH_TAILQ_LAST would be the natural choice here, but the original
	 * author suspected that macro of being broken.
	 */
	worst = victim->priority;
	scan = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	while (scan != NULL) {
		worst = scan->priority;
		scan = SH_TAILQ_NEXT(scan, hq, __bh);
	}

	/* Make the victim just as bad as the tail and put it last. */
	victim->priority = worst;
	SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, victim, hq);

	/* The bucket's priority is that of whatever is now first. */
	scan = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	hp->hash_priority = scan->priority;
}
/*
 * __memp_reset_lru --
 *	Rebase the cache LRU counter and the priority of every buffer.
 *
 *	The region lock is dropped for the duration of the bucket walk and
 *	re-acquired before returning; each non-empty bucket is adjusted
 *	under its own hash mutex.
 */
static void
__memp_reset_lru(dbenv, memreg, c_mp)
	DB_ENV *dbenv;
	REGINFO *memreg;
	MPOOL *c_mp;
{
	BH *bhp;
	DB_MPOOL_HASH *hp;
	int bucket;

	/*
	 * Pull the counter back first, so all future allocations will
	 * start at the bottom.
	 */
	c_mp->lru_count -= MPOOL_BASE_DECREMENT;

	/* The walk below takes per-bucket mutexes; let go of the region. */
	R_UNLOCK(dbenv, memreg);

	/* Visit every hash bucket, lowering each buffer's priority. */
	hp = R_ADDR(memreg, c_mp->htab);
	bucket = 0;
	while (bucket < c_mp->htab_buckets) {
		/*
		 * Only non-empty buckets need work.  Peeking at the head
		 * pointer without the mutex is fine because we only care
		 * whether it is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) != NULL) {
			MUTEX_LOCK(dbenv, &hp->hash_mutex);
			bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
			while (bhp != NULL) {
				/*
				 * Leave alone buffers whose priority is
				 * UINT32_T_MAX, and those whose priority
				 * is not above the decrement.
				 */
				if (bhp->priority != UINT32_T_MAX) {
					if (bhp->priority >
					    MPOOL_BASE_DECREMENT)
						bhp->priority -=
						    MPOOL_BASE_DECREMENT;
				}
				bhp = SH_TAILQ_NEXT(bhp, hq, __bh);
			}
			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
		}
		++hp;
		++bucket;
	}

	/* Hand the region lock back to the caller. */
	R_LOCK(dbenv, memreg);
}
#ifdef DIAGNOSTIC
/*
 * __memp_check_order --
 *	Diagnostic check that a hash bucket chain is in non-decreasing
 *	priority order and that the bucket's cached priority matches its
 *	first buffer.
 *
 * PUBLIC: #ifdef DIAGNOSTIC
 * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
 * PUBLIC: #endif
 */
void
__memp_check_order(hp)
	DB_MPOOL_HASH *hp;
{
	BH *bhp;
	u_int32_t priority;

	/*
	 * The caller is expected to hold the hash bucket lock.
	 * An empty chain is trivially ordered.
	 */
	bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	if (bhp == NULL)
		return;

	/* The bucket's priority must mirror that of its head buffer. */
	DB_ASSERT(bhp->priority == hp->hash_priority);

	/* Each following buffer must be at least as high as the last. */
	priority = bhp->priority;
	while ((bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL) {
		DB_ASSERT(priority <= bhp->priority);
		priority = bhp->priority;
	}
}
#endif

View file

@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $";
static const char revid[] = "$Id: mp_bh.c,v 11.71 2002/09/04 19:06:45 margo Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@ -18,40 +18,41 @@ static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#include "log.h"
#include "db_page.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
#include "dbinc/log.h"
#include "dbinc/db_page.h"
static int __memp_pgwrite
__P((DB_MPOOL *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
/*
* __memp_bhwrite --
* Write the page associated with a given bucket header.
* Write the page associated with a given buffer header.
*
* PUBLIC: int __memp_bhwrite
* PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
* PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
* PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
*/
int
__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
MPOOLFILE *mfp;
BH *bhp;
int *restartp, *wrotep;
int open_extents;
{
DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
DB_MPREG *mpreg;
int incremented, ret;
int local_open, incremented, ret;
if (restartp != NULL)
*restartp = 0;
if (wrotep != NULL)
*wrotep = 0;
incremented = 0;
dbenv = dbmp->dbenv;
local_open = incremented = 0;
/*
* If the file has been removed or is a closed temporary file, Jump
* right ahead and pretend that we've found the file we want-- the
* If the file has been removed or is a closed temporary file, jump
* right ahead and pretend that we've found the file we want -- the
* page-write function knows how to handle the fact that we don't have
* (or need!) any real file descriptor information.
*/
@ -66,52 +67,60 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* If we find a descriptor on the file that's not open for writing, we
* try and upgrade it to make it writeable. If that fails, we're done.
*/
MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
if (dbmfp->mfp == mfp) {
if (F_ISSET(dbmfp, MP_READONLY) &&
__memp_upgrade(dbmp, dbmfp, mfp)) {
MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
return (0);
!F_ISSET(dbmfp, MP_UPGRADE) &&
(F_ISSET(dbmfp, MP_UPGRADE_FAIL) ||
__memp_upgrade(dbmp, dbmfp, mfp))) {
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (EPERM);
}
/*
* Increment the reference count -- see the comment in
* memp_fclose().
* __memp_fclose_int().
*/
++dbmfp->ref;
incremented = 1;
break;
}
MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (dbmfp != NULL)
goto found;
/*
* !!!
* It's the caller's choice if we're going to open extent files.
*/
if (!open_extents && F_ISSET(mfp, MP_EXTENT))
return (EPERM);
/*
* !!!
* Don't try to attach to temporary files. There are two problems in
* trying to do that. First, if we have different privileges than the
* process that "owns" the temporary file, we might create the backing
* disk file such that the owning process couldn't read/write its own
* buffers, e.g., memp_trickle() running as root creating a file owned
* buffers, e.g., memp_trickle running as root creating a file owned
* as root, mode 600. Second, if the temporary file has already been
* created, we don't have any way of finding out what its real name is,
* and, even if we did, it was already unlinked (so that it won't be
* left if the process dies horribly). This decision causes a problem,
* however: if the temporary file consumes the entire buffer cache,
* and the owner doesn't flush the buffers to disk, we could end up
* with resource starvation, and the memp_trickle() thread couldn't do
* with resource starvation, and the memp_trickle thread couldn't do
* anything about it. That's a pretty unlikely scenario, though.
*
* Note that we should never get here when the temporary file
* in question has already been closed in another process, in which
* case it should be marked MP_DEADFILE.
* Note we should never get here when the temporary file in question
* has already been closed in another process, in which case it should
* be marked MP_DEADFILE.
*/
if (F_ISSET(mfp, MP_TEMP)) {
DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE));
return (0);
}
if (F_ISSET(mfp, MP_TEMP))
return (EPERM);
/*
* It's not a page from a file we've opened. If the file requires
@ -120,14 +129,14 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* nothing we can do.
*/
if (mfp->ftype != 0) {
MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (mpreg = LIST_FIRST(&dbmp->dbregq);
mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
if (mpreg->ftype == mfp->ftype)
break;
MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (mpreg == NULL)
return (0);
return (EPERM);
}
/*
@ -138,17 +147,24 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* There's no negative cache, so we may repeatedly try and open files
* that we have previously tried (and failed) to open.
*/
if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
return (0);
found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
if (incremented) {
MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
--dbmfp->ref;
MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
return (ret);
if ((ret = __memp_fopen_int(dbmfp, mfp,
R_ADDR(dbmp->reginfo, mfp->path_off),
0, 0, mfp->stat.st_pagesize)) != 0) {
(void)dbmfp->close(dbmfp, 0);
return (ret);
}
local_open = 1;
found: ret = __memp_pgwrite(dbmp, dbmfp, hp, bhp);
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
if (incremented)
--dbmfp->ref;
else if (local_open)
F_SET(dbmfp, MP_FLUSH);
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (ret);
}
@ -157,11 +173,12 @@ found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
* __memp_pgread --
* Read a page from a file.
*
* PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
* PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MUTEX *, BH *, int));
*/
int
__memp_pgread(dbmfp, bhp, can_create)
__memp_pgread(dbmfp, mutexp, bhp, can_create)
DB_MPOOLFILE *dbmfp;
DB_MUTEX *mutexp;
BH *bhp;
int can_create;
{
@ -169,171 +186,129 @@ __memp_pgread(dbmfp, bhp, can_create)
DB_ENV *dbenv;
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
size_t len, pagesize;
size_t nr;
int created, ret;
size_t len, nr, pagesize;
int ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
mfp = dbmfp->mfp;
pagesize = mfp->stat.st_pagesize;
/* We should never be called with a dirty or a locked buffer. */
DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED));
/* Lock the buffer and swap the hash bucket lock for the buffer lock. */
F_SET(bhp, BH_LOCKED | BH_TRASH);
MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
R_UNLOCK(dbenv, dbmp->reginfo);
MUTEX_LOCK(dbenv, &bhp->mutex);
MUTEX_UNLOCK(dbenv, mutexp);
/*
* Temporary files may not yet have been created. We don't create
* them now, we create them when the pages have to be flushed.
*/
nr = 0;
if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
/*
* Ignore read errors if we have permission to create the page.
* Assume that the page doesn't exist, and that we'll create it
* when we write it out.
*
* XXX
* Theoretically, we could overwrite a page of data if it were
* possible for a file to be successfully opened for reading
* and then for the read to fail. Shouldn't ever happen, but
* it might be worth checking to see if the offset is past the
* known end-of-file.
*/
db_io.fhp = &dbmfp->fh;
if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
db_io.fhp = dbmfp->fhp;
db_io.mutexp = dbmfp->mutexp;
db_io.pagesize = db_io.bytes = pagesize;
db_io.pgno = bhp->pgno;
db_io.buf = bhp->buf;
ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr);
} else
ret = 0;
created = 0;
if (nr < pagesize) {
if (can_create)
created = 1;
else {
/*
* If we had a short read, ret may be 0. This may not
* be an error -- in particular DB recovery processing
* may request pages that have never been written to
* disk, in which case we won't find the page. So, the
* caller must know how to handle the error.
*/
if (ret == 0)
ret = EIO;
/*
* The page may not exist; if it doesn't, nr may well be 0,
* but we expect the underlying OS calls not to return an
* error code in this case.
*/
if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0)
goto err;
}
}
/*
* Clear any bytes we didn't read that need to be cleared. If we're
* running in diagnostic mode, smash any bytes on the page that are
* unknown quantities for the caller.
*/
if (nr != pagesize) {
if (nr < pagesize) {
/*
* Don't output error messages for short reads. In particular,
* DB recovery processing may request pages never written to
* disk or for which only some part has been written to disk,
* in which case we won't find the page. The caller must know
* how to handle the error.
*/
if (can_create == 0) {
ret = DB_PAGE_NOTFOUND;
goto err;
}
/* Clear any bytes that need to be cleared. */
len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
if (nr < len)
memset(bhp->buf + nr, 0, len - nr);
#ifdef DIAGNOSTIC
if (nr > len)
len = nr;
memset(bhp->buf, 0, len);
#if defined(DIAGNOSTIC) || defined(UMRW)
/*
* If we're running in diagnostic mode, corrupt any bytes on
* the page that are unknown quantities for the caller.
*/
if (len < pagesize)
memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
#endif
}
++mfp->stat.st_page_create;
} else
++mfp->stat.st_page_in;
/* Call any pgin function. */
ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
/* Unlock the buffer and reacquire the region lock. */
/* Unlock the buffer and reacquire the hash bucket lock. */
err: MUTEX_UNLOCK(dbenv, &bhp->mutex);
R_LOCK(dbenv, dbmp->reginfo);
MUTEX_LOCK(dbenv, mutexp);
/*
* If no errors occurred, the data is now valid, clear the BH_TRASH
* flag; regardless, clear the lock bit and let other threads proceed.
*/
F_CLR(bhp, BH_LOCKED);
if (ret == 0) {
if (ret == 0)
F_CLR(bhp, BH_TRASH);
/* Update the statistics. */
if (created)
++mfp->stat.st_page_create;
else
++mfp->stat.st_page_in;
}
return (ret);
}
/*
* __memp_pgwrite --
* Write a page to a file.
*
* PUBLIC: int __memp_pgwrite
* PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *));
*/
int
__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
static int
__memp_pgwrite(dbmp, dbmfp, hp, bhp)
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
DB_MPOOL_HASH *hp;
BH *bhp;
int *restartp, *wrotep;
{
DB_ENV *dbenv;
DB_IO db_io;
DB_LSN lsn;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
size_t nw;
int callpgin, dosync, ret, syncfail;
const char *fail;
int callpgin, ret;
dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
if (restartp != NULL)
*restartp = 0;
if (wrotep != NULL)
*wrotep = 0;
callpgin = 0;
callpgin = ret = 0;
/*
* Check the dirty bit -- this buffer may have been written since we
* decided to write it.
* We should never be called with a clean or trash buffer.
* The sync code does call us with already locked buffers.
*/
if (!F_ISSET(bhp, BH_DIRTY)) {
if (wrotep != NULL)
*wrotep = 1;
return (0);
}
MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
DB_ASSERT(F_ISSET(bhp, BH_DIRTY));
DB_ASSERT(!F_ISSET(bhp, BH_TRASH));
/*
* If there were two writers, we may have just been waiting while the
* other writer completed I/O on this buffer. Check the dirty bit one
* more time.
* If we have not already traded the hash bucket lock for the buffer
* lock, do so now.
*/
if (!F_ISSET(bhp, BH_DIRTY)) {
MUTEX_UNLOCK(dbenv, &bhp->mutex);
if (wrotep != NULL)
*wrotep = 1;
return (0);
if (!F_ISSET(bhp, BH_LOCKED)) {
F_SET(bhp, BH_LOCKED);
MUTEX_LOCK(dbenv, &bhp->mutex);
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
}
F_SET(bhp, BH_LOCKED);
R_UNLOCK(dbenv, dbmp->reginfo);
if (restartp != NULL)
*restartp = 1;
/*
* It's possible that the underlying file doesn't exist, either
* because of an outright removal or because it was a temporary
@ -347,155 +322,122 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
goto file_dead;
/*
* Ensure the appropriate log records are on disk. If the page is
* being written as part of a sync operation, the flush has already
* been done, unless it was written by the application *after* the
* sync was scheduled.
* If the page is in a file for which we have LSN information, we have
* to ensure the appropriate log records are on disk.
*/
if (LOGGING_ON(dbenv) &&
(!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) {
if (LOGGING_ON(dbenv) && mfp->lsn_off != -1) {
memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
if ((ret = log_flush(dbenv, &lsn)) != 0)
if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0)
goto err;
}
DB_ASSERT(!LOGGING_ON(dbenv) ||
log_compare(&((LOG *)((DB_LOG *)
dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0);
#ifdef DIAGNOSTIC
/*
* Verify write-ahead logging semantics.
*
* !!!
* One special case. There is a single field on the meta-data page,
* the last-page-number-in-the-file field, for which we do not log
* changes. If the page was originally created in a database that
* didn't have logging turned on, we can see a page marked dirty but
* for which no corresponding log record has been written. However,
* the only way that a page can be created for which there isn't a
* previous log record and valid LSN is when the page was created
* without logging turned on, and so we check for that special-case
* LSN value.
*/
if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf))) {
/*
* There is a potential race here. If we are in the midst of
* switching log files, it's possible we could test against the
* old file and the new offset in the log region's LSN. If we
* fail the first test, acquire the log mutex and check again.
*/
DB_LOG *dblp;
LOG *lp;
dblp = dbenv->lg_handle;
lp = dblp->reginfo.primary;
if (!IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
R_LOCK(dbenv, &dblp->reginfo);
DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0);
R_UNLOCK(dbenv, &dblp->reginfo);
}
}
#endif
/*
* Call any pgout function. We set the callpgin flag to record
* that the contents of the buffer will need to be passed through pgin
* before they are reused.
*/
if (mfp->ftype == 0)
ret = 0;
else {
if (mfp->ftype != 0) {
callpgin = 1;
if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
goto err;
}
/* Temporary files may not yet have been created. */
if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) &&
((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
&dbmfp->fh, NULL)) != 0 ||
!F_ISSET(&dbmfp->fh, DB_FH_VALID))) {
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
ret = F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 :
__db_appname(dbenv, DB_APP_TMP, NULL,
F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? DB_OSO_DIRECT : 0,
dbmfp->fhp, NULL);
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (ret != 0) {
__db_err(dbenv,
"unable to create temporary backing file");
goto err;
}
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
}
/* Write the page. */
db_io.fhp = &dbmfp->fh;
db_io.fhp = dbmfp->fhp;
db_io.mutexp = dbmfp->mutexp;
db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
db_io.pgno = bhp->pgno;
db_io.buf = bhp->buf;
if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
ret = __db_panic(dbenv, ret);
fail = "write";
goto syserr;
}
if (nw != mfp->stat.st_pagesize) {
ret = EIO;
fail = "write";
goto syserr;
__db_err(dbenv, "%s: write failed for page %lu",
__memp_fn(dbmfp), (u_long)bhp->pgno);
goto err;
}
++mfp->stat.st_page_out;
err:
file_dead:
/*
* !!!
* Once we pass this point, dbmfp and mfp may be NULL, we may not have
* a valid file reference.
*
* Unlock the buffer and reacquire the region lock.
* Unlock the buffer and reacquire the hash lock.
*/
MUTEX_UNLOCK(dbenv, &bhp->mutex);
R_LOCK(dbenv, dbmp->reginfo);
MUTEX_LOCK(dbenv, &hp->hash_mutex);
/*
* Clean up the flags based on a successful write.
*
* If we rewrote the page, it will need processing by the pgin
* routine before reuse.
*/
if (callpgin)
F_SET(bhp, BH_CALLPGIN);
F_CLR(bhp, BH_DIRTY | BH_LOCKED);
/*
* If we write a buffer for which a checkpoint is waiting, update
* the count of pending buffers (both in the mpool as a whole and
* for this file). If the count for this file goes to zero, set a
* flag so we flush the writes.
* Update the hash bucket statistics, reset the flags.
* If we were successful, the page is no longer dirty.
*/
dosync = 0;
if (F_ISSET(bhp, BH_SYNC)) {
F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
if (ret == 0) {
DB_ASSERT(hp->hash_page_dirty != 0);
--hp->hash_page_dirty;
--mp->lsn_cnt;
if (mfp != NULL)
dosync = --mfp->lsn_cnt == 0 ? 1 : 0;
F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
}
/* Update the page clean/dirty statistics. */
c_mp = BH_TO_CACHE(dbmp, bhp);
++c_mp->stat.st_page_clean;
--c_mp->stat.st_page_dirty;
/* Update I/O statistics. */
if (mfp != NULL)
++mfp->stat.st_page_out;
/*
* Do the sync after everything else has been updated, so any incoming
* checkpoint doesn't see inconsistent information.
*
* XXX:
* Don't lock the region around the sync, fsync(2) has no atomicity
* issues.
*
* XXX:
* We ignore errors from the sync -- it makes no sense to return an
* error to the calling process, so set a flag causing the checkpoint
* to be retried later. There is a possibility, of course, that a
* subsequent checkpoint was started and that we're going to force it
* to fail. That should be unlikely, and fixing it would be difficult.
*/
if (dosync) {
R_UNLOCK(dbenv, dbmp->reginfo);
syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0;
R_LOCK(dbenv, dbmp->reginfo);
if (syncfail)
F_SET(mp, MP_LSN_RETRY);
}
if (wrotep != NULL)
*wrotep = 1;
return (0);
syserr: __db_err(dbenv, "%s: %s failed for page %lu",
__memp_fn(dbmfp), fail, (u_long)bhp->pgno);
err: /* Unlock the buffer and reacquire the region lock. */
MUTEX_UNLOCK(dbenv, &bhp->mutex);
R_LOCK(dbenv, dbmp->reginfo);
/*
* Clean up the flags based on a failure.
*
* The page remains dirty but we remove our lock. If we rewrote the
* page, it will need processing by the pgin routine before reuse.
*/
if (callpgin)
F_SET(bhp, BH_CALLPGIN);
/* Regardless, clear any sync wait-for count and remove our lock. */
bhp->ref_sync = 0;
F_CLR(bhp, BH_LOCKED);
return (ret);
@ -514,15 +456,17 @@ __memp_pg(dbmfp, bhp, is_pgin)
int is_pgin;
{
DBT dbt, *dbtp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
DB_MPREG *mpreg;
MPOOLFILE *mfp;
int ftype, ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
mfp = dbmfp->mfp;
MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
ftype = mfp->ftype;
for (mpreg = LIST_FIRST(&dbmp->dbregq);
@ -536,28 +480,28 @@ __memp_pg(dbmfp, bhp, is_pgin)
dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off);
dbtp = &dbt;
}
MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (is_pgin) {
if (mpreg->pgin != NULL &&
(ret = mpreg->pgin(dbmp->dbenv,
(ret = mpreg->pgin(dbenv,
bhp->pgno, bhp->buf, dbtp)) != 0)
goto err;
} else
if (mpreg->pgout != NULL &&
(ret = mpreg->pgout(dbmp->dbenv,
(ret = mpreg->pgout(dbenv,
bhp->pgno, bhp->buf, dbtp)) != 0)
goto err;
break;
}
if (mpreg == NULL)
MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (0);
err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
__db_err(dbmp->dbenv, "%s: %s failed for page %lu",
err: MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
__db_err(dbenv, "%s: %s failed for page %lu",
__memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
return (ret);
}
@ -566,55 +510,78 @@ err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
* __memp_bhfree --
* Free a bucket header and its referenced data.
*
* PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int));
* PUBLIC: void __memp_bhfree __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, int));
*/
void
__memp_bhfree(dbmp, bhp, free_mem)
__memp_bhfree(dbmp, hp, bhp, free_mem)
DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
BH *bhp;
int free_mem;
{
DB_HASHTAB *dbht;
DB_ENV *dbenv;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
int n_bucket, n_cache;
u_int32_t n_cache;
/*
* Assumes the hash bucket is locked and the MPOOL is not.
*/
dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
c_mp = BH_TO_CACHE(dbmp, bhp);
n_cache = NCACHE(mp, bhp->pgno);
n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno);
dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno);
/* Delete the buffer header from the hash bucket queue. */
SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh);
/*
* Delete the buffer header from the hash bucket queue and reset
* the hash bucket's priority, if necessary.
*/
SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
if (bhp->priority == hp->hash_priority)
hp->hash_priority =
SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ?
0 : SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
/* Delete the buffer header from the LRU queue. */
SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
/*
* Discard the hash bucket's mutex, it's no longer needed, and
* we don't want to be holding it when acquiring other locks.
*/
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
/* Clear the mutex this buffer recorded */
__db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
(REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
/*
* Find the underlying MPOOLFILE and decrement its reference count.
* If this is its last reference, remove it.
*/
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
MUTEX_LOCK(dbenv, &mfp->mutex);
if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
__memp_mf_discard(dbmp, mfp);
else
MUTEX_UNLOCK(dbenv, &mfp->mutex);
R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
/*
* If we're not reusing it immediately, free the buffer header
* Clear the mutex this buffer recorded; requires the region lock
* be held.
*/
__db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
(REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
/*
* If we're not reusing the buffer immediately, free the buffer header
* and data for real.
*/
if (free_mem) {
--c_mp->stat.st_page_clean;
__db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp);
c_mp = dbmp->reginfo[n_cache].primary;
c_mp->stat.st_pages--;
}
R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
}
/*
* __memp_upgrade --
* Upgrade a file descriptor from readonly to readwrite.
* Upgrade a file descriptor from read-only to read-write.
*/
static int
__memp_upgrade(dbmp, dbmfp, mfp)
@ -622,41 +589,58 @@ __memp_upgrade(dbmp, dbmfp, mfp)
DB_MPOOLFILE *dbmfp;
MPOOLFILE *mfp;
{
DB_FH fh;
DB_ENV *dbenv;
DB_FH *fhp, *tfhp;
int ret;
char *rpath;
/*
* !!!
* We expect the handle to already be locked.
*/
/* Check to see if we've already upgraded. */
if (F_ISSET(dbmfp, MP_UPGRADE))
return (0);
/* Check to see if we've already failed. */
if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
return (1);
dbenv = dbmp->dbenv;
fhp = NULL;
rpath = NULL;
/*
* Calculate the real name for this file and try to open it read/write.
* We know we have a valid pathname for the file because it's the only
* way we could have gotten a file descriptor of any kind.
*/
if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA,
NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
return (ret);
if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) {
if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0)
goto err;
if ((ret = __db_appname(dbenv, DB_APP_DATA,
R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
goto err;
if (__os_open(dbenv, rpath,
F_ISSET(mfp, MP_DIRECT) ? DB_OSO_DIRECT : 0, 0, fhp) != 0) {
F_SET(dbmfp, MP_UPGRADE_FAIL);
ret = 1;
} else {
/* Swap the descriptors and set the upgrade flag. */
(void)__os_closehandle(&dbmfp->fh);
dbmfp->fh = fh;
F_SET(dbmfp, MP_UPGRADE);
ret = 0;
goto err;
}
__os_freestr(rpath);
/*
* Swap the descriptors and set the upgrade flag.
*
* XXX
* There is a race here. If another process schedules a read using the
* existing file descriptor and is swapped out before making the system
* call, this code could theoretically close the file descriptor out
* from under it. While it's very unlikely, this code should still be
* rewritten.
*/
tfhp = dbmfp->fhp;
dbmfp->fhp = fhp;
fhp = tfhp;
(void)__os_closehandle(dbenv, fhp);
F_SET(dbmfp, MP_UPGRADE);
ret = 0;
if (0) {
err: ret = 1;
}
if (fhp != NULL)
__os_free(dbenv, fhp);
if (rpath != NULL)
__os_free(dbenv, rpath);
return (ret);
}

View file

@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $";
static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex
#include <string.h>
#endif
#ifdef HAVE_RPC
#include "db_server.h"
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#ifdef HAVE_FILESYSTEM_NOTZERO
static int __memp_fs_notzero
__P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
#endif
/*
* memp_fget --
* __memp_fget --
* Get a page from the file.
*
* PUBLIC: int __memp_fget
* PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
*/
int
memp_fget(dbmfp, pgnoaddr, flags, addrp)
__memp_fget(dbmfp, pgnoaddr, flags, addrp)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
u_int32_t flags;
void *addrp;
{
BH *bhp;
enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
BH *alloc_bhp, *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
DB_HASHTAB *dbht;
DB_MPOOL_HASH *hp;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
size_t n_bucket, n_cache, mf_offset;
u_int32_t st_hsearch;
int b_incr, first, ret;
roff_t mf_offset;
u_int32_t n_cache, st_hsearch;
int b_incr, extending, first, ret;
*(void **)addrp = NULL;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
mfp = dbmfp->mfp;
#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp));
#endif
PANIC_CHECK(dbenv);
mp = dbmp->reginfo[0].primary;
mfp = dbmfp->mfp;
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
alloc_bhp = bhp = NULL;
hp = NULL;
b_incr = extending = ret = 0;
/*
* Validate arguments.
*
@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* is to keep database files small. It's sleazy as hell, but we catch
* any attempt to actually write the file in memp_fput().
*/
#define OKFLAGS \
(DB_MPOOL_CREATE | DB_MPOOL_LAST | \
DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT)
#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
if (flags != 0) {
if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
switch (flags & ~DB_MPOOL_EXTENT) {
switch (flags) {
case DB_MPOOL_CREATE:
break;
case DB_MPOOL_LAST:
/* Get the last page number in the file. */
if (flags == DB_MPOOL_LAST) {
R_LOCK(dbenv, dbmp->reginfo);
*pgnoaddr = mfp->last_pgno;
R_UNLOCK(dbenv, dbmp->reginfo);
}
break;
case DB_MPOOL_NEW:
case DB_MPOOL_NEW_GROUP:
case 0:
/*
* If always creating a page, skip the first search
* of the hash bucket.
*/
if (flags == DB_MPOOL_NEW)
goto alloc;
break;
default:
return (__db_ferr(dbenv, "memp_fget", 1));
}
}
#ifdef DIAGNOSTIC
/*
* XXX
* We want to switch threads as often as possible. Yield every time
* we get a new page to ensure contention.
*/
if (DB_GLOBAL(db_pageyield))
__os_yield(dbenv, 1);
#endif
/* Initialize remaining local variables. */
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
bhp = NULL;
st_hsearch = 0;
b_incr = ret = 0;
R_LOCK(dbenv, dbmp->reginfo);
/*
* Check for the new, last or last + 1 page requests.
*
* Examine and update the file's last_pgno value. We don't care if
* the last_pgno value immediately changes due to another thread --
* at this instant in time, the value is correct. We do increment the
* current last_pgno value if the thread is asking for a new page,
* however, to ensure that two threads creating pages don't get the
* same one.
*
* If we create a page, there is the potential that a page after it
* in the file will be written before it will be written. Recovery
* depends on pages that are "created" in the file by subsequent pages
* being written be zeroed out, not have random garbage. Ensure that
* the OS agrees.
*
* !!!
* DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs
* to allocate contiguous groups of pages in order to do subdatabases.
* We return the first page in the group, but the caller must put an
* LSN on the *last* page and write it, otherwise after a crash we may
* not create all of the pages we need to create.
*/
if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
if (LF_ISSET(DB_MPOOL_NEW)) {
if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
__os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
1, mfp->stat.st_pagesize)) != 0) {
R_UNLOCK(dbenv, dbmp->reginfo);
return (ret);
}
++mfp->last_pgno;
}
if (LF_ISSET(DB_MPOOL_NEW_GROUP)) {
if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
__os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
(int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) {
R_UNLOCK(dbenv, dbmp->reginfo);
return (ret);
}
mfp->last_pgno += *pgnoaddr;
}
*pgnoaddr = mfp->last_pgno;
}
/*
* Determine the hash bucket where this page will live, and get local
* pointers to the cache and its hash table.
*/
n_cache = NCACHE(mp, *pgnoaddr);
c_mp = dbmp->reginfo[n_cache].primary;
n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr);
dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP))
goto alloc;
/*
* If mmap'ing the file and the page is not past the end of the file,
* just return a pointer.
@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* goes through the cache. All pages previously returned will be safe,
* as long as the correct locking protocol was observed.
*
* XXX
* We don't discard the map because we don't know when all of the
* pages will have been discarded from the process' address space.
* It would be possible to do so by reference counting the open
* pages from the mmap, but it's unclear to me that it's worth it.
*/
if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
if (*pgnoaddr > mfp->orig_last_pgno) {
/*
* !!!
* See the comment above about non-existent pages and
* the hash access method.
*/
if (!LF_ISSET(DB_MPOOL_CREATE)) {
if (!LF_ISSET(DB_MPOOL_EXTENT))
__db_err(dbenv,
"%s: page %lu doesn't exist",
__memp_fn(dbmfp), (u_long)*pgnoaddr);
ret = EINVAL;
goto err;
}
} else {
*(void **)addrp =
R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
++mfp->stat.st_map;
goto done;
}
if (dbmfp->addr != NULL &&
F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
*(void **)addrp =
R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
++mfp->stat.st_map;
return (0);
}
hb_search:
/*
* Determine the cache and hash bucket where this page lives and get
* local pointers to them. Reset on each pass through this code, the
* page number can change.
*/
n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
c_mp = dbmp->reginfo[n_cache].primary;
hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
/* Search the hash chain for the page. */
for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh);
retry: st_hsearch = 0;
MUTEX_LOCK(dbenv, &hp->hash_mutex);
for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
++st_hsearch;
if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
continue;
/* Increment the reference count. */
/*
* Increment the reference count. We may discard the hash
* bucket lock as we evaluate and/or read the buffer, so we
* need to ensure it doesn't move and its contents remain
* unchanged.
*/
if (bhp->ref == UINT16_T_MAX) {
__db_err(dbenv,
"%s: page %lu: reference count overflow",
__memp_fn(dbmfp), (u_long)bhp->pgno);
ret = EINVAL;
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
goto err;
}
/*
* Increment the reference count. We may discard the region
* lock as we evaluate and/or read the buffer, so we need to
* ensure that it doesn't move and that its contents remain
* unchanged.
*/
++bhp->ref;
b_incr = 1;
/*
* Any buffer we find might be trouble.
*
* BH_LOCKED --
* I/O is in progress. Because we've incremented the buffer
* reference count, we know the buffer can't move. Unlock
* the region lock, wait for the I/O to complete, and reacquire
* the region.
* I/O is in progress or sync is waiting on the buffer to write
* it. Because we've incremented the buffer reference count,
* we know the buffer can't move. Unlock the bucket lock, wait
* for the buffer to become available, reacquire the bucket.
*/
for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
R_UNLOCK(dbenv, dbmp->reginfo);
for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
!F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
/*
* Explicitly yield the processor if it's not the first
* pass through this loop -- if we don't, we might end
* up running to the end of our CPU quantum as we will
* simply be swapping between the two locks.
* If someone is trying to sync this buffer and the
* buffer is hot, they may never get in. Give up
* and try again.
*/
if (!first && bhp->ref_sync != 0) {
--bhp->ref;
b_incr = 0;
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
__os_yield(dbenv, 1);
goto retry;
}
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
/*
* Explicitly yield the processor if not the first pass
* through this loop -- if we don't, we might run to the
* end of our CPU quantum as we will simply be swapping
* between the two locks.
*/
if (!first)
__os_yield(dbenv, 1);
MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
MUTEX_LOCK(dbenv, &bhp->mutex);
/* Wait for I/O to finish... */
MUTEX_UNLOCK(dbenv, &bhp->mutex);
R_LOCK(dbenv, dbmp->reginfo);
}
/*
* BH_TRASH --
* The contents of the buffer are garbage. Shouldn't happen,
* and this read is likely to fail, but might as well try.
*/
if (F_ISSET(bhp, BH_TRASH))
goto reread;
/*
* BH_CALLPGIN --
* The buffer was converted so it could be written, and the
* contents need to be converted again.
*/
if (F_ISSET(bhp, BH_CALLPGIN)) {
if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
goto err;
F_CLR(bhp, BH_CALLPGIN);
MUTEX_LOCK(dbenv, &hp->hash_mutex);
}
++mfp->stat.st_cache_hit;
*(void **)addrp = bhp->buf;
goto done;
}
alloc: /* Allocate new buffer header and data space. */
if ((ret = __memp_alloc(dbmp,
&dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0)
goto err;
++c_mp->stat.st_page_clean;
/*
* Initialize the BH fields so that we can call the __memp_bhfree
* routine if an error occurs.
*/
memset(bhp, 0, sizeof(BH));
bhp->ref = 1;
bhp->pgno = *pgnoaddr;
bhp->mf_offset = mf_offset;
/* Increment the count of buffers referenced by this MPOOLFILE. */
++mfp->block_cnt;
/*
* Prepend the bucket header to the head of the appropriate MPOOL
* bucket hash list. Append the bucket header to the tail of the
* MPOOL LRU chain.
*/
SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh);
SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
#ifdef DIAGNOSTIC
if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) {
__db_err(dbenv, "Internal error: BH data NOT size_t aligned.");
ret = EINVAL;
__memp_bhfree(dbmp, bhp, 1);
goto err;
}
#endif
if ((ret = __db_shmutex_init(dbenv, &bhp->mutex,
R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL,
0, &dbmp->reginfo[n_cache],
(REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off)))
!= 0) {
__memp_bhfree(dbmp, bhp, 1);
goto err;
break;
}
/*
* If we created the page, zero it out and continue.
*
* !!!
* Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
* If DB_MPOOL_CREATE is used, then the application's pgin function
* has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
* it can detect all of its page creates, and not bother.
*
* If we're running in diagnostic mode, smash any bytes on the
* page that are unknown quantities for the caller.
*
* Otherwise, read the page into memory, optionally creating it if
* DB_MPOOL_CREATE is set.
* Update the hash bucket search statistics -- do now because our next
* search may be for a different bucket.
*/
if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
if (mfp->clear_len == 0)
memset(bhp->buf, 0, mfp->stat.st_pagesize);
else {
memset(bhp->buf, 0, mfp->clear_len);
#ifdef DIAGNOSTIC
memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
mfp->stat.st_pagesize - mfp->clear_len);
#endif
}
++c_mp->stat.st_hash_searches;
if (st_hsearch > c_mp->stat.st_hash_longest)
c_mp->stat.st_hash_longest = st_hsearch;
c_mp->stat.st_hash_examined += st_hsearch;
++mfp->stat.st_page_create;
} else {
/*
* There are 4 possible paths to this location:
*
* FIRST_MISS:
* Didn't find the page in the hash bucket on our first pass:
* bhp == NULL, alloc_bhp == NULL
*
* FIRST_FOUND:
* Found the page in the hash bucket on our first pass:
* bhp != NULL, alloc_bhp == NULL
*
* SECOND_FOUND:
* Didn't find the page in the hash bucket on the first pass,
* allocated space, and found the page in the hash bucket on
* our second pass:
* bhp != NULL, alloc_bhp != NULL
*
* SECOND_MISS:
* Didn't find the page in the hash bucket on the first pass,
* allocated space, and didn't find the page in the hash bucket
* on our second pass:
* bhp == NULL, alloc_bhp != NULL
*/
state = bhp == NULL ?
(alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
(alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
switch (state) {
case FIRST_FOUND:
/* We found the buffer in our first check -- we're done. */
break;
case FIRST_MISS:
/*
* It's possible for the read function to fail, which means
* that we fail as well. Note, the __memp_pgread() function
* discards the region lock, so the buffer must be pinned
* down so that it cannot move and its contents are unchanged.
* We didn't find the buffer in our first check. Figure out
* if the page exists, and allocate structures so we can add
* the page to the buffer pool.
*/
reread: if ((ret = __memp_pgread(dbmfp,
bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) {
/*
* !!!
* Discard the buffer unless another thread is waiting
* on our I/O to complete. Regardless, the header has
* the BH_TRASH flag set.
*/
if (bhp->ref == 1)
__memp_bhfree(dbmp, bhp, 1);
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
alloc: /*
* If DB_MPOOL_NEW is set, we have to allocate a page number.
* If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
* it's an error to try and get a page past the end of file.
*/
COMPQUIET(n_cache, 0);
extending = ret = 0;
R_LOCK(dbenv, dbmp->reginfo);
switch (flags) {
case DB_MPOOL_NEW:
extending = 1;
*pgnoaddr = mfp->last_pgno + 1;
break;
case DB_MPOOL_CREATE:
extending = *pgnoaddr > mfp->last_pgno;
break;
default:
ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
break;
}
R_UNLOCK(dbenv, dbmp->reginfo);
if (ret != 0)
goto err;
/*
* !!!
* In the DB_MPOOL_NEW code path, mf_offset and n_cache have
* not yet been initialized.
*/
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
/* Allocate a new buffer header and data space. */
if ((ret = __memp_alloc(dbmp,
&dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
goto err;
#ifdef DIAGNOSTIC
if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
__db_err(dbenv,
"Error: buffer data is NOT size_t aligned");
ret = EINVAL;
goto err;
}
#endif
/*
* If we are extending the file, we'll need the region lock
* again.
*/
if (extending)
R_LOCK(dbenv, dbmp->reginfo);
++mfp->stat.st_cache_miss;
/*
* DB_MPOOL_NEW does not guarantee you a page unreferenced by
* any other thread of control. (That guarantee is interesting
* for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
* did not specify the page number, and so, may reasonably not
* have any way to lock the page outside of mpool.) Regardless,
* if we allocate the page, and some other thread of control
* requests the page by number, we will not detect that and the
* thread of control that allocated using DB_MPOOL_NEW may not
* have a chance to initialize the page. (Note: we *could*
* detect this case if we set a flag in the buffer header which
* guaranteed that no gets of the page would succeed until the
* reference count went to 0, that is, until the creating page
* put the page.) What we do guarantee is that if two threads
* of control are both doing DB_MPOOL_NEW calls, they won't
* collide, that is, they won't both get the same page.
*
* There's a possibility that another thread allocated the page
* we were planning to allocate while we were off doing buffer
* allocation. We can detect that by making sure the page number
* we were going to use is still available. If it's not, then
* we check to see if the next available page number hashes to
* the same mpool region as the old one -- if it does, we can
* continue, otherwise, we have to start over.
*/
if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
*pgnoaddr = mfp->last_pgno + 1;
if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
__db_shalloc_free(
dbmp->reginfo[n_cache].addr, alloc_bhp);
/*
* flags == DB_MPOOL_NEW, so extending is set
* and we're holding the region locked.
*/
R_UNLOCK(dbenv, dbmp->reginfo);
alloc_bhp = NULL;
goto alloc;
}
}
/*
* We released the region lock, so another thread might have
* extended the file. Update the last_pgno and initialize
* the file, as necessary, if we extended the file.
*/
if (extending) {
#ifdef HAVE_FILESYSTEM_NOTZERO
if (*pgnoaddr > mfp->last_pgno &&
__os_fs_notzero() &&
F_ISSET(dbmfp->fhp, DB_FH_VALID))
ret = __memp_fs_notzero(
dbenv, dbmfp, mfp, pgnoaddr);
else
ret = 0;
#endif
if (ret == 0 && *pgnoaddr > mfp->last_pgno)
mfp->last_pgno = *pgnoaddr;
R_UNLOCK(dbenv, dbmp->reginfo);
if (ret != 0)
goto err;
}
goto hb_search;
case SECOND_FOUND:
/*
* We allocated buffer space for the requested page, but then
* found the page in the buffer cache on our second check.
* That's OK -- we can use the page we found in the pool,
* unless DB_MPOOL_NEW is set.
*
* Free the allocated memory, we no longer need it. Since we
* can't acquire the region lock while holding the hash bucket
* lock, we have to release the hash bucket and re-acquire it.
* That's OK, because we have the buffer pinned down.
*/
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
alloc_bhp = NULL;
R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
MUTEX_LOCK(dbenv, &hp->hash_mutex);
/*
* We can't use the page we found in the pool if DB_MPOOL_NEW
* was set. (For details, see the above comment beginning
* "DB_MPOOL_NEW does not guarantee you a page unreferenced by
* any other thread of control".) If DB_MPOOL_NEW is set, we
* release our pin on this particular buffer, and try to get
* another one.
*/
if (flags == DB_MPOOL_NEW) {
--bhp->ref;
b_incr = 0;
goto alloc;
}
break;
case SECOND_MISS:
/*
* We allocated buffer space for the requested page, and found
* the page still missing on our second pass through the buffer
* cache. Instantiate the page.
*/
bhp = alloc_bhp;
alloc_bhp = NULL;
/*
* Initialize all the BH and hash bucket fields so we can call
* __memp_bhfree if an error occurs.
*
* Append the buffer to the tail of the bucket list and update
* the hash bucket's priority.
*/
b_incr = 1;
memset(bhp, 0, sizeof(BH));
bhp->ref = 1;
bhp->priority = UINT32_T_MAX;
bhp->pgno = *pgnoaddr;
bhp->mf_offset = mf_offset;
SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
hp->hash_priority =
SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
/* If we extended the file, make sure the page is never lost. */
if (extending) {
++hp->hash_page_dirty;
F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
}
/*
* If we created the page, zero it out. If we didn't create
* the page, read from the backing file.
*
* !!!
* DB_MPOOL_NEW doesn't call the pgin function.
*
* If DB_MPOOL_CREATE is used, then the application's pgin
* function has to be able to handle pages of 0's -- if it
* uses DB_MPOOL_NEW, it can detect all of its page creates,
* and not bother.
*
* If we're running in diagnostic mode, smash any bytes on the
* page that are unknown quantities for the caller.
*
* Otherwise, read the page into memory, optionally creating it
* if DB_MPOOL_CREATE is set.
*/
if (extending) {
if (mfp->clear_len == 0)
memset(bhp->buf, 0, mfp->stat.st_pagesize);
else {
memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
mfp->stat.st_pagesize - mfp->clear_len);
#endif
}
if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
F_SET(bhp, BH_CALLPGIN);
++mfp->stat.st_page_create;
} else {
F_SET(bhp, BH_TRASH);
++mfp->stat.st_cache_miss;
}
/* Increment buffer count referenced by MPOOLFILE. */
MUTEX_LOCK(dbenv, &mfp->mutex);
++mfp->block_cnt;
MUTEX_UNLOCK(dbenv, &mfp->mutex);
/*
* Initialize the mutex. This is the last initialization step,
* because it's the only one that can fail, and everything else
* must be set up or we can't jump to the err label because it
* will call __memp_bhfree.
*/
if ((ret = __db_mutex_setup(dbenv,
&dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
goto err;
}
DB_ASSERT(bhp->ref != 0);
/*
* If we're the only reference, update buffer and bucket priorities.
* We may be about to release the hash bucket lock, and everything
* should be correct, first. (We've already done this if we created
* the buffer, so there is no need to do it again.)
*/
if (state != SECOND_MISS && bhp->ref == 1) {
bhp->priority = UINT32_T_MAX;
SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
hp->hash_priority =
SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}
/*
* If we're returning a page after our current notion of the last-page,
* update our information. Note, there's no way to un-instantiate this
* page, it's going to exist whether it's returned to us dirty or not.
* BH_TRASH --
* The buffer we found may need to be filled from the disk.
*
* It's possible for the read function to fail, which means we fail as
* well. Note, the __memp_pgread() function discards and reacquires
* the hash lock, so the buffer must be pinned down so that it cannot
* move and its contents are unchanged. Discard the buffer on failure
* unless another thread is waiting on our I/O to complete. It's OK to
* leave the buffer around, as the waiting thread will see the BH_TRASH
* flag set, and will also attempt to discard it. If there's a waiter,
* we need to decrement our reference count.
*/
if (bhp->pgno > mfp->last_pgno)
mfp->last_pgno = bhp->pgno;
if (F_ISSET(bhp, BH_TRASH) &&
(ret = __memp_pgread(dbmfp,
&hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
goto err;
*(void **)addrp = bhp->buf;
done: /* Update the chain search statistics. */
if (st_hsearch) {
++c_mp->stat.st_hash_searches;
if (st_hsearch > c_mp->stat.st_hash_longest)
c_mp->stat.st_hash_longest = st_hsearch;
c_mp->stat.st_hash_examined += st_hsearch;
/*
* BH_CALLPGIN --
* The buffer was processed for being written to disk, and now has
* to be re-converted for use.
*/
if (F_ISSET(bhp, BH_CALLPGIN)) {
if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
goto err;
F_CLR(bhp, BH_CALLPGIN);
}
++dbmfp->pinref;
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
#ifdef DIAGNOSTIC
/* Update the file's pinned reference count. */
R_LOCK(dbenv, dbmp->reginfo);
++dbmfp->pinref;
R_UNLOCK(dbenv, dbmp->reginfo);
/*
* We want to switch threads as often as possible, and at awkward
* times. Yield every time we get a new page to ensure contention.
*/
if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
__os_yield(dbenv, 1);
#endif
*(void **)addrp = bhp->buf;
return (0);
err: /* Discard our reference. */
if (b_incr)
--bhp->ref;
R_UNLOCK(dbenv, dbmp->reginfo);
err: /*
* Discard our reference. If we're the only reference, discard the
* buffer entirely. If we held a reference to a buffer, we are
* also still holding the hash bucket mutex.
*/
if (b_incr) {
if (bhp->ref == 1)
(void)__memp_bhfree(dbmp, hp, bhp, 1);
else {
--bhp->ref;
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
}
}
/* If alloc_bhp is set, free the memory. */
if (alloc_bhp != NULL)
__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
*(void **)addrp = NULL;
return (ret);
}
#ifdef HAVE_FILESYSTEM_NOTZERO
/*
 * __memp_fs_notzero --
 *	Write zero-filled pages to the backing file for every page number
 *	from mfp->last_pgno + 1 through *pgnoaddr, syncing the file along
 *	the way.
 *
 *	Returns 0 on success; otherwise the non-zero value returned by the
 *	allocation, write or sync call that failed.  Write and sync failures
 *	are also reported through __db_err.
 */
static int
__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
	DB_ENV *dbenv;
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	db_pgno_t *pgnoaddr;
{
	DB_IO db_io;
	size_t nw;
	u_int32_t count, idx;
	u_int8_t *zeroed;
	char *what;
	int ret;

	/*
	 * On some systems, extending a file by writing past end-of-file
	 * leaves the intervening pages with undefined contents, and recovery
	 * could theoretically be confused by such a garbage page.  To avoid
	 * that, each new page is explicitly written and then flushed.  The
	 * flush matters: without it, a later page allocation could reach the
	 * disk before this one, and a crash at the wrong moment would leave
	 * this page existing only as a side effect of that later write.
	 *
	 * Hash is the only access method that allocates pages in groups, and
	 * it treats the presence of the group's final page as proof that the
	 * whole group is good.  So: write every page except the last, flush,
	 * then write the last page and flush again.
	 */
	ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &zeroed);
	if (ret != 0)
		return (ret);

	/* Stays NULL unless a write or sync fails. */
	what = NULL;

	/* Every write uses the same handle, size and zeroed buffer. */
	db_io.fhp = dbmfp->fhp;
	db_io.mutexp = dbmfp->mutexp;
	db_io.bytes = mfp->stat.st_pagesize;
	db_io.pagesize = db_io.bytes;
	db_io.buf = zeroed;

	count = *pgnoaddr - mfp->last_pgno;

	/* All pages of the group except the final one. */
	idx = 1;
	while (idx < count) {
		db_io.pgno = mfp->last_pgno + idx;
		ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw);
		if (ret != 0) {
			what = "write";
			goto done;
		}
		++idx;
	}

	/* Only sync here if the loop above actually wrote something. */
	if (count > 1) {
		ret = __os_fsync(dbenv, dbmfp->fhp);
		if (ret != 0) {
			what = "sync";
			goto done;
		}
	}

	/* The final page, followed by its own sync. */
	db_io.pgno = mfp->last_pgno + count;
	ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw);
	if (ret != 0) {
		what = "write";
		goto done;
	}
	ret = __os_fsync(dbenv, dbmfp->fhp);
	if (ret != 0)
		what = "sync";

done:	if (what != NULL)
		__db_err(dbenv, "%s: %s failed for page %lu",
		    __memp_fn(dbmfp), what, (u_long)db_io.pgno);

	__os_free(dbenv, zeroed);
	return (ret);
}
#endif

File diff suppressed because it is too large Load diff

View file

@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $";
static const char revid[] = "$Id: mp_fput.c,v 11.36 2002/08/09 19:04:11 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@ -15,43 +15,32 @@ static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Ex
#endif
#ifdef HAVE_RPC
#include "db_server.h"
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#endif
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
/*
* memp_fput --
* __memp_fput --
* Mpool file put function.
*
* PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t));
*/
int
memp_fput(dbmfp, pgaddr, flags)
__memp_fput(dbmfp, pgaddr, flags)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
u_int32_t flags;
{
BH *bhp;
BH *argbhp, *bhp, *prev;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
MPOOL *c_mp, *mp;
int ret, wrote;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
u_int32_t n_cache;
int adjust, ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
return (__dbcl_memp_fput(dbmfp, pgaddr, flags));
#endif
PANIC_CHECK(dbenv);
@ -72,17 +61,6 @@ memp_fput(dbmfp, pgaddr, flags)
}
}
R_LOCK(dbenv, dbmp->reginfo);
/* Decrement the pinned reference count. */
if (dbmfp->pinref == 0) {
__db_err(dbenv,
"%s: more pages returned than retrieved", __memp_fn(dbmfp));
R_UNLOCK(dbenv, dbmp->reginfo);
return (EINVAL);
} else
--dbmfp->pinref;
/*
* If we're mapping the file, there's nothing to do. Because we can
* stop mapping the file at any time, we have to check on each buffer
@ -90,40 +68,51 @@ memp_fput(dbmfp, pgaddr, flags)
* region.
*/
if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
(u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) {
R_UNLOCK(dbenv, dbmp->reginfo);
(u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
return (0);
#ifdef DIAGNOSTIC
/*
* Decrement the per-file pinned buffer count (mapped pages aren't
* counted).
*/
R_LOCK(dbenv, dbmp->reginfo);
if (dbmfp->pinref == 0) {
ret = EINVAL;
__db_err(dbenv,
"%s: more pages returned than retrieved", __memp_fn(dbmfp));
} else {
ret = 0;
--dbmfp->pinref;
}
R_UNLOCK(dbenv, dbmp->reginfo);
if (ret != 0)
return (ret);
#endif
/* Convert the page address to a buffer header. */
/* Convert a page address to a buffer header and hash bucket. */
bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
c_mp = dbmp->reginfo[n_cache].primary;
hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
/* Convert the buffer header to a cache. */
c_mp = BH_TO_CACHE(dbmp, bhp);
/* UNLOCK THE REGION, LOCK THE CACHE. */
MUTEX_LOCK(dbenv, &hp->hash_mutex);
/* Set/clear the page bits. */
if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
++c_mp->stat.st_page_clean;
--c_mp->stat.st_page_dirty;
if (LF_ISSET(DB_MPOOL_CLEAN) &&
F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) {
DB_ASSERT(hp->hash_page_dirty != 0);
--hp->hash_page_dirty;
F_CLR(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
--c_mp->stat.st_page_clean;
++c_mp->stat.st_page_dirty;
++hp->hash_page_dirty;
F_SET(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DISCARD))
F_SET(bhp, BH_DISCARD);
/*
* If the page is dirty and being scheduled to be written as part of
* a checkpoint, we no longer know that the log is up-to-date.
*/
if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC))
F_SET(bhp, BH_SYNC_LOGFLSH);
/*
* Check for a reference count going to zero. This can happen if the
* application returns a page twice.
@ -131,56 +120,83 @@ memp_fput(dbmfp, pgaddr, flags)
if (bhp->ref == 0) {
__db_err(dbenv, "%s: page %lu: unpinned page returned",
__memp_fn(dbmfp), (u_long)bhp->pgno);
R_UNLOCK(dbenv, dbmp->reginfo);
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
return (EINVAL);
}
/*
* If more than one reference to the page, we're done. Ignore the
* discard flags (for now) and leave it at its position in the LRU
* chain. The rest gets done at last reference close.
* If more than one reference to the page or a reference other than a
* thread waiting to flush the buffer to disk, we're done. Ignore the
* discard flags (for now) and leave the buffer's priority alone.
*/
if (--bhp->ref > 0) {
R_UNLOCK(dbenv, dbmp->reginfo);
if (--bhp->ref > 1 || (bhp->ref == 1 && !F_ISSET(bhp, BH_LOCKED))) {
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
return (0);
}
/*
* Move the buffer to the head/tail of the LRU chain. We do this
* before writing the buffer for checkpoint purposes, as the write
* can discard the region lock and allow another process to acquire
* buffer. We could keep that from happening, but there seems no
* reason to do so.
*/
SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
if (F_ISSET(bhp, BH_DISCARD))
SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh);
else
SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
/* Update priority values. */
if (F_ISSET(bhp, BH_DISCARD) ||
dbmfp->mfp->priority == MPOOL_PRI_VERY_LOW)
bhp->priority = 0;
else {
/*
* We don't lock the LRU counter or the stat.st_pages field, if
* we get garbage (which won't happen on a 32-bit machine), it
* only means a buffer has the wrong priority.
*/
bhp->priority = c_mp->lru_count;
/*
* If this buffer is scheduled for writing because of a checkpoint, we
* need to write it (if it's dirty), or update the checkpoint counters
* (if it's not dirty). If we try to write it and can't, that's not
* necessarily an error as it's not completely unreasonable that the
* application have permission to write the underlying file, but set a
* flag so that the next time the memp_sync function is called we try
* writing it there, as the checkpoint thread of control better be able
* to write all of the files.
*/
if (F_ISSET(bhp, BH_SYNC)) {
if (F_ISSET(bhp, BH_DIRTY)) {
if (__memp_bhwrite(dbmp,
dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote)
F_SET(mp, MP_LSN_RETRY);
} else {
F_CLR(bhp, BH_SYNC);
adjust = 0;
if (dbmfp->mfp->priority != 0)
adjust =
(int)c_mp->stat.st_pages / dbmfp->mfp->priority;
if (F_ISSET(bhp, BH_DIRTY))
adjust += c_mp->stat.st_pages / MPOOL_PRI_DIRTY;
--mp->lsn_cnt;
--dbmfp->mfp->lsn_cnt;
}
if (adjust > 0) {
if (UINT32_T_MAX - bhp->priority <= (u_int32_t)adjust)
bhp->priority += adjust;
} else if (adjust < 0)
if (bhp->priority > (u_int32_t)-adjust)
bhp->priority += adjust;
}
R_UNLOCK(dbenv, dbmp->reginfo);
/*
* Buffers on hash buckets are sorted by priority -- move the buffer
* to the correct position in the list.
*/
argbhp = bhp;
SH_TAILQ_REMOVE(&hp->hash_bucket, argbhp, hq, __bh);
prev = NULL;
for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
bhp != NULL; prev = bhp, bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
if (bhp->priority > argbhp->priority)
break;
if (prev == NULL)
SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, argbhp, hq, __bh);
else
SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, prev, argbhp, hq, __bh);
/* Reset the hash bucket's priority. */
hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
#ifdef DIAGNOSTIC
__memp_check_order(hp);
#endif
/*
* The sync code has a separate counter for buffers on which it waits.
* It reads that value without holding a lock so we update it as the
* last thing we do. Once that value goes to 0, we won't see another
* reference to that buffer being returned to the cache until the sync
* code has finished, so we're safe as long as we don't let the value
* go to 0 before we finish with the buffer.
*/
if (F_ISSET(argbhp, BH_LOCKED) && argbhp->ref_sync != 0)
--argbhp->ref_sync;
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
return (0);
}

View file

@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $";
static const char revid[] = "$Id: mp_fset.c,v 11.25 2002/05/03 15:21:17 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@ -15,25 +15,18 @@ static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Ex
#endif
#ifdef HAVE_RPC
#include "db_server.h"
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#endif
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
/*
* memp_fset --
* __memp_fset --
* Mpool page set-flag routine.
*
* PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t));
*/
int
memp_fset(dbmfp, pgaddr, flags)
__memp_fset(dbmfp, pgaddr, flags)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
u_int32_t flags;
@ -41,17 +34,13 @@ memp_fset(dbmfp, pgaddr, flags)
BH *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
MPOOL *c_mp, *mp;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
u_int32_t n_cache;
int ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
return (__dbcl_memp_fset(dbmfp, pgaddr, flags));
#endif
PANIC_CHECK(dbenv);
@ -60,7 +49,7 @@ memp_fset(dbmfp, pgaddr, flags)
return (__db_ferr(dbenv, "memp_fset", 1));
if ((ret = __db_fchk(dbenv, "memp_fset", flags,
DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0)
DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0)
return (ret);
if ((ret = __db_fcchk(dbenv, "memp_fset",
flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0)
@ -72,27 +61,29 @@ memp_fset(dbmfp, pgaddr, flags)
return (EACCES);
}
/* Convert the page address to a buffer header. */
/* Convert the page address to a buffer header and hash bucket. */
bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
c_mp = dbmp->reginfo[n_cache].primary;
hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
/* Convert the buffer header to a cache. */
c_mp = BH_TO_CACHE(dbmp, bhp);
MUTEX_LOCK(dbenv, &hp->hash_mutex);
R_LOCK(dbenv, dbmp->reginfo);
if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
++c_mp->stat.st_page_clean;
--c_mp->stat.st_page_dirty;
/* Set/clear the page bits. */
if (LF_ISSET(DB_MPOOL_CLEAN) &&
F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) {
DB_ASSERT(hp->hash_page_dirty != 0);
--hp->hash_page_dirty;
F_CLR(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
--c_mp->stat.st_page_clean;
++c_mp->stat.st_page_dirty;
++hp->hash_page_dirty;
F_SET(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DISCARD))
F_SET(bhp, BH_DISCARD);
R_UNLOCK(dbenv, dbmp->reginfo);
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
return (0);
}

View file

@ -1,30 +1,30 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $";
static const char revid[] = "$Id: mp_method.c,v 11.29 2002/03/27 04:32:27 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#endif
#ifdef HAVE_RPC
#include "db_server.h"
#ifdef HAVE_RPC
#include <rpc/rpc.h>
#endif
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#include "dbinc_auto/db_server.h"
#include "dbinc_auto/rpc_client_ext.h"
#endif
static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
@ -41,29 +41,46 @@ __memp_dbenv_create(dbenv)
DB_ENV *dbenv;
{
/*
* !!!
* Our caller has not yet had the opportunity to reset the panic
* state or turn off mutex locking, and so we can neither check
* the panic state or acquire a mutex in the DB_ENV create path.
*
* We default to 32 8K pages. We don't default to a flat 256K, because
* some systems require significantly more memory to hold 32 pages than
* others. For example, HP-UX with POSIX pthreads needs 88 bytes for
* a POSIX pthread mutex and almost 200 bytes per buffer header, while
* Solaris needs 24 and 52 bytes for the same structures.
* Solaris needs 24 and 52 bytes for the same structures. The minimum
* number of hash buckets is 37. These contain a mutex also.
*/
dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH));
dbenv->mp_bytes =
32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
dbenv->mp_ncache = 1;
dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
dbenv->set_cachesize = __memp_set_cachesize;
#ifdef HAVE_RPC
/*
* If we have a client, overwrite what we just setup to
* point to client functions.
*/
#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
dbenv->set_cachesize = __dbcl_env_cachesize;
dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize;
}
dbenv->memp_dump_region = NULL;
dbenv->memp_fcreate = __dbcl_memp_fcreate;
dbenv->memp_nameop = NULL;
dbenv->memp_register = __dbcl_memp_register;
dbenv->memp_stat = __dbcl_memp_stat;
dbenv->memp_sync = __dbcl_memp_sync;
dbenv->memp_trickle = __dbcl_memp_trickle;
} else
#endif
{
dbenv->set_cachesize = __memp_set_cachesize;
dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
dbenv->memp_dump_region = __memp_dump_region;
dbenv->memp_fcreate = __memp_fcreate;
dbenv->memp_nameop = __memp_nameop;
dbenv->memp_register = __memp_register;
dbenv->memp_stat = __memp_stat;
dbenv->memp_sync = __memp_sync;
dbenv->memp_trickle = __memp_trickle;
}
}
/*
@ -78,26 +95,50 @@ __memp_set_cachesize(dbenv, gbytes, bytes, ncache)
{
ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize");
dbenv->mp_gbytes = gbytes + bytes / GIGABYTE;
dbenv->mp_bytes = bytes % GIGABYTE;
dbenv->mp_ncache = ncache == 0 ? 1 : ncache;
/* Normalize the values. */
if (ncache == 0)
ncache = 1;
/*
* If the application requested less than 500Mb, increase the
* cachesize by 25% to account for our overhead. (I'm guessing
* that caches over 500Mb are specifically sized, i.e., it's
* a large server and the application actually knows how much
* memory is available.)
* You can only store 4GB-1 in an unsigned 32-bit value, so correct for
* applications that specify 4GB cache sizes -- we know what they meant.
*/
if (gbytes / ncache == 4 && bytes == 0) {
--gbytes;
bytes = GIGABYTE - 1;
} else {
gbytes += bytes / GIGABYTE;
bytes %= GIGABYTE;
}
/* Avoid too-large cache sizes, they result in a region size of zero. */
if (gbytes / ncache > 4 || (gbytes / ncache == 4 && bytes != 0)) {
__db_err(dbenv, "individual cache size too large");
return (EINVAL);
}
/*
* If the application requested less than 500Mb, increase the cachesize
* by 25% and factor in the size of the hash buckets to account for our
* overhead. (I'm guessing caches over 500Mb are specifically sized,
* that is, it's a large server and the application actually knows how
* much memory is available. We only document the 25% overhead number,
* not the hash buckets, but I don't see a reason to confuse the issue,
* it shouldn't matter to an application.)
*
* There is a minimum cache size, regardless.
*/
if (dbenv->mp_gbytes == 0) {
if (dbenv->mp_bytes < 500 * MEGABYTE)
dbenv->mp_bytes += dbenv->mp_bytes / 4;
if (dbenv->mp_bytes < DB_CACHESIZE_MIN)
dbenv->mp_bytes = DB_CACHESIZE_MIN;
if (gbytes == 0) {
if (bytes < 500 * MEGABYTE)
bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH);
if (bytes / ncache < DB_CACHESIZE_MIN)
bytes = ncache * DB_CACHESIZE_MIN;
}
dbenv->mp_gbytes = gbytes;
dbenv->mp_bytes = bytes;
dbenv->mp_ncache = ncache;
return (0);
}

View file

@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $";
static const char revid[] = "$Id: mp_region.c,v 11.49 2002/05/07 18:42:20 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@ -17,11 +17,11 @@ static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int));
#ifdef MUTEX_SYSTEM_RESOURCES
#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
static size_t __mpool_region_maint __P((REGINFO *));
#endif
@ -119,6 +119,8 @@ __memp_open(dbenv)
regids[i] = dbmp->reginfo[i].id;
}
R_UNLOCK(dbenv, dbmp->reginfo);
} else {
/*
* Determine how many regions there are going to be, allocate
@ -135,6 +137,19 @@ __memp_open(dbenv)
dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[0] = reginfo;
/*
* We have to unlock the primary mpool region before we attempt
* to join the additional mpool regions. If we don't, we can
* deadlock. The scenario is that we hold the primary mpool
* region lock. We then try to attach to an additional mpool
* region, which requires the acquisition/release of the main
* region lock (to search the list of regions). If another
* thread of control already holds the main region lock and is
* waiting on our primary mpool region lock, we'll deadlock.
* See [#4696] for more information.
*/
R_UNLOCK(dbenv, dbmp->reginfo);
/* Join remaining regions. */
regids = R_ADDR(dbmp->reginfo, mp->regids);
for (i = 1; i < dbmp->nreg; ++i) {
@ -155,17 +170,10 @@ __memp_open(dbenv)
R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
/* If the region is threaded, allocate a mutex to lock the handles. */
if (F_ISSET(dbenv, DB_ENV_THREAD)) {
if ((ret = __db_mutex_alloc(
dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) {
goto err;
}
if ((ret =
__db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0)
goto err;
}
R_UNLOCK(dbenv, dbmp->reginfo);
if (F_ISSET(dbenv, DB_ENV_THREAD) &&
(ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmp->mutexp,
MUTEX_ALLOC | MUTEX_THREAD)) != 0)
goto err;
dbenv->mp_handle = dbmp;
return (0);
@ -180,12 +188,11 @@ err: if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
if (dbmp->reginfo[i].id != INVALID_REGION_ID)
(void)__db_r_detach(
dbenv, &dbmp->reginfo[i], 0);
__os_free(dbmp->reginfo,
dbmp->nreg * sizeof(*dbmp->reginfo));
__os_free(dbenv, dbmp->reginfo);
}
if (dbmp->mutexp != NULL)
__db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp);
__os_free(dbmp, sizeof(*dbmp));
__os_free(dbenv, dbmp);
return (ret);
}
@ -199,13 +206,13 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
DB_MPOOL *dbmp;
int reginfo_off, htab_buckets;
{
DB_HASHTAB *htab;
DB_MPOOL_HASH *htab;
MPOOL *mp;
REGINFO *reginfo;
#ifdef MUTEX_SYSTEM_RESOURCES
#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
size_t maint_size;
#endif
int ret;
int i, ret;
void *p;
mp = NULL;
@ -218,7 +225,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
mp = reginfo->primary;
memset(mp, 0, sizeof(*mp));
#ifdef MUTEX_SYSTEM_RESOURCES
#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
maint_size = __mpool_region_maint(reginfo);
/* Allocate room for the maintenance info and initialize it. */
if ((ret = __db_shalloc(reginfo->addr,
@ -231,14 +238,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
if (reginfo_off == 0) {
SH_TAILQ_INIT(&mp->mpfq);
if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex,
R_OFFSET(dbmp->reginfo, &mp->sync_mutex) +
DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo,
(REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0)
goto err;
ZERO_LSN(mp->lsn);
mp->lsn_cnt = 0;
mp->nreg = dbmp->nreg;
if ((ret = __db_shalloc(dbmp->reginfo[0].addr,
@ -247,32 +247,41 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
mp->regids = R_OFFSET(dbmp->reginfo, p);
}
SH_TAILQ_INIT(&mp->bhq);
/* Allocate hash table space and initialize it. */
if ((ret = __db_shalloc(reginfo->addr,
htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0)
htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0)
goto mem_err;
__db_hashinit(htab, htab_buckets);
mp->htab = R_OFFSET(reginfo, htab);
mp->htab_buckets = htab_buckets;
for (i = 0; i < htab_buckets; i++) {
if ((ret = __db_mutex_setup(dbenv,
reginfo, &htab[i].hash_mutex,
MUTEX_NO_RLOCK)) != 0)
return (ret);
SH_TAILQ_INIT(&htab[i].hash_bucket);
htab[i].hash_page_dirty = htab[i].hash_priority = 0;
}
mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets;
/*
* Only the environment creator knows the total cache size, fill in
* those statistics now.
*/
mp->stat.st_gbytes = dbenv->mp_gbytes;
mp->stat.st_bytes = dbenv->mp_bytes;
return (0);
mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region");
err: if (reginfo->primary != NULL)
__db_shalloc_free(reginfo->addr, reginfo->primary);
return (ret);
}
/*
* __memp_close --
* Internal version of memp_close: only called from DB_ENV->close.
* __memp_dbenv_refresh --
* Clean up after the mpool system on a close or failed open.
*
* PUBLIC: int __memp_close __P((DB_ENV *));
* PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *));
*/
int
__memp_close(dbenv)
__memp_dbenv_refresh(dbenv)
DB_ENV *dbenv;
{
DB_MPOOL *dbmp;
@ -287,12 +296,12 @@ __memp_close(dbenv)
/* Discard DB_MPREGs. */
while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
LIST_REMOVE(mpreg, q);
__os_free(mpreg, sizeof(DB_MPREG));
__os_free(dbenv, mpreg);
}
/* Discard DB_MPOOLFILEs. */
while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0)
if ((t_ret = __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
ret = t_ret;
/* Discard the thread mutex. */
@ -305,14 +314,14 @@ __memp_close(dbenv)
dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0)
ret = t_ret;
__os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo));
__os_free(dbmp, sizeof(*dbmp));
__os_free(dbenv, dbmp->reginfo);
__os_free(dbenv, dbmp);
dbenv->mp_handle = NULL;
return (ret);
}
#ifdef MUTEX_SYSTEM_RESOURCES
#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
/*
* __mpool_region_maint --
* Return the amount of space needed for region maintenance info.
@ -328,9 +337,11 @@ __mpool_region_maint(infop)
/*
* For mutex maintenance we need one mutex per possible page.
* Compute the maximum number of pages this cache can have.
* Also add in an mpool mutex.
* Also add in an mpool mutex and mutexes for all dbenv and db
* handles.
*/
numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1);
numlocks += DB_MAX_HANDLES;
s = sizeof(roff_t) * numlocks;
return (s);
}
@ -347,11 +358,109 @@ __mpool_region_destroy(dbenv, infop)
DB_ENV *dbenv;
REGINFO *infop;
{
MPOOL *mp;
__db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop,
((MPOOL *)R_ADDR(infop, infop->rp->primary))->maint_off));
COMPQUIET(dbenv, NULL);
mp = R_ADDR(infop, infop->rp->primary);
__db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off));
return;
COMPQUIET(infop, NULL);
}
/*
* __memp_nameop
* Remove or rename a file in the pool.
*
* Parameters:
* dbenv: environment handle; its mp_handle is used when MPOOL_ON(dbenv).
* fileid: file ID, compared (DB_FILE_ID_LEN bytes) against each
* MPOOLFILE on the mpfq list to find the file.
* newname: NULL to remove the file; otherwise the name that is copied
* into the region and installed as the MPOOLFILE's path.
* fullold: path handed to __os_unlink, or used as the source path for
* __os_rename.
* fullnew: destination path handed to __os_rename (rename only).
*
* Returns:
* The __memp_alloc error if the new name cannot be allocated, else 0.
* The return values of __os_unlink and __os_rename are discarded.
*
* PUBLIC: int __memp_nameop __P((DB_ENV *,
* PUBLIC: u_int8_t *, const char *, const char *, const char *));
*
* XXX
* Undocumented interface: DB private.
*/
int
__memp_nameop(dbenv, fileid, newname, fullold, fullnew)
DB_ENV *dbenv;
u_int8_t *fileid;
const char *newname, *fullold, *fullnew;
{
DB_MPOOL *dbmp;
MPOOL *mp;
MPOOLFILE *mfp;
roff_t newname_off;
int locked, ret;
void *p;
locked = 0;
dbmp = NULL;
/*
* Without an mpool there is no shared state to update: go straight to
* the file system operation. On this path "locked" stays 0 and "p"
* is never read.
*/
if (!MPOOL_ON(dbenv))
goto fsop;
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
/*
* Remove or rename a file that the mpool might know about. We assume
* that the fop layer has the file locked for exclusive access, so we
* don't worry about locking except for the mpool mutexes. Checkpoint
* can happen at any time, independent of file locking, so we have to
* do the actual unlink or rename system call to avoid any race.
*
* If this is a rename, allocate first, because we can't recursively
* grab the region lock.
*/
if (newname == NULL)
p = NULL;
else {
/*
* __memp_alloc hands back both the region offset (newname_off)
* and the address (p) of the new space; copy the name, including
* its terminating nul, into it.
*/
if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
NULL, strlen(newname) + 1, &newname_off, &p)) != 0)
return (ret);
memcpy(p, newname, strlen(newname) + 1);
}
locked = 1;
R_LOCK(dbenv, dbmp->reginfo);
/*
* Find the file -- if mpool doesn't know about this file, that's not
* an error -- we may not have it open.
*/
for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
/* Ignore non-active files. */
if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
continue;
/* Ignore non-matching files. */
if (memcmp(fileid, R_ADDR(
dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0)
continue;
/* If newname is NULL, we're removing the file. */
if (newname == NULL) {
MUTEX_LOCK(dbenv, &mfp->mutex);
MPOOLFILE_IGNORE(mfp);
MUTEX_UNLOCK(dbenv, &mfp->mutex);
} else {
/*
* Else, it's a rename. We've allocated memory
* for the new name. Swap it with the old one.
* After the swap, "p" refers to the old path so
* that the free below releases it.
*/
p = R_ADDR(dbmp->reginfo, mfp->path_off);
mfp->path_off = newname_off;
}
/* Only the first matching file is updated. */
break;
}
/*
* Delete the memory we no longer need: the old path if a rename
* matched a file, or the unused copy of the new name if it did not.
* For a remove, "p" is NULL and nothing is freed.
*/
if (p != NULL)
__db_shalloc_free(dbmp->reginfo[0].addr, p);
/*
* Do the file system operation. When the mpool is in use, this runs
* with the region lock still held; the lock is released only after the
* unlink or rename call returns.
*/
fsop: if (newname == NULL)
(void)__os_unlink(dbenv, fullold);
else
(void)__os_rename(dbenv, fullold, fullnew, 1);
if (locked)
R_UNLOCK(dbenv, dbmp->reginfo);
return (0);
}

View file

@ -1,38 +1,33 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $";
static const char revid[] = "$Id: mp_register.c,v 11.21 2002/03/27 04:32:27 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#endif
#ifdef HAVE_RPC
#include "db_server.h"
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#endif
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
/*
* __memp_register --
* Register a file type's pgin, pgout routines.
*
* PUBLIC: int __memp_register __P((DB_ENV *, int,
* PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *),
* PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
*/
int
memp_register(dbenv, ftype, pgin, pgout)
__memp_register(dbenv, ftype, pgin, pgout)
DB_ENV *dbenv;
int ftype;
int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
@ -42,13 +37,9 @@ memp_register(dbenv, ftype, pgin, pgout)
DB_MPREG *mpreg;
int ret;
#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
return (__dbcl_memp_register(dbenv, ftype, pgin, pgout));
#endif
PANIC_CHECK(dbenv);
ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
ENV_REQUIRES_CONFIG(dbenv,
dbenv->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL);
dbmp = dbenv->mp_handle;
@ -70,7 +61,7 @@ memp_register(dbenv, ftype, pgin, pgout)
return (0);
/* New entry. */
if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &mpreg)) != 0)
if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), &mpreg)) != 0)
return (ret);
mpreg->ftype = ftype;

View file

@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $";
static const char revid[] = "$Id: mp_stat.c,v 11.51 2002/08/06 06:13:47 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@ -18,123 +18,150 @@ static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic E
#include <unistd.h>
#endif
#ifdef HAVE_RPC
#include "db_server.h"
#endif
#include "db_int.h"
#include "db_page.h"
#include "db_shash.h"
#include "db_am.h"
#include "mp.h"
#include "dbinc/db_page.h"
#include "dbinc/db_shash.h"
#include "dbinc/db_am.h"
#include "dbinc/mp.h"
#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#endif
static void __memp_dumpcache
__P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
static void __memp_dumpcache __P((DB_ENV *,
DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *));
static void __memp_stat_wait __P((REGINFO *, MPOOL *, DB_MPOOL_STAT *, int));
/*
* memp_stat --
* __memp_stat --
* Display MPOOL statistics.
*
* PUBLIC: int __memp_stat
* PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
*/
int
memp_stat(dbenv, gspp, fspp, db_malloc)
__memp_stat(dbenv, gspp, fspp, flags)
DB_ENV *dbenv;
DB_MPOOL_STAT **gspp;
DB_MPOOL_FSTAT ***fspp;
void *(*db_malloc) __P((size_t));
u_int32_t flags;
{
DB_MPOOL *dbmp;
DB_MPOOL_FSTAT **tfsp, *tstruct;
DB_MPOOL_STAT *sp;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
char *tname;
size_t len, nlen;
u_int32_t i;
size_t len, nlen, pagesize;
u_int32_t pages, i;
int ret;
char *name;
#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc));
#endif
char *name, *tname;
PANIC_CHECK(dbenv);
ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
ENV_REQUIRES_CONFIG(dbenv,
dbenv->mp_handle, "memp_stat", DB_INIT_MPOOL);
if ((ret = __db_fchk(dbenv,
"DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0)
return (ret);
dbmp = dbenv->mp_handle;
sp = NULL;
mp = dbmp->reginfo[0].primary;
/* Global statistics. */
mp = dbmp->reginfo[0].primary;
if (gspp != NULL) {
*gspp = NULL;
if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0)
if ((ret = __os_umalloc(dbenv, sizeof(**gspp), gspp)) != 0)
return (ret);
memset(*gspp, 0, sizeof(**gspp));
sp = *gspp;
/*
* Initialization and information that is not maintained on
* a per-cache basis.
*/
sp->st_hash_longest = 0;
sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait;
sp->st_region_nowait =
dbmp->reginfo[0].rp->mutex.mutex_set_nowait;
sp->st_gbytes = dbenv->mp_gbytes;
sp->st_bytes = dbenv->mp_bytes;
c_mp = dbmp->reginfo[0].primary;
sp->st_gbytes = c_mp->stat.st_gbytes;
sp->st_bytes = c_mp->stat.st_bytes;
sp->st_ncache = dbmp->nreg;
sp->st_regsize = dbmp->reginfo[0].rp->size;
R_LOCK(dbenv, dbmp->reginfo);
/* Walk the cache list and accumulate the global information. */
for (i = 0; i < mp->nreg; ++i) {
c_mp = dbmp->reginfo[i].primary;
sp->st_map += c_mp->stat.st_map;
sp->st_cache_hit += c_mp->stat.st_cache_hit;
sp->st_cache_miss += c_mp->stat.st_cache_miss;
sp->st_map += c_mp->stat.st_map;
sp->st_page_create += c_mp->stat.st_page_create;
sp->st_page_in += c_mp->stat.st_page_in;
sp->st_page_out += c_mp->stat.st_page_out;
sp->st_ro_evict += c_mp->stat.st_ro_evict;
sp->st_rw_evict += c_mp->stat.st_rw_evict;
sp->st_page_trickle += c_mp->stat.st_page_trickle;
sp->st_pages += c_mp->stat.st_pages;
/*
* st_page_dirty calculated by __memp_stat_hash
* st_page_clean calculated here
*/
__memp_stat_hash(
&dbmp->reginfo[i], c_mp, &sp->st_page_dirty);
sp->st_page_clean = sp->st_pages - sp->st_page_dirty;
sp->st_hash_buckets += c_mp->stat.st_hash_buckets;
sp->st_hash_searches += c_mp->stat.st_hash_searches;
if (c_mp->stat.st_hash_longest > sp->st_hash_longest)
sp->st_hash_longest =
c_mp->stat.st_hash_longest;
sp->st_hash_longest += c_mp->stat.st_hash_longest;
sp->st_hash_examined += c_mp->stat.st_hash_examined;
sp->st_page_clean += c_mp->stat.st_page_clean;
sp->st_page_dirty += c_mp->stat.st_page_dirty;
sp->st_page_trickle += c_mp->stat.st_page_trickle;
sp->st_region_wait += c_mp->stat.st_region_wait;
sp->st_region_nowait += c_mp->stat.st_region_nowait;
/*
* st_hash_nowait calculated by __memp_stat_wait
* st_hash_wait
*/
__memp_stat_wait(&dbmp->reginfo[i], c_mp, sp, flags);
sp->st_region_nowait +=
dbmp->reginfo[i].rp->mutex.mutex_set_nowait;
sp->st_region_wait +=
dbmp->reginfo[i].rp->mutex.mutex_set_wait;
sp->st_alloc += c_mp->stat.st_alloc;
sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets;
if (sp->st_alloc_max_buckets <
c_mp->stat.st_alloc_max_buckets)
sp->st_alloc_max_buckets =
c_mp->stat.st_alloc_max_buckets;
sp->st_alloc_pages += c_mp->stat.st_alloc_pages;
if (sp->st_alloc_max_pages <
c_mp->stat.st_alloc_max_pages)
sp->st_alloc_max_pages =
c_mp->stat.st_alloc_max_pages;
if (LF_ISSET(DB_STAT_CLEAR)) {
dbmp->reginfo[i].rp->mutex.mutex_set_wait = 0;
dbmp->reginfo[i].rp->mutex.mutex_set_nowait = 0;
pages = c_mp->stat.st_pages;
memset(&c_mp->stat, 0, sizeof(c_mp->stat));
c_mp->stat.st_hash_buckets = c_mp->htab_buckets;
c_mp->stat.st_pages = pages;
}
}
/*
* We have duplicate statistics fields in the cache and
* per-file structures. The counters are only incremented
* in the per-file structures, though. The intent is that
* if we ever flush files from the pool we can save their
* last known totals in the cache structure.
* We have duplicate statistics fields in per-file structures
* and the cache. The counters are only incremented in the
* per-file structures, except if a file is flushed from the
* mpool, at which time we copy its information into the cache
* statistics. We added the cache information above, now we
* add the per-file information.
*/
R_LOCK(dbenv, dbmp->reginfo);
for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
sp->st_map += mfp->stat.st_map;
sp->st_cache_hit += mfp->stat.st_cache_hit;
sp->st_cache_miss += mfp->stat.st_cache_miss;
sp->st_map += mfp->stat.st_map;
sp->st_page_create += mfp->stat.st_page_create;
sp->st_page_in += mfp->stat.st_page_in;
sp->st_page_out += mfp->stat.st_page_out;
if (fspp == NULL && LF_ISSET(DB_STAT_CLEAR)) {
pagesize = mfp->stat.st_pagesize;
memset(&mfp->stat, 0, sizeof(mfp->stat));
mfp->stat.st_pagesize = pagesize;
}
}
R_UNLOCK(dbenv, dbmp->reginfo);
}
@ -142,9 +169,8 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
if (fspp != NULL) {
*fspp = NULL;
R_LOCK(dbenv, dbmp->reginfo);
/* Count the MPOOLFILE structures. */
R_LOCK(dbenv, dbmp->reginfo);
for (i = 0, len = 0,
mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL;
@ -153,18 +179,15 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
sizeof(DB_MPOOL_FSTAT) +
strlen(__memp_fns(dbmp, mfp)) + 1;
len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */
R_UNLOCK(dbenv, dbmp->reginfo);
if (len == 0)
if (i == 0)
return (0);
/* Allocate space */
if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0)
if ((ret = __os_umalloc(dbenv, len, fspp)) != 0)
return (ret);
R_LOCK(dbenv, dbmp->reginfo);
/*
* Build each individual entry. We assume that an array of
* pointers are aligned correctly to be followed by an array
@ -179,20 +202,30 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1);
tname = (char *)(tstruct + i);
/*
* Files may have been opened since we counted, don't walk
* off the end of the allocated space.
*/
R_LOCK(dbenv, dbmp->reginfo);
for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL;
mfp != NULL && i-- > 0;
++tfsp, ++tstruct, tname += nlen,
mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
name = __memp_fns(dbmp, mfp);
nlen = strlen(name) + 1;
*tfsp = tstruct;
*tstruct = mfp->stat;
if (LF_ISSET(DB_STAT_CLEAR)) {
pagesize = mfp->stat.st_pagesize;
memset(&mfp->stat, 0, sizeof(mfp->stat));
mfp->stat.st_pagesize = pagesize;
}
tstruct->file_name = tname;
memcpy(tname, name, nlen);
}
*tfsp = NULL;
R_UNLOCK(dbenv, dbmp->reginfo);
*tfsp = NULL;
}
return (0);
}
@ -200,7 +233,6 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
#define FMAP_ENTRIES 200 /* Files we map. */
#define MPOOL_DUMP_HASH 0x01 /* Debug hash chains. */
#define MPOOL_DUMP_LRU 0x02 /* Debug LRU chains. */
#define MPOOL_DUMP_MEM 0x04 /* Debug region memory. */
#define MPOOL_DUMP_ALL 0x07 /* Debug all. */
@ -208,14 +240,23 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
* __memp_dump_region --
* Display MPOOL structures.
*
* PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *));
* PUBLIC: int __memp_dump_region __P((DB_ENV *, char *, FILE *));
*/
void
int
__memp_dump_region(dbenv, area, fp)
DB_ENV *dbenv;
char *area;
FILE *fp;
{
static const FN fn[] = {
{ MP_CAN_MMAP, "mmapped" },
{ MP_DEADFILE, "dead" },
{ MP_DIRECT, "no buffer" },
{ MP_EXTENT, "extent" },
{ MP_TEMP, "temporary" },
{ MP_UNLINK, "unlink" },
{ 0, NULL }
};
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
MPOOL *mp;
@ -225,6 +266,10 @@ __memp_dump_region(dbenv, area, fp)
int cnt;
u_int8_t *p;
PANIC_CHECK(dbenv);
ENV_REQUIRES_CONFIG(dbenv,
dbenv->mp_handle, "memp_dump_region", DB_INIT_MPOOL);
dbmp = dbenv->mp_handle;
/* Make it easy to call from the debugger. */
@ -239,40 +284,42 @@ __memp_dump_region(dbenv, area, fp)
case 'h':
LF_SET(MPOOL_DUMP_HASH);
break;
case 'l':
LF_SET(MPOOL_DUMP_LRU);
break;
case 'm':
LF_SET(MPOOL_DUMP_MEM);
break;
}
R_LOCK(dbenv, dbmp->reginfo);
mp = dbmp->reginfo[0].primary;
/* Display MPOOL structures. */
(void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n",
DB_LINE, (u_long)dbmp->reginfo[0].addr);
DB_LINE, P_TO_ULONG(dbmp->reginfo[0].addr));
/* Display the MPOOLFILE structures. */
cnt = 0;
for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
R_LOCK(dbenv, dbmp->reginfo);
for (cnt = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) {
(void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ",
cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype,
F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write");
(void)fprintf(fp, "File #%d: %s: pagesize %lu\n", cnt + 1,
__memp_fns(dbmp, mfp), (u_long)mfp->stat.st_pagesize);
(void)fprintf(fp, "\t type %ld; ref %lu; blocks %lu; last %lu;",
(long)mfp->ftype, (u_long)mfp->mpf_cnt,
(u_long)mfp->block_cnt, (u_long)mfp->last_pgno);
__db_prflags(mfp->flags, fn, fp);
(void)fprintf(fp, "\n\t UID: ");
p = R_ADDR(dbmp->reginfo, mfp->fileid_off);
for (i = 0; i < DB_FILE_ID_LEN; ++i) {
(void)fprintf(fp, "%x", *p++);
for (i = 0; i < DB_FILE_ID_LEN; ++i, ++p) {
(void)fprintf(fp, "%x", (u_int)*p);
if (i < DB_FILE_ID_LEN - 1)
(void)fprintf(fp, " ");
}
(void)fprintf(fp, "]\n");
(void)fprintf(fp, "\n");
if (cnt < FMAP_ENTRIES)
fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
}
R_UNLOCK(dbenv, dbmp->reginfo);
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) {
(void)fprintf(fp, "File #%d: %s: per-process, %s\n",
@ -281,6 +328,7 @@ __memp_dump_region(dbenv, area, fp)
if (cnt < FMAP_ENTRIES)
fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
}
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (cnt < FMAP_ENTRIES)
fmap[cnt] = INVALID_ROFF;
else
@ -289,13 +337,14 @@ __memp_dump_region(dbenv, area, fp)
/* Dump the memory pools. */
for (i = 0; i < mp->nreg; ++i) {
(void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1);
__memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags);
__memp_dumpcache(
dbenv, dbmp, &dbmp->reginfo[i], fmap, fp, flags);
}
R_UNLOCK(dbenv, dbmp->reginfo);
/* Flush in case we're debugging. */
(void)fflush(fp);
return (0);
}
/*
@ -303,7 +352,8 @@ __memp_dump_region(dbenv, area, fp)
* Display statistics for a cache.
*/
static void
__memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
__memp_dumpcache(dbenv, dbmp, reginfo, fmap, fp, flags)
DB_ENV *dbenv;
DB_MPOOL *dbmp;
REGINFO *reginfo;
size_t *fmap;
@ -311,7 +361,7 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
u_int32_t flags;
{
BH *bhp;
DB_HASHTAB *dbht;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
int bucket;
@ -320,25 +370,22 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
/* Display the hash table list of BH's. */
if (LF_ISSET(MPOOL_DUMP_HASH)) {
(void)fprintf(fp,
"%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n",
"%s\nBH hash table (%lu hash slots)\nbucket (priority):\n",
DB_LINE, (u_long)c_mp->htab_buckets);
for (dbht = R_ADDR(reginfo, c_mp->htab),
bucket = 0; bucket < c_mp->htab_buckets; ++dbht, ++bucket) {
if (SH_TAILQ_FIRST(dbht, __bh) != NULL)
(void)fprintf(fp, "%lu:\n", (u_long)bucket);
for (bhp = SH_TAILQ_FIRST(dbht, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
__memp_pbh(dbmp, bhp, fmap, fp);
}
}
(void)fprintf(fp,
"\tpageno, file, ref, address [LSN] priority\n");
/* Display the LRU list of BH's. */
if (LF_ISSET(MPOOL_DUMP_LRU)) {
(void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE);
(void)fprintf(fp, "pageno, file, ref, address\n");
for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
__memp_pbh(dbmp, bhp, fmap, fp);
for (hp = R_ADDR(reginfo, c_mp->htab),
bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
MUTEX_LOCK(dbenv, &hp->hash_mutex);
if ((bhp =
SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
(void)fprintf(fp, "%lu (%u):\n",
(u_long)bucket, hp->hash_priority);
for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
__memp_pbh(dbmp, bhp, fmap, fp);
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
}
}
/* Dump the memory pool. */
@ -360,10 +407,9 @@ __memp_pbh(dbmp, bhp, fmap, fp)
static const FN fn[] = {
{ BH_CALLPGIN, "callpgin" },
{ BH_DIRTY, "dirty" },
{ BH_DIRTY_CREATE, "created" },
{ BH_DISCARD, "discard" },
{ BH_LOCKED, "locked" },
{ BH_SYNC, "sync" },
{ BH_SYNC_LOGFLSH, "sync:logflush" },
{ BH_TRASH, "trash" },
{ 0, NULL }
};
@ -374,15 +420,72 @@ __memp_pbh(dbmp, bhp, fmap, fp)
break;
if (fmap[i] == INVALID_ROFF)
(void)fprintf(fp, " %4lu, %lu, %2lu, %lu",
(void)fprintf(fp, "\t%5lu, %lu, %2lu, %8lu [%lu,%lu] %lu",
(u_long)bhp->pgno, (u_long)bhp->mf_offset,
(u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
(u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp),
(u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset,
(u_long)bhp->priority);
else
(void)fprintf(fp, " %4lu, #%d, %2lu, %lu",
(void)fprintf(fp, "\t%5lu, #%d, %2lu, %8lu [%lu,%lu] %lu",
(u_long)bhp->pgno, i + 1,
(u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
(u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp),
(u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset,
(u_long)bhp->priority);
__db_prflags(bhp->flags, fn, fp);
(void)fprintf(fp, "\n");
}
/*
 * __memp_stat_hash --
 *	Add up the hash_page_dirty counters of every bucket in one cache's
 *	hash table and store the total through dirtyp.  Mutex wait counts
 *	are not touched here.
 *
 *	reginfo: region used to resolve the hash table offset.
 *	mp: cache whose htab/htab_buckets describe the table.
 *	dirtyp: receives the total; written once, after the walk.
 *
 * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *));
 */
void
__memp_stat_hash(reginfo, mp, dirtyp)
	REGINFO *reginfo;
	MPOOL *mp;
	u_int32_t *dirtyp;
{
	DB_MPOOL_HASH *bucket;
	u_int32_t ndirty;
	int n;

	ndirty = 0;
	bucket = R_ADDR(reginfo, mp->htab);

	/* Walk the buckets in table order, accumulating as we go. */
	n = 0;
	while (n < mp->htab_buckets) {
		ndirty += bucket->hash_page_dirty;
		++bucket;
		++n;
	}

	*dirtyp = ndirty;
}
/*
 * __memp_stat_wait --
 *	Total hash bucket wait stats into the region.
 *
 *	Folds the mutex wait counters of every hash bucket in one cache
 *	into the caller's statistics structure:
 *
 *	st_hash_nowait, st_hash_wait: incremented by each bucket's
 *	    mutex_set_nowait / mutex_set_wait.
 *	st_hash_max_wait: raised to the largest mutex_set_wait seen.
 *
 *	reginfo: region used to resolve the hash table offset.
 *	mp: cache whose htab/htab_buckets describe the table.
 *	mstat: statistics being accumulated; the caller must have zeroed
 *	    it before the first call, as the += fields already require.
 *	flags: if DB_STAT_CLEAR is set, each bucket's mutex counters are
 *	    reset to 0 after being read.
 *
 *	This function is called once per cache, so st_hash_max_wait must
 *	not be reset here: doing so would discard the maximum found in the
 *	caches already processed and report only the last cache's value.
 */
static void
__memp_stat_wait(reginfo, mp, mstat, flags)
	REGINFO *reginfo;
	MPOOL *mp;
	DB_MPOOL_STAT *mstat;
	int flags;
{
	DB_MPOOL_HASH *hp;
	DB_MUTEX *mutexp;
	int i;

	hp = R_ADDR(reginfo, mp->htab);
	for (i = 0; i < mp->htab_buckets; i++, hp++) {
		mutexp = &hp->hash_mutex;

		mstat->st_hash_nowait += mutexp->mutex_set_nowait;
		mstat->st_hash_wait += mutexp->mutex_set_wait;

		/* Running maximum across every bucket of every cache. */
		if (mutexp->mutex_set_wait > mstat->st_hash_max_wait)
			mstat->st_hash_max_wait = mutexp->mutex_set_wait;

		/* Clear only after the values have been folded in. */
		if (LF_ISSET(DB_STAT_CLEAR)) {
			mutexp->mutex_set_wait = 0;
			mutexp->mutex_set_nowait = 0;
		}
	}
}

File diff suppressed because it is too large Load diff

View file

@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $";
static const char revid[] = "$Id: mp_trickle.c,v 11.24 2002/08/06 06:13:53 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@ -16,42 +16,29 @@ static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell
#include <stdlib.h>
#endif
#ifdef HAVE_RPC
#include "db_server.h"
#endif
#include "db_int.h"
#include "db_shash.h"
#include "mp.h"
#ifdef HAVE_RPC
#include "gen_client_ext.h"
#include "rpc_client_ext.h"
#endif
static int __memp_trick __P((DB_ENV *, int, int, int *));
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
/*
* memp_trickle --
* __memp_trickle --
* Keep a specified percentage of the buffers clean.
*
* PUBLIC: int __memp_trickle __P((DB_ENV *, int, int *));
*/
int
memp_trickle(dbenv, pct, nwrotep)
__memp_trickle(dbenv, pct, nwrotep)
DB_ENV *dbenv;
int pct, *nwrotep;
{
DB_MPOOL *dbmp;
MPOOL *mp;
u_int32_t i;
int ret;
#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
return (__dbcl_memp_trickle(dbenv, pct, nwrotep));
#endif
MPOOL *c_mp, *mp;
u_int32_t clean, dirty, i, total, dtmp;
int ret, wrote;
PANIC_CHECK(dbenv);
ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
ENV_REQUIRES_CONFIG(dbenv,
dbenv->mp_handle, "memp_trickle", DB_INIT_MPOOL);
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
@ -62,88 +49,35 @@ memp_trickle(dbenv, pct, nwrotep)
if (pct < 1 || pct > 100)
return (EINVAL);
R_LOCK(dbenv, dbmp->reginfo);
/* Loop through the caches... */
for (ret = 0, i = 0; i < mp->nreg; ++i)
if ((ret = __memp_trick(dbenv, i, pct, nwrotep)) != 0)
break;
R_UNLOCK(dbenv, dbmp->reginfo);
return (ret);
}
/*
* __memp_trick --
* Trickle a single cache.
*/
static int
__memp_trick(dbenv, ncache, pct, nwrotep)
DB_ENV *dbenv;
int ncache, pct, *nwrotep;
{
BH *bhp;
DB_MPOOL *dbmp;
MPOOL *c_mp;
MPOOLFILE *mfp;
db_pgno_t pgno;
u_long total;
int ret, wrote;
dbmp = dbenv->mp_handle;
c_mp = dbmp->reginfo[ncache].primary;
/*
* If there are sufficient clean buffers, or no buffers or no dirty
* If there are sufficient clean buffers, no buffers or no dirty
* buffers, we're done.
*
* XXX
* Using st_page_clean and st_page_dirty is our only choice at the
* moment, but it's not as correct as we might like in the presence
* of pools with more than one buffer size, as a free 512-byte buffer
* isn't the same as a free 8K buffer.
* Using hash_page_dirty is our only choice at the moment, but it's not
* as correct as we might like in the presence of pools having more
* than one page size, as a free 512B buffer isn't the same as a free
* 8KB buffer.
*
* Loop through the caches counting total/dirty buffers.
*/
loop: total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty;
if (total == 0 || c_mp->stat.st_page_dirty == 0 ||
(c_mp->stat.st_page_clean * 100) / total >= (u_long)pct)
return (0);
/* Loop until we write a buffer. */
for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
if (bhp->ref != 0 ||
!F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
continue;
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
/*
* We can't write to temporary files -- see the comment in
* mp_bh.c:__memp_bhwrite().
*/
if (F_ISSET(mfp, MP_TEMP))
continue;
pgno = bhp->pgno;
if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
return (ret);
/*
* Any process syncing the shared memory buffer pool had better
* be able to write to any underlying file. Be understanding,
* but firm, on this point.
*/
if (!wrote) {
__db_err(dbenv, "%s: unable to flush page: %lu",
__memp_fns(dbmp, mfp), (u_long)pgno);
return (EPERM);
}
++c_mp->stat.st_page_trickle;
if (nwrotep != NULL)
++*nwrotep;
goto loop;
for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
c_mp = dbmp->reginfo[i].primary;
total += c_mp->stat.st_pages;
__memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
dirty += dtmp;
}
return (0);
clean = total - dirty;
if (clean == total || (clean * 100) / total >= (u_long)pct)
return (0);
if (nwrotep == NULL)
nwrotep = &wrote;
ret = __memp_sync_int(dbenv, NULL,
((total * pct) / 100) - clean, DB_SYNC_TRICKLE, nwrotep);
mp->stat.st_page_trickle += *nwrotep;
return (ret);
}