mariadb/bdb/db/db_am.c

512 lines
11 KiB
C
Raw Normal View History

2001-03-04 19:42:05 -05:00
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1998, 1999, 2000
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: db_am.c,v 11.42 2001/01/11 18:19:50 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <string.h>
#endif
#include "db_int.h"
#include "db_page.h"
#include "db_shash.h"
#include "btree.h"
#include "hash.h"
#include "qam.h"
#include "lock.h"
#include "mp.h"
#include "txn.h"
#include "db_am.h"
#include "db_ext.h"
/*
 * __db_cursor --
 *	Allocate a cursor on dbp and hand it back through dbcp.
 *
 *	The flags are validated first, then the cursor itself is built by
 *	__db_icursor() using the database's own type and no known root page.
 *	When the environment uses CDB locking, the cursor's lock is acquired
 *	here before the cursor is returned; if that acquisition fails, the
 *	new cursor is closed again and the lock error is returned.
 *
 *	Returns 0 on success, otherwise the first non-zero error code seen.
 *
 * PUBLIC: int __db_cursor __P((DB *, DB_TXN *, DBC **, u_int32_t));
 */
int
__db_cursor(dbp, txn, dbcp, flags)
	DB *dbp;
	DB_TXN *txn;
	DBC **dbcp;
	u_int32_t flags;
{
	DB_ENV *dbenv;
	DBC *cursor;
	db_lockmode_t lockmode;
	u_int32_t opflag;
	int ret;

	dbenv = dbp->dbenv;

	PANIC_CHECK(dbenv);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");

	/* Reject flag values the interface does not allow. */
	ret = __db_cursorchk(dbp, flags, F_ISSET(dbp, DB_AM_RDONLY));
	if (ret != 0)
		return (ret);

	ret = __db_icursor(dbp, txn, dbp->type, PGNO_INVALID, 0, dbcp);
	if (ret != 0)
		return (ret);
	cursor = *dbcp;

	/*
	 * Only CDB does its locking in the interface layer; for anything
	 * else the cursor is ready to be returned as it is.
	 */
	if (!CDB_LOCKING(dbenv))
		return (0);

	/* Pick the lock mode that matches the requested operation flag. */
	opflag = LF_ISSET(DB_OPFLAGS_MASK);
	if (opflag == DB_WRITELOCK)
		lockmode = DB_LOCK_WRITE;
	else if (opflag == DB_WRITECURSOR)
		lockmode = DB_LOCK_IWRITE;
	else
		lockmode = DB_LOCK_READ;

	ret = lock_get(dbenv,
	    cursor->locker, 0, &cursor->lock_dbt, lockmode, &cursor->mylock);
	if (ret != 0) {
		/* The caller never sees this cursor, so discard it. */
		(void)__db_c_close(cursor);
		return (ret);
	}

	/* Record on the cursor which kind of write access it was given. */
	if (opflag == DB_WRITECURSOR)
		F_SET(cursor, DBC_WRITECURSOR);
	if (opflag == DB_WRITELOCK)
		F_SET(cursor, DBC_WRITER);

	return (0);
}
/*
 * __db_icursor --
 *	Internal version of __db_cursor.
 *
 *	Produces a cursor of type dbtype on dbp, either by recycling a cursor
 *	of the same type from the database's free queue or by allocating and
 *	initializing a new one.  The cursor is then refreshed for this use
 *	(transaction, locker, root page, off-page-duplicate and recovery
 *	flags), appended to the database's active queue and stored in *dbcp.
 *
 *	dbcp must be non-NULL; *dbcp is written only on success.
 *
 *	On failure nothing is left behind: a freshly allocated cursor is
 *	freed, a recycled cursor is put back on the free queue, and the
 *	transaction's cursor count is not touched.
 *	NOTE(review): the access-method private data set up by the
 *	*_c_init() routines is not released here on failure -- confirm
 *	whether the refresh step can fail after a successful init.
 *
 *	Returns 0 on success, otherwise a non-zero error code.
 *
 * PUBLIC: int __db_icursor
 * PUBLIC:     __P((DB *, DB_TXN *, DBTYPE, db_pgno_t, int, DBC **));
 */
int
__db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
	DB *dbp;
	DB_TXN *txn;
	DBTYPE dbtype;
	db_pgno_t root;
	int is_opd;
	DBC **dbcp;
{
	DBC *dbc, *adbc;
	DBC_INTERNAL *cp;
	DB_ENV *dbenv;
	int allocated, ret;

	dbenv = dbp->dbenv;
	allocated = 0;

	/*
	 * Take one from the free list if it's available.  Take only the
	 * right type.  With off page dups we may have different kinds
	 * of cursors on the queue for a single database.
	 */
	MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
	for (dbc = TAILQ_FIRST(&dbp->free_queue);
	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
		if (dbtype == dbc->dbtype) {
			TAILQ_REMOVE(&dbp->free_queue, dbc, links);
			dbc->flags = 0;
			break;
		}
	MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);

	if (dbc == NULL) {
		if ((ret = __os_calloc(dbenv, 1, sizeof(DBC), &dbc)) != 0)
			return (ret);
		allocated = 1;
		dbc->flags = 0;

		dbc->dbp = dbp;

		/* Set up locking information. */
		if (LOCKING_ON(dbenv)) {
			/*
			 * If we are not threaded, then there is no need to
			 * create new locker ids.  We know that no one else
			 * is running concurrently using this DB, so we can
			 * take a peek at any cursors on the active queue.
			 */
			if (!DB_IS_THREADED(dbp) &&
			    (adbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
				dbc->lid = adbc->lid;
			else if ((ret = lock_id(dbenv, &dbc->lid)) != 0)
				goto err;

			memcpy(dbc->lock.fileid, dbp->fileid, DB_FILE_ID_LEN);
			if (CDB_LOCKING(dbenv)) {
				if (F_ISSET(dbenv, DB_ENV_CDB_ALLDB)) {
					/*
					 * If we are doing a single lock per
					 * environment, set up the global
					 * lock object just like we do to
					 * single thread creates.
					 */
					DB_ASSERT(sizeof(db_pgno_t) ==
					    sizeof(u_int32_t));
					dbc->lock_dbt.size = sizeof(u_int32_t);
					dbc->lock_dbt.data = &dbc->lock.pgno;
					dbc->lock.pgno = 0;
				} else {
					/* One lock object per database file. */
					dbc->lock_dbt.size = DB_FILE_ID_LEN;
					dbc->lock_dbt.data = dbc->lock.fileid;
				}
			} else {
				/* Page-level locking: lock on the whole key. */
				dbc->lock.type = DB_PAGE_LOCK;
				dbc->lock_dbt.size = sizeof(dbc->lock);
				dbc->lock_dbt.data = &dbc->lock;
			}
		}

		/* Init the DBC internal structure. */
		switch (dbtype) {
		case DB_BTREE:
		case DB_RECNO:
			if ((ret = __bam_c_init(dbc, dbtype)) != 0)
				goto err;
			break;
		case DB_HASH:
			if ((ret = __ham_c_init(dbc)) != 0)
				goto err;
			break;
		case DB_QUEUE:
			if ((ret = __qam_c_init(dbc)) != 0)
				goto err;
			break;
		default:
			ret = __db_unknown_type(dbenv, "__db_icursor", dbtype);
			goto err;
		}
	}

	/*
	 * Refresh the DBC structure.  The transaction's cursor count is
	 * deliberately not bumped yet: the refresh below can still fail, and
	 * a failed call must not leave the transaction believing it has an
	 * open cursor.
	 */
	dbc->dbtype = dbtype;
	if ((dbc->txn = txn) == NULL)
		dbc->locker = dbc->lid;
	else
		dbc->locker = txn->txnid;

	if (is_opd)
		F_SET(dbc, DBC_OPD);
	if (F_ISSET(dbp, DB_AM_RECOVER))
		F_SET(dbc, DBC_RECOVER);

	/* Refresh the DBC internal structure. */
	cp = dbc->internal;
	cp->opd = NULL;

	cp->indx = 0;
	cp->page = NULL;
	cp->pgno = PGNO_INVALID;
	cp->root = root;

	switch (dbtype) {
	case DB_BTREE:
	case DB_RECNO:
		if ((ret = __bam_c_refresh(dbc)) != 0)
			goto err;
		break;
	case DB_HASH:
	case DB_QUEUE:
		break;
	default:
		/* Report the type we actually switched on. */
		ret = __db_unknown_type(dbenv, "__db_icursor", dbtype);
		goto err;
	}

	/* Nothing below can fail, so the cursor now counts against txn. */
	if (txn != NULL)
		txn->cursors++;

	MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
	TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
	F_SET(dbc, DBC_ACTIVE);
	MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);

	*dbcp = dbc;
	return (0);

err:	if (allocated)
		__os_free(dbc, sizeof(*dbc));
	else {
		/*
		 * The cursor came off the free queue; put it back so that it
		 * is not orphaned.  Its flags are reset the next time it is
		 * recycled.
		 */
		MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
		TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
		MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
	}
	return (ret);
}
#ifdef DEBUG
/*
 * __db_cprint --
 *	Display the current cursor list.
 *
 *	Writes one description per cursor on the database's active queue to
 *	stderr (type, addresses, transaction, locker ids, position and
 *	flags), followed by the addresses of the cursors on the free queue.
 *	The database mutex is held while the queues are walked.
 *
 *	Returns 0 on success, or 1 if an active cursor has an unrecognized
 *	type.
 *
 * PUBLIC: int __db_cprint __P((DB *));
 */
int
__db_cprint(dbp)
	DB *dbp;
{
	static const FN fn[] = {
		{ DBC_ACTIVE,		"active" },
		{ DBC_OPD,		"off-page-dup" },
		{ DBC_RECOVER,		"recover" },
		{ DBC_RMW,		"read-modify-write" },
		{ DBC_WRITECURSOR,	"write cursor" },
		{ DBC_WRITEDUP,		"internally dup'ed write cursor" },
		{ DBC_WRITER,		"short-term write cursor" },
		{ 0,			NULL }
	};
	DBC *dbc;
	DBC_INTERNAL *cp;
	const char *s;

	MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
	for (dbc = TAILQ_FIRST(&dbp->active_queue);
	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
		switch (dbc->dbtype) {
		case DB_BTREE:
			s = "btree";
			break;
		case DB_HASH:
			s = "hash";
			break;
		case DB_RECNO:
			s = "recno";
			break;
		case DB_QUEUE:
			s = "queue";
			break;
		default:
			DB_ASSERT(0);
			/*
			 * If the assertion is compiled out we get here; do
			 * not return with the database mutex still held.
			 */
			MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
			return (1);
		}
		cp = dbc->internal;
		fprintf(stderr, "%s/%#0lx: opd: %#0lx\n",
		    s, P_TO_ULONG(dbc), P_TO_ULONG(cp->opd));
		fprintf(stderr, "\ttxn: %#0lx lid: %lu locker: %lu\n",
		    P_TO_ULONG(dbc->txn),
		    (u_long)dbc->lid, (u_long)dbc->locker);
		fprintf(stderr, "\troot: %lu page/index: %lu/%lu",
		    (u_long)cp->root, (u_long)cp->pgno, (u_long)cp->indx);
		__db_prflags(dbc->flags, fn, stderr);
		fprintf(stderr, "\n");

		/* Btree databases get the extra access-method detail. */
		if (dbp->type == DB_BTREE)
			__bam_cprint(dbc);
	}
	for (dbc = TAILQ_FIRST(&dbp->free_queue);
	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
		fprintf(stderr, "free: %#0lx ", P_TO_ULONG(dbc));
	fprintf(stderr, "\n");
	MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);

	return (0);
}
#endif /* DEBUG */
/*
 * __db_fd --
 *	Return a file descriptor for flock'ing.
 *
 *	On success the descriptor is stored in *fdp and 0 is returned.  If
 *	the underlying file handle is not valid, *fdp is set to -1, an error
 *	message is issued and ENOENT is returned.  A failure to obtain the
 *	file handle at all is returned as is, with *fdp left untouched.
 *
 * PUBLIC: int __db_fd __P((DB *, int *));
 */
int
__db_fd(dbp, fdp)
	DB *dbp;
	int *fdp;
{
	DB_FH *handle;
	int ret;

	PANIC_CHECK(dbp->dbenv);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->fd");

	/*
	 * XXX
	 * Truly spectacular layering violation.
	 */
	ret = __mp_xxx_fh(dbp->mpf, &handle);
	if (ret != 0)
		return (ret);

	/* Without a valid handle there is no descriptor to hand out. */
	if (!F_ISSET(handle, DB_FH_VALID)) {
		*fdp = -1;
		__db_err(dbp->dbenv, "DB does not have a valid file handle.");
		return (ENOENT);
	}

	*fdp = handle->fd;
	return (0);
}
/*
 * __db_get --
 *	Return a key/data pair.
 *
 *	Implemented as a one-shot cursor operation: a cursor is opened, a
 *	single c_get is issued on it and the cursor is closed again.  The
 *	result of the c_get is returned; an error from closing the cursor is
 *	reported only when the get itself succeeded.
 *
 * PUBLIC: int __db_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
 */
int
__db_get(dbp, txn, key, data, flags)
	DB *dbp;
	DB_TXN *txn;
	DBT *key, *data;
	u_int32_t flags;
{
	DBC *dbc;
	u_int32_t getflags;
	int curmode, ret, t_ret;

	PANIC_CHECK(dbp->dbenv);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get");

	ret = __db_getchk(dbp, key, data, flags);
	if (ret != 0)
		return (ret);

	/* The consume operations open the cursor with a write lock. */
	curmode = (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) ?
	    DB_WRITELOCK : 0;

	ret = dbp->cursor(dbp, txn, &dbc, curmode);
	if (ret != 0)
		return (ret);

	DEBUG_LREAD(dbc, txn, "__db_get", key, NULL, flags);

	/*
	 * Mark the cursor DBC_TRANSIENT: it is used for exactly one operation
	 * and closed immediately afterwards, so there is no need to be able
	 * to restore its previous position on error.  That lets the get run
	 * without first duplicating the cursor, which saves some cycles in
	 * this common case.
	 */
	F_SET(dbc, DBC_TRANSIENT);

	/* A plain get, with or without DB_RMW, is a DB_SET on the cursor. */
	if (flags == 0 || flags == DB_RMW)
		getflags = flags | DB_SET;
	else
		getflags = flags;
	ret = dbc->c_get(dbc, key, data, getflags);

	t_ret = __db_c_close(dbc);
	if (t_ret != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
/*
 * __db_put --
 *	Store a key/data pair.
 *
 *	Implemented as a one-shot cursor operation on a write-locked cursor.
 *	For DB_NOOVERWRITE the key is looked up first: if it is found,
 *	DB_KEYEXIST is returned and nothing is stored; if the lookup fails
 *	with anything other than DB_NOTFOUND, that error is returned.  With
 *	no flags (or after a clean DB_NOOVERWRITE probe) the pair is stored
 *	with DB_KEYLAST; any other flag value is passed through to c_put.
 *	An error from closing the cursor is reported only when everything
 *	before it succeeded.
 *
 * PUBLIC: int __db_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
 */
int
__db_put(dbp, txn, key, data, flags)
	DB *dbp;
	DB_TXN *txn;
	DBT *key, *data;
	u_int32_t flags;
{
	DBC *dbc;
	DBT probe;
	u_int32_t getflags, putflags;
	int ret, t_ret;

	PANIC_CHECK(dbp->dbenv);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put");

	ret = __db_putchk(dbp, key, data,
	    flags, F_ISSET(dbp, DB_AM_RDONLY),
	    F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK));
	if (ret != 0)
		return (ret);

	DB_CHECK_TXN(dbp, txn);

	ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK);
	if (ret != 0)
		return (ret);

	/*
	 * As in __db_get(), the cursor is used once and then closed, so it
	 * is marked DBC_TRANSIENT.
	 *
	 * Doing the DB_NOOVERWRITE probe with this flag set is safe: if the
	 * c_get fails with anything but DB_NOTFOUND the cursor is closed
	 * without further use, and after DB_NOTFOUND the c_put(DB_KEYLAST)
	 * does not depend on the cursor's position, so it does not matter
	 * whether the access method moved the cursor.
	 */
	F_SET(dbc, DBC_TRANSIENT);

	DEBUG_LWRITE(dbc, txn, "__db_put", key, data, flags);

	putflags = flags;
	if (flags == DB_NOOVERWRITE) {
		putflags = 0;

		/*
		 * Set DB_DBT_USERMEM, this might be a threaded application
		 * and the flags checking will catch us.  We don't want the
		 * actual data, so request a partial of length 0.
		 */
		memset(&probe, 0, sizeof(probe));
		F_SET(&probe, DB_DBT_USERMEM | DB_DBT_PARTIAL);

		/*
		 * If we're doing page-level locking, set the
		 * read-modify-write flag, we're going to overwrite
		 * immediately.
		 */
		getflags = DB_SET | (STD_LOCKING(dbc) ? DB_RMW : 0);

		ret = dbc->c_get(dbc, key, &probe, getflags);
		if (ret == 0)
			ret = DB_KEYEXIST;	/* Key is already there. */
		else if (ret == DB_NOTFOUND)
			ret = 0;		/* Free to store it. */
	}

	if (ret == 0) {
		if (putflags == 0)
			putflags = DB_KEYLAST;
		ret = dbc->c_put(dbc, key, data, putflags);
	}

	t_ret = __db_c_close(dbc);
	if (t_ret != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
/*
 * __db_sync --
 *	Flush the database cache.
 *
 *	Read-only databases are never written and return 0 immediately.  For
 *	a Recno database the backing source text file is written first.
 *	Unless the database is purely in-memory, the dirty pages are then
 *	flushed from the cache to the backing file.
 *
 *	Returns 0 on success.  Otherwise returns the flag-check error, or the
 *	first error from the Recno write-back or the cache flush; a
 *	write-back failure is reported even for an in-memory database.
 *
 * PUBLIC: int __db_sync __P((DB *, u_int32_t));
 */
int
__db_sync(dbp, flags)
	DB *dbp;
	u_int32_t flags;
{
	int ret, t_ret;

	PANIC_CHECK(dbp->dbenv);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->sync");

	if ((ret = __db_syncchk(dbp, flags)) != 0)
		return (ret);

	/* Read-only trees never need to be sync'd. */
	if (F_ISSET(dbp, DB_AM_RDONLY))
		return (0);

	/* If it's a Recno tree, write the backing source text file. */
	if (dbp->type == DB_RECNO)
		ret = __ram_writeback(dbp);

	/*
	 * If the tree was never backed by a database file, we're done.
	 * Return ret rather than 0 so that a failed Recno write-back above
	 * is not silently discarded; ret is 0 if no write-back was done.
	 */
	if (F_ISSET(dbp, DB_AM_INMEM))
		return (ret);

	/* Flush any dirty pages from the cache to the backing file. */
	if ((t_ret = memp_fsync(dbp->mpf)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}