mariadb/bdb/hash/hash.c

2097 lines
53 KiB
C
Raw Normal View History

2001-03-05 01:42:05 +01:00
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999, 2000
* Sleepycat Software. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
* Margo Seltzer. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Margo Seltzer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: hash.c,v 11.94 2001/01/03 16:42:26 ubell Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#endif
#include "db_int.h"
#include "db_page.h"
#include "db_am.h"
#include "db_ext.h"
#include "db_shash.h"
#include "db_swap.h"
#include "hash.h"
#include "btree.h"
#include "log.h"
#include "lock.h"
#include "txn.h"
static int __ham_c_close __P((DBC *, db_pgno_t, int *));
static int __ham_c_del __P((DBC *));
static int __ham_c_destroy __P((DBC *));
static int __ham_c_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
static int __ham_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
static int __ham_c_writelock __P((DBC *));
static int __ham_del_dups __P((DBC *, DBT *));
static int __ham_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
static int __ham_dup_return __P((DBC *, DBT *, u_int32_t));
static int __ham_expand_table __P((DBC *));
static int __ham_init_htab __P((DBC *,
const char *, db_pgno_t, u_int32_t, u_int32_t));
static int __ham_lookup __P((DBC *,
const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *));
static int __ham_overwrite __P((DBC *, DBT *, u_int32_t));
/*
* __ham_metachk --
*
* PUBLIC: int __ham_metachk __P((DB *, const char *, HMETA *));
*/
int
__ham_metachk(dbp, name, hashm)
DB *dbp;
const char *name;
HMETA *hashm;
{
DB_ENV *dbenv;
u_int32_t vers;
int ret;
dbenv = dbp->dbenv;
/*
* At this point, all we know is that the magic number is for a Hash.
* Check the version, the database may be out of date.
*/
vers = hashm->dbmeta.version;
if (F_ISSET(dbp, DB_AM_SWAP))
M_32_SWAP(vers);
switch (vers) {
case 4:
case 5:
case 6:
__db_err(dbenv,
"%s: hash version %lu requires a version upgrade",
name, (u_long)vers);
return (DB_OLD_VERSION);
case 7:
break;
default:
__db_err(dbenv,
"%s: unsupported hash version: %lu", name, (u_long)vers);
return (EINVAL);
}
/* Swap the page if we need to. */
if (F_ISSET(dbp, DB_AM_SWAP) && (ret = __ham_mswap((PAGE *)hashm)) != 0)
return (ret);
/* Check the type. */
if (dbp->type != DB_HASH && dbp->type != DB_UNKNOWN)
return (EINVAL);
dbp->type = DB_HASH;
DB_ILLEGAL_METHOD(dbp, DB_OK_HASH);
/*
* Check application info against metadata info, and set info, flags,
* and type based on metadata info.
*/
if ((ret = __db_fchk(dbenv,
"DB->open", hashm->dbmeta.flags,
DB_HASH_DUP | DB_HASH_SUBDB | DB_HASH_DUPSORT)) != 0)
return (ret);
if (F_ISSET(&hashm->dbmeta, DB_HASH_DUP))
F_SET(dbp, DB_AM_DUP);
else
if (F_ISSET(dbp, DB_AM_DUP)) {
__db_err(dbenv,
"%s: DB_DUP specified to open method but not set in database",
name);
return (EINVAL);
}
if (F_ISSET(&hashm->dbmeta, DB_HASH_SUBDB))
F_SET(dbp, DB_AM_SUBDB);
else
if (F_ISSET(dbp, DB_AM_SUBDB)) {
__db_err(dbenv,
"%s: multiple databases specified but not supported in file",
name);
return (EINVAL);
}
if (F_ISSET(&hashm->dbmeta, DB_HASH_DUPSORT)) {
if (dbp->dup_compare == NULL)
dbp->dup_compare = __bam_defcmp;
} else
if (dbp->dup_compare != NULL) {
__db_err(dbenv,
"%s: duplicate sort function specified but not set in database",
name);
return (EINVAL);
}
/* Set the page size. */
dbp->pgsize = hashm->dbmeta.pagesize;
/* Copy the file's ID. */
memcpy(dbp->fileid, hashm->dbmeta.uid, DB_FILE_ID_LEN);
return (0);
}
/*
* __ham_open --
*
* PUBLIC: int __ham_open __P((DB *, const char *, db_pgno_t, u_int32_t));
*/
int
__ham_open(dbp, name, base_pgno, flags)
DB *dbp;
const char *name;
db_pgno_t base_pgno;
u_int32_t flags;
{
DB_ENV *dbenv;
DBC *dbc;
HASH_CURSOR *hcp;
HASH *hashp;
int need_sync, ret, t_ret;
dbc = NULL;
dbenv = dbp->dbenv;
need_sync = 0;
/* Initialize the remaining fields/methods of the DB. */
dbp->del = __ham_delete;
dbp->stat = __ham_stat;
/*
* Get a cursor. If DB_CREATE is specified, we may be creating
* pages, and to do that safely in CDB we need a write cursor.
* In STD_LOCKING mode, we'll synchronize using the meta page
* lock instead.
*/
if ((ret = dbp->cursor(dbp,
dbp->open_txn, &dbc, LF_ISSET(DB_CREATE) && CDB_LOCKING(dbenv) ?
DB_WRITECURSOR : 0)) != 0)
return (ret);
hcp = (HASH_CURSOR *)dbc->internal;
hashp = dbp->h_internal;
hashp->meta_pgno = base_pgno;
if ((ret = __ham_get_meta(dbc)) != 0)
goto err1;
/*
* If this is a new file, initialize it, and put it back dirty.
*
* Initialize the hdr structure.
*/
if (hcp->hdr->dbmeta.magic == DB_HASHMAGIC) {
/* File exists, verify the data in the header. */
if (hashp->h_hash == NULL)
hashp->h_hash = hcp->hdr->dbmeta.version < 5
? __ham_func4 : __ham_func5;
if (!F_ISSET(dbp, DB_RDONLY) &&
hashp->h_hash(dbp,
CHARKEY, sizeof(CHARKEY)) != hcp->hdr->h_charkey) {
__db_err(dbp->dbenv,
"hash: incompatible hash function");
ret = EINVAL;
goto err2;
}
if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_DUP))
F_SET(dbp, DB_AM_DUP);
if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_DUPSORT))
F_SET(dbp, DB_AM_DUPSORT);
if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_SUBDB))
F_SET(dbp, DB_AM_SUBDB);
} else if (!IS_RECOVERING(dbenv)) {
/*
* File does not exist, we must initialize the header. If
* locking is enabled that means getting a write lock first.
* During recovery the meta page will be in the log.
*/
dbc->lock.pgno = base_pgno;
if (STD_LOCKING(dbc) &&
((ret = lock_put(dbenv, &hcp->hlock)) != 0 ||
(ret = lock_get(dbenv, dbc->locker,
DB_NONBLOCK(dbc) ? DB_LOCK_NOWAIT : 0,
&dbc->lock_dbt, DB_LOCK_WRITE, &hcp->hlock)) != 0))
goto err2;
else if (CDB_LOCKING(dbp->dbenv)) {
DB_ASSERT(LF_ISSET(DB_CREATE));
if ((ret = lock_get(dbenv, dbc->locker,
DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
&dbc->mylock)) != 0)
goto err2;
}
if ((ret = __ham_init_htab(dbc, name,
base_pgno, hashp->h_nelem, hashp->h_ffactor)) != 0)
goto err2;
need_sync = 1;
}
err2: /* Release the meta data page */
if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
ret = t_ret;
err1: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
ret = t_ret;
/* Sync the file so that we know that the meta data goes to disk. */
if (ret == 0 && need_sync)
ret = dbp->sync(dbp, 0);
#if CONFIG_TEST
if (ret == 0)
DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
DB_TEST_RECOVERY_LABEL
#endif
if (ret != 0)
(void)__ham_db_close(dbp);
return (ret);
}
/************************** LOCAL CREATION ROUTINES **********************/
/*
* Returns 0 on No Error
*/
static int
__ham_init_htab(dbc, name, pgno, nelem, ffactor)
DBC *dbc;
const char *name;
db_pgno_t pgno;
u_int32_t nelem, ffactor;
{
DB *dbp;
DB_LOCK metalock;
DB_LSN orig_lsn;
DBMETA *mmeta;
HASH_CURSOR *hcp;
HASH *hashp;
PAGE *h;
db_pgno_t mpgno;
int32_t l2, nbuckets;
int dirty_mmeta, i, ret, t_ret;
hcp = (HASH_CURSOR *)dbc->internal;
dbp = dbc->dbp;
hashp = dbp->h_internal;
mmeta = NULL;
h = NULL;
ret = 0;
dirty_mmeta = 0;
metalock.off = LOCK_INVALID;
if (hashp->h_hash == NULL)
hashp->h_hash = DB_HASHVERSION < 5 ? __ham_func4 : __ham_func5;
if (nelem != 0 && ffactor != 0) {
nelem = (nelem - 1) / ffactor + 1;
l2 = __db_log2(nelem > 2 ? nelem : 2);
} else
l2 = 1;
nbuckets = 1 << l2;
orig_lsn = hcp->hdr->dbmeta.lsn;
memset(hcp->hdr, 0, sizeof(HMETA));
ZERO_LSN(hcp->hdr->dbmeta.lsn);
hcp->hdr->dbmeta.pgno = pgno;
hcp->hdr->dbmeta.magic = DB_HASHMAGIC;
hcp->hdr->dbmeta.version = DB_HASHVERSION;
hcp->hdr->dbmeta.pagesize = dbp->pgsize;
hcp->hdr->dbmeta.type = P_HASHMETA;
hcp->hdr->dbmeta.free = PGNO_INVALID;
hcp->hdr->max_bucket = hcp->hdr->high_mask = nbuckets - 1;
hcp->hdr->low_mask = (nbuckets >> 1) - 1;
hcp->hdr->ffactor = ffactor;
hcp->hdr->h_charkey = hashp->h_hash(dbp, CHARKEY, sizeof(CHARKEY));
memcpy(hcp->hdr->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
if (F_ISSET(dbp, DB_AM_DUP))
F_SET(&hcp->hdr->dbmeta, DB_HASH_DUP);
if (F_ISSET(dbp, DB_AM_SUBDB))
F_SET(&hcp->hdr->dbmeta, DB_HASH_SUBDB);
if (dbp->dup_compare != NULL)
F_SET(&hcp->hdr->dbmeta, DB_HASH_DUPSORT);
if ((ret = memp_fset(dbp->mpf, hcp->hdr, DB_MPOOL_DIRTY)) != 0)
goto err;
/*
* Create the first and second buckets pages so that we have the
* page numbers for them and we can store that page number
* in the meta-data header (spares[0]).
*/
hcp->hdr->spares[0] = nbuckets;
if ((ret = memp_fget(dbp->mpf,
&hcp->hdr->spares[0], DB_MPOOL_NEW_GROUP, &h)) != 0)
goto err;
P_INIT(h, dbp->pgsize, hcp->hdr->spares[0], PGNO_INVALID,
PGNO_INVALID, 0, P_HASH);
/* Fill in the last fields of the meta data page. */
hcp->hdr->spares[0] -= (nbuckets - 1);
for (i = 1; i <= l2; i++)
hcp->hdr->spares[i] = hcp->hdr->spares[0];
for (; i < NCACHED; i++)
hcp->hdr->spares[i] = PGNO_INVALID;
/*
* Before we are about to put any dirty pages, we need to log
* the meta-data page create.
*/
ret = __db_log_page(dbp, name, &orig_lsn, pgno, (PAGE *)hcp->hdr);
if (dbp->open_txn != NULL) {
mmeta = (DBMETA *) hcp->hdr;
if (F_ISSET(dbp, DB_AM_SUBDB)) {
/*
* If this is a subdatabase, then we need to
* get the LSN off the master meta data page
* because that's where free pages are linked
* and during recovery we need to access
* that page and roll it backward/forward
* correctly with respect to LSN.
*/
mpgno = PGNO_BASE_MD;
if ((ret = __db_lget(dbc,
0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
goto err;
if ((ret = memp_fget(dbp->mpf,
&mpgno, 0, (PAGE **)&mmeta)) != 0)
goto err;
}
if ((t_ret = __ham_groupalloc_log(dbp->dbenv,
dbp->open_txn, &LSN(mmeta), 0, dbp->log_fileid,
&LSN(mmeta), hcp->hdr->spares[0],
hcp->hdr->max_bucket + 1, mmeta->free)) != 0 && ret == 0)
ret = t_ret;
if (ret == 0) {
/* need to update real LSN for buffer manager */
dirty_mmeta = 1;
}
}
DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, name);
DB_TEST_RECOVERY_LABEL
err: if (h != NULL &&
(t_ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0 && ret == 0)
ret = t_ret;
if (F_ISSET(dbp, DB_AM_SUBDB) && mmeta != NULL)
if ((t_ret = memp_fput(dbp->mpf, mmeta,
dirty_mmeta ? DB_MPOOL_DIRTY : 0)) != 0 && ret == 0)
ret = t_ret;
if (metalock.off != LOCK_INVALID)
(void)__TLPUT(dbc, metalock);
return (ret);
}
static int
__ham_delete(dbp, txn, key, flags)
DB *dbp;
DB_TXN *txn;
DBT *key;
u_int32_t flags;
{
DBC *dbc;
HASH_CURSOR *hcp;
db_pgno_t pgno;
int ret, t_ret;
/*
* This is the only access method routine called directly from
* the dbp, so we have to do error checking.
*/
PANIC_CHECK(dbp->dbenv);
DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");
DB_CHECK_TXN(dbp, txn);
if ((ret =
__db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
return (ret);
if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
return (ret);
DEBUG_LWRITE(dbc, txn, "ham_delete", key, NULL, flags);
hcp = (HASH_CURSOR *)dbc->internal;
if ((ret = __ham_get_meta(dbc)) != 0)
goto out;
pgno = PGNO_INVALID;
if ((ret = __ham_lookup(dbc, key, 0, DB_LOCK_WRITE, &pgno)) == 0) {
if (F_ISSET(hcp, H_OK)) {
if (pgno == PGNO_INVALID)
ret = __ham_del_pair(dbc, 1);
else {
/* When we close the cursor in __ham_del_dups,
* that will make the off-page dup tree go
* go away as well as our current entry. When
* it updates cursors, ours should get marked
* as H_DELETED.
*/
ret = __ham_del_dups(dbc, key);
}
} else
ret = DB_NOTFOUND;
}
if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
ret = t_ret;
out: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
/* ****************** CURSORS ********************************** */
/*
* __ham_c_init --
* Initialize the hash-specific portion of a cursor.
*
* PUBLIC: int __ham_c_init __P((DBC *));
*/
int
__ham_c_init(dbc)
DBC *dbc;
{
DB_ENV *dbenv;
HASH_CURSOR *new_curs;
int ret;
dbenv = dbc->dbp->dbenv;
if ((ret = __os_calloc(dbenv,
1, sizeof(struct cursor_t), &new_curs)) != 0)
return (ret);
if ((ret = __os_malloc(dbenv,
dbc->dbp->pgsize, NULL, &new_curs->split_buf)) != 0) {
__os_free(new_curs, sizeof(*new_curs));
return (ret);
}
dbc->internal = (DBC_INTERNAL *) new_curs;
dbc->c_close = __db_c_close;
dbc->c_count = __db_c_count;
dbc->c_del = __db_c_del;
dbc->c_dup = __db_c_dup;
dbc->c_get = __db_c_get;
dbc->c_put = __db_c_put;
dbc->c_am_close = __ham_c_close;
dbc->c_am_del = __ham_c_del;
dbc->c_am_destroy = __ham_c_destroy;
dbc->c_am_get = __ham_c_get;
dbc->c_am_put = __ham_c_put;
dbc->c_am_writelock = __ham_c_writelock;
__ham_item_init(dbc);
return (0);
}
/*
* __ham_c_close --
* Close down the cursor from a single use.
*/
static int
__ham_c_close(dbc, root_pgno, rmroot)
DBC *dbc;
db_pgno_t root_pgno;
int *rmroot;
{
HASH_CURSOR *hcp;
HKEYDATA *dp;
int doroot, gotmeta, ret, t_ret;
u_int32_t dirty;
COMPQUIET(rmroot, 0);
dirty = 0;
doroot = gotmeta = ret = 0;
hcp = (HASH_CURSOR *) dbc->internal;
/* Check for off page dups. */
if (dbc->internal->opd != NULL) {
if ((ret = __ham_get_meta(dbc)) != 0)
goto done;
gotmeta = 1;
if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0)
goto out;
dp = (HKEYDATA *)H_PAIRDATA(hcp->page, hcp->indx);
DB_ASSERT(HPAGE_PTYPE(dp) == H_OFFDUP);
memcpy(&root_pgno, HOFFPAGE_PGNO(dp), sizeof(db_pgno_t));
if ((ret =
hcp->opd->c_am_close(hcp->opd, root_pgno, &doroot)) != 0)
goto out;
if (doroot != 0) {
if ((ret = __ham_del_pair(dbc, 1)) != 0)
goto out;
dirty = DB_MPOOL_DIRTY;
}
}
out: if (hcp->page != NULL && (t_ret =
memp_fput(dbc->dbp->mpf, hcp->page, dirty)) != 0 && ret == 0)
ret = t_ret;
if (gotmeta != 0 && (t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
ret = t_ret;
done:
__ham_item_init(dbc);
return (ret);
}
/*
* __ham_c_destroy --
* Cleanup the access method private part of a cursor.
*/
static int
__ham_c_destroy(dbc)
DBC *dbc;
{
HASH_CURSOR *hcp;
hcp = (HASH_CURSOR *)dbc->internal;
if (hcp->split_buf != NULL)
__os_free(hcp->split_buf, dbc->dbp->pgsize);
__os_free(hcp, sizeof(HASH_CURSOR));
return (0);
}
/*
* __ham_c_count --
* Return a count of on-page duplicates.
*
* PUBLIC: int __ham_c_count __P((DBC *, db_recno_t *));
*/
int
__ham_c_count(dbc, recnop)
DBC *dbc;
db_recno_t *recnop;
{
DB *dbp;
HASH_CURSOR *hcp;
db_indx_t len;
db_recno_t recno;
int ret, t_ret;
u_int8_t *p, *pend;
dbp = dbc->dbp;
hcp = (HASH_CURSOR *) dbc->internal;
recno = 0;
if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0)
return (ret);
switch (HPAGE_PTYPE(H_PAIRDATA(hcp->page, hcp->indx))) {
case H_KEYDATA:
case H_OFFPAGE:
recno = 1;
break;
case H_DUPLICATE:
p = HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx));
pend = p +
LEN_HDATA(hcp->page, dbp->pgsize, hcp->indx);
for (; p < pend; recno++) {
/* p may be odd, so copy rather than just dereffing */
memcpy(&len, p, sizeof(db_indx_t));
p += 2 * sizeof(db_indx_t) + len;
}
break;
default:
ret = __db_unknown_type(dbp->dbenv, "__ham_c_count",
HPAGE_PTYPE(H_PAIRDATA(hcp->page, hcp->indx)));
goto err;
}
*recnop = recno;
err: if ((t_ret = memp_fput(dbc->dbp->mpf, hcp->page, 0)) != 0 && ret == 0)
ret = t_ret;
hcp->page = NULL;
return (ret);
}
static int
__ham_c_del(dbc)
DBC *dbc;
{
DB *dbp;
DBT repldbt;
HASH_CURSOR *hcp;
int ret, t_ret;
dbp = dbc->dbp;
hcp = (HASH_CURSOR *)dbc->internal;
if (F_ISSET(hcp, H_DELETED))
return (DB_NOTFOUND);
if ((ret = __ham_get_meta(dbc)) != 0)
goto out;
if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0)
goto out;
/* Off-page duplicates. */
if (HPAGE_TYPE(hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP)
goto out;
if (F_ISSET(hcp, H_ISDUP)) { /* On-page duplicate. */
if (hcp->dup_off == 0 &&
DUP_SIZE(hcp->dup_len) == LEN_HDATA(hcp->page,
hcp->hdr->dbmeta.pagesize, hcp->indx))
ret = __ham_del_pair(dbc, 1);
else {
repldbt.flags = 0;
F_SET(&repldbt, DB_DBT_PARTIAL);
repldbt.doff = hcp->dup_off;
repldbt.dlen = DUP_SIZE(hcp->dup_len);
repldbt.size = 0;
repldbt.data = HKEYDATA_DATA(H_PAIRDATA(hcp->page,
hcp->indx));
ret = __ham_replpair(dbc, &repldbt, 0);
hcp->dup_tlen -= DUP_SIZE(hcp->dup_len);
F_SET(hcp, H_DELETED);
ret = __ham_c_update(dbc, DUP_SIZE(hcp->dup_len), 0, 1);
}
} else /* Not a duplicate */
ret = __ham_del_pair(dbc, 1);
out: if (ret == 0 && hcp->page != NULL &&
(t_ret = memp_fput(dbp->mpf, hcp->page, DB_MPOOL_DIRTY)) != 0)
ret = t_ret;
hcp->page = NULL;
if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
/*
* __ham_c_dup --
* Duplicate a hash cursor, such that the new one holds appropriate
* locks for the position of the original.
*
* PUBLIC: int __ham_c_dup __P((DBC *, DBC *));
*/
int
__ham_c_dup(orig_dbc, new_dbc)
DBC *orig_dbc, *new_dbc;
{
HASH_CURSOR *orig, *new;
orig = (HASH_CURSOR *)orig_dbc->internal;
new = (HASH_CURSOR *)new_dbc->internal;
new->bucket = orig->bucket;
new->lbucket = orig->lbucket;
new->dup_off = orig->dup_off;
new->dup_len = orig->dup_len;
new->dup_tlen = orig->dup_tlen;
if (F_ISSET(orig, H_DELETED))
F_SET(new, H_DELETED);
if (F_ISSET(orig, H_ISDUP))
F_SET(new, H_ISDUP);
/*
* If the old cursor held a lock and we're not in transactions, get one
* for the new one. The reason that we don't need a new lock if we're
* in a transaction is because we already hold a lock and will continue
* to do so until commit, so there is no point in reaquiring it. We
* don't know if the old lock was a read or write lock, but it doesn't
* matter. We'll get a read lock. We know that this locker already
* holds a lock of the correct type, so if we need a write lock and
* request it, we know that we'll get it.
*/
if (orig->lock.off == LOCK_INVALID || orig_dbc->txn != NULL)
return (0);
return (__ham_lock_bucket(new_dbc, DB_LOCK_READ));
}
static int
__ham_c_get(dbc, key, data, flags, pgnop)
DBC *dbc;
DBT *key;
DBT *data;
u_int32_t flags;
db_pgno_t *pgnop;
{
DB *dbp;
HASH_CURSOR *hcp;
db_lockmode_t lock_type;
int get_key, ret, t_ret;
hcp = (HASH_CURSOR *)dbc->internal;
dbp = dbc->dbp;
/* Clear OR'd in additional bits so we can check for flag equality. */
if (F_ISSET(dbc, DBC_RMW))
lock_type = DB_LOCK_WRITE;
else
lock_type = DB_LOCK_READ;
if ((ret = __ham_get_meta(dbc)) != 0)
return (ret);
hcp->seek_size = 0;
ret = 0;
get_key = 1;
switch (flags) {
case DB_PREV_NODUP:
F_SET(hcp, H_NEXT_NODUP);
/* FALLTHROUGH */
case DB_PREV:
if (IS_INITIALIZED(dbc)) {
ret = __ham_item_prev(dbc, lock_type, pgnop);
break;
}
/* FALLTHROUGH */
case DB_LAST:
ret = __ham_item_last(dbc, lock_type, pgnop);
break;
case DB_NEXT_NODUP:
F_SET(hcp, H_NEXT_NODUP);
/* FALLTHROUGH */
case DB_NEXT:
if (IS_INITIALIZED(dbc)) {
ret = __ham_item_next(dbc, lock_type, pgnop);
break;
}
/* FALLTHROUGH */
case DB_FIRST:
ret = __ham_item_first(dbc, lock_type, pgnop);
break;
case DB_NEXT_DUP:
/* cgetchk has already determined that the cursor is set. */
F_SET(hcp, H_DUPONLY);
ret = __ham_item_next(dbc, lock_type, pgnop);
break;
case DB_SET:
case DB_SET_RANGE:
case DB_GET_BOTH:
ret = __ham_lookup(dbc, key, 0, lock_type, pgnop);
get_key = 0;
break;
case DB_GET_BOTHC:
F_SET(hcp, H_DUPONLY);
ret = __ham_item_next(dbc, lock_type, pgnop);
get_key = 0;
break;
case DB_CURRENT:
/* cgetchk has already determined that the cursor is set. */
if (F_ISSET(hcp, H_DELETED)) {
ret = DB_KEYEMPTY;
goto err;
}
ret = __ham_item(dbc, lock_type, pgnop);
break;
}
/*
* Must always enter this loop to do error handling and
* check for big key/data pair.
*/
for (;;) {
if (ret != 0 && ret != DB_NOTFOUND)
goto err;
else if (F_ISSET(hcp, H_OK)) {
if (*pgnop == PGNO_INVALID)
ret = __ham_dup_return (dbc, data, flags);
break;
} else if (!F_ISSET(hcp, H_NOMORE)) {
__db_err(dbp->dbenv,
"H_NOMORE returned to __ham_c_get");
ret = EINVAL;
break;
}
/*
* Ran out of entries in a bucket; change buckets.
*/
switch (flags) {
case DB_LAST:
case DB_PREV:
case DB_PREV_NODUP:
ret = memp_fput(dbp->mpf, hcp->page, 0);
hcp->page = NULL;
if (hcp->bucket == 0) {
ret = DB_NOTFOUND;
hcp->pgno = PGNO_INVALID;
goto err;
}
F_CLR(hcp, H_ISDUP);
hcp->bucket--;
hcp->indx = NDX_INVALID;
hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
if (ret == 0)
ret = __ham_item_prev(dbc,
lock_type, pgnop);
break;
case DB_FIRST:
case DB_NEXT:
case DB_NEXT_NODUP:
ret = memp_fput(dbp->mpf, hcp->page, 0);
hcp->page = NULL;
hcp->indx = NDX_INVALID;
hcp->bucket++;
F_CLR(hcp, H_ISDUP);
hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
if (hcp->bucket > hcp->hdr->max_bucket) {
ret = DB_NOTFOUND;
hcp->pgno = PGNO_INVALID;
goto err;
}
if (ret == 0)
ret = __ham_item_next(dbc,
lock_type, pgnop);
break;
case DB_GET_BOTH:
case DB_GET_BOTHC:
case DB_NEXT_DUP:
case DB_SET:
case DB_SET_RANGE:
/* Key not found. */
ret = DB_NOTFOUND;
goto err;
case DB_CURRENT:
/*
* This should only happen if you are doing
* deletes and reading with concurrent threads
* and not doing proper locking. We return
* the same error code as we would if the
* cursor were deleted.
*/
ret = DB_KEYEMPTY;
goto err;
default:
DB_ASSERT(0);
}
}
if (get_key == 0)
F_SET(key, DB_DBT_ISSET);
err: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
ret = t_ret;
F_CLR(hcp, H_DUPONLY);
F_CLR(hcp, H_NEXT_NODUP);
return (ret);
}
static int
__ham_c_put(dbc, key, data, flags, pgnop)
DBC *dbc;
DBT *key;
DBT *data;
u_int32_t flags;
db_pgno_t *pgnop;
{
DB *dbp;
DBT tmp_val, *myval;
HASH_CURSOR *hcp;
u_int32_t nbytes;
int ret, t_ret;
/*
* The compiler doesn't realize that we only use this when ret is
* equal to 0 and that if ret is equal to 0, that we must have set
* myval. So, we initialize it here to shut the compiler up.
*/
COMPQUIET(myval, NULL);
dbp = dbc->dbp;
hcp = (HASH_CURSOR *)dbc->internal;
if (F_ISSET(hcp, H_DELETED) &&
flags != DB_KEYFIRST && flags != DB_KEYLAST)
return (DB_NOTFOUND);
if ((ret = __ham_get_meta(dbc)) != 0)
goto err1;
switch (flags) {
case DB_KEYLAST:
case DB_KEYFIRST:
case DB_NODUPDATA:
nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
HKEYDATA_PSIZE(key->size)) +
(ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
HKEYDATA_PSIZE(data->size));
if ((ret = __ham_lookup(dbc,
key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) {
ret = 0;
if (hcp->seek_found_page != PGNO_INVALID &&
hcp->seek_found_page != hcp->pgno) {
if ((ret = memp_fput(dbp->mpf, hcp->page, 0))
!= 0)
goto err2;
hcp->page = NULL;
hcp->pgno = hcp->seek_found_page;
hcp->indx = NDX_INVALID;
}
if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
/*
* A partial put, but the key does not exist
* and we are not beginning the write at 0.
* We must create a data item padded up to doff
* and then write the new bytes represented by
* val.
*/
if ((ret = __ham_init_dbt(dbp->dbenv,
&tmp_val, data->size + data->doff,
&dbc->rdata.data, &dbc->rdata.ulen)) == 0) {
memset(tmp_val.data, 0, data->doff);
memcpy((u_int8_t *)tmp_val.data +
data->doff, data->data, data->size);
myval = &tmp_val;
}
} else
myval = (DBT *)data;
if (ret == 0)
ret = __ham_add_el(dbc, key, myval, H_KEYDATA);
goto done;
}
break;
case DB_BEFORE:
case DB_AFTER:
case DB_CURRENT:
ret = __ham_item(dbc, DB_LOCK_WRITE, pgnop);
break;
}
if (*pgnop == PGNO_INVALID && ret == 0) {
if (flags == DB_CURRENT ||
((flags == DB_KEYFIRST ||
flags == DB_KEYLAST || flags == DB_NODUPDATA) &&
!(F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))))
ret = __ham_overwrite(dbc, data, flags);
else
ret = __ham_add_dup(dbc, data, flags, pgnop);
}
done: if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
ret = __ham_expand_table(dbc);
F_CLR(hcp, H_EXPAND);
}
if (ret == 0 &&
(t_ret = memp_fset(dbp->mpf, hcp->page, DB_MPOOL_DIRTY)) != 0)
ret = t_ret;
err2: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
ret = t_ret;
err1: return (ret);
}
/********************************* UTILITIES ************************/
/*
* __ham_expand_table --
*/
static int
__ham_expand_table(dbc)
DBC *dbc;
{
DB *dbp;
PAGE *h;
HASH_CURSOR *hcp;
db_pgno_t pgno;
u_int32_t old_bucket, new_bucket;
int ret;
dbp = dbc->dbp;
hcp = (HASH_CURSOR *)dbc->internal;
if ((ret = __ham_dirty_meta(dbc)) != 0)
return (ret);
/*
* If the split point is about to increase, make sure that we
* have enough extra pages. The calculation here is weird.
* We'd like to do this after we've upped max_bucket, but it's
* too late then because we've logged the meta-data split. What
* we'll do between then and now is increment max bucket and then
* see what the log of one greater than that is; here we have to
* look at the log of max + 2. VERY NASTY STUFF.
*
* It just got even nastier. With subdatabases, we have to request
* a chunk of contiguous pages, so we do that here using an
* undocumented feature of mpool (the MPOOL_NEW_GROUP flag) to
* give us a number of contiguous pages. Ouch.
*/
if (hcp->hdr->max_bucket == hcp->hdr->high_mask) {
/*
* Ask mpool to give us a set of contiguous page numbers
* large enough to contain the next doubling.
*
* Figure out how many new pages we need. This will return
* us the last page. We calculate its page number, initialize
* the page and then write it back to reserve all the pages
* in between. It is possible that the allocation of new pages
* has already been done, but the tranaction aborted. Since
* we don't undo the allocation, check for a valid pgno before
* doing the allocation.
*/
pgno = hcp->hdr->max_bucket + 1;
if (hcp->hdr->spares[__db_log2(pgno) + 1] == PGNO_INVALID)
/* Allocate a group of pages. */
ret = memp_fget(dbp->mpf,
&pgno, DB_MPOOL_NEW_GROUP, &h);
else {
/* Just read in the last page of the batch */
pgno = hcp->hdr->spares[__db_log2(pgno) + 1] +
hcp->hdr->max_bucket + 1;
/* Move to the last page of the group. */
pgno += hcp->hdr->max_bucket;
ret = memp_fget(dbp->mpf,
&pgno, DB_MPOOL_CREATE, &h);
}
if (ret != 0)
return (ret);
P_INIT(h, dbp->pgsize, pgno,
PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
pgno -= hcp->hdr->max_bucket;
} else {
pgno = BUCKET_TO_PAGE(hcp, hcp->hdr->max_bucket + 1);
if ((ret =
memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, &h)) != 0)
return (ret);
}
/* Now we can log the meta-data split. */
if (DB_LOGGING(dbc)) {
if ((ret = __ham_metagroup_log(dbp->dbenv,
dbc->txn, &h->lsn, 0, dbp->log_fileid,
hcp->hdr->max_bucket, pgno, &hcp->hdr->dbmeta.lsn,
&h->lsn)) != 0) {
(void)memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
return (ret);
}
hcp->hdr->dbmeta.lsn = h->lsn;
}
/* If we allocated some new pages, write out the last page. */
if ((ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
return (ret);
new_bucket = ++hcp->hdr->max_bucket;
old_bucket = (hcp->hdr->max_bucket & hcp->hdr->low_mask);
/*
* If we started a new doubling, fill in the spares array with
* the starting page number negatively offset by the bucket number.
*/
if (new_bucket > hcp->hdr->high_mask) {
/* Starting a new doubling */
hcp->hdr->low_mask = hcp->hdr->high_mask;
hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
if (hcp->hdr->spares[__db_log2(new_bucket) + 1] == PGNO_INVALID)
hcp->hdr->spares[__db_log2(new_bucket) + 1] =
pgno - new_bucket;
}
/* Relocate records to the new bucket */
return (__ham_split_page(dbc, old_bucket, new_bucket));
}
/*
* PUBLIC: u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, int32_t));
*/
u_int32_t
__ham_call_hash(dbc, k, len)
DBC *dbc;
u_int8_t *k;
int32_t len;
{
DB *dbp;
u_int32_t n, bucket;
HASH_CURSOR *hcp;
HASH *hashp;
dbp = dbc->dbp;
hcp = (HASH_CURSOR *)dbc->internal;
hashp = dbp->h_internal;
n = (u_int32_t)(hashp->h_hash(dbp, k, len));
bucket = n & hcp->hdr->high_mask;
if (bucket > hcp->hdr->max_bucket)
bucket = bucket & hcp->hdr->low_mask;
return (bucket);
}
/*
* Check for duplicates, and call __db_ret appropriately. Release
* everything held by the cursor.
*/
static int
__ham_dup_return (dbc, val, flags)
DBC *dbc;
DBT *val;
u_int32_t flags;
{
DB *dbp;
HASH_CURSOR *hcp;
PAGE *pp;
DBT *myval, tmp_val;
db_indx_t ndx;
db_pgno_t pgno;
u_int32_t off, tlen;
u_int8_t *hk, type;
int cmp, ret;
db_indx_t len;
/* Check for duplicate and return the first one. */
dbp = dbc->dbp;
hcp = (HASH_CURSOR *)dbc->internal;
ndx = H_DATAINDEX(hcp->indx);
type = HPAGE_TYPE(hcp->page, ndx);
pp = hcp->page;
myval = val;
/*
* There are 4 cases:
* 1. We are not in duplicate, simply return; the upper layer
* will do the right thing.
* 2. We are looking at keys and stumbled onto a duplicate.
* 3. We are in the middle of a duplicate set. (ISDUP set)
* 4. We need to check for particular data match.
*/
/* We should never get here with off-page dups. */
DB_ASSERT(type != H_OFFDUP);
/* Case 1 */
if (type != H_DUPLICATE &&
flags != DB_GET_BOTH && flags != DB_GET_BOTHC)
return (0);
/*
* Here we check for the case where we just stumbled onto a
* duplicate. In this case, we do initialization and then
* let the normal duplicate code handle it. (Case 2)
*/
if (!F_ISSET(hcp, H_ISDUP) && type == H_DUPLICATE) {
F_SET(hcp, H_ISDUP);
hcp->dup_tlen = LEN_HDATA(hcp->page,
hcp->hdr->dbmeta.pagesize, hcp->indx);
hk = H_PAIRDATA(hcp->page, hcp->indx);
if (flags == DB_LAST
|| flags == DB_PREV || flags == DB_PREV_NODUP) {
hcp->dup_off = 0;
do {
memcpy(&len,
HKEYDATA_DATA(hk) + hcp->dup_off,
sizeof(db_indx_t));
hcp->dup_off += DUP_SIZE(len);
} while (hcp->dup_off < hcp->dup_tlen);
hcp->dup_off -= DUP_SIZE(len);
} else {
memcpy(&len,
HKEYDATA_DATA(hk), sizeof(db_indx_t));
hcp->dup_off = 0;
}
hcp->dup_len = len;
}
/*
* If we are retrieving a specific key/data pair, then we
* may need to adjust the cursor before returning data.
* Case 4
*/
if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC) {
if (F_ISSET(hcp, H_ISDUP)) {
/*
* If we're doing a join, search forward from the
* current position, not the beginning of the dup set.
*/
if (flags == DB_GET_BOTHC)
F_SET(hcp, H_CONTINUE);
__ham_dsearch(dbc, val, &off, &cmp);
/*
* This flag is set nowhere else and is safe to
* clear unconditionally.
*/
F_CLR(hcp, H_CONTINUE);
hcp->dup_off = off;
} else {
hk = H_PAIRDATA(hcp->page, hcp->indx);
if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
memcpy(&tlen,
HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
memcpy(&pgno,
HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
if ((ret = __db_moff(dbp, val,
pgno, tlen, dbp->dup_compare, &cmp)) != 0)
return (ret);
} else {
/*
* We do not zero tmp_val since the comparison
* routines may only look at data and size.
*/
tmp_val.data = HKEYDATA_DATA(hk);
tmp_val.size = LEN_HDATA(hcp->page,
dbp->pgsize, hcp->indx);
cmp = dbp->dup_compare == NULL ?
__bam_defcmp(dbp, &tmp_val, val) :
dbp->dup_compare(dbp, &tmp_val, val);
}
}
if (cmp != 0)
return (DB_NOTFOUND);
}
/*
* Now, everything is initialized, grab a duplicate if
* necessary.
*/
if (F_ISSET(hcp, H_ISDUP)) { /* Case 3 */
/*
* Copy the DBT in case we are retrieving into user
* memory and we need the parameters for it. If the
* user requested a partial, then we need to adjust
* the user's parameters to get the partial of the
* duplicate which is itself a partial.
*/
memcpy(&tmp_val, val, sizeof(*val));
if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
/*
* Take the user's length unless it would go
* beyond the end of the duplicate.
*/
if (tmp_val.doff + hcp->dup_off > hcp->dup_len)
tmp_val.dlen = 0;
else if (tmp_val.dlen + tmp_val.doff >
hcp->dup_len)
tmp_val.dlen =
hcp->dup_len - tmp_val.doff;
/*
* Calculate the new offset.
*/
tmp_val.doff += hcp->dup_off;
} else {
F_SET(&tmp_val, DB_DBT_PARTIAL);
tmp_val.dlen = hcp->dup_len;
tmp_val.doff = hcp->dup_off + sizeof(db_indx_t);
}
myval = &tmp_val;
}
/*
* Finally, if we had a duplicate, pp, ndx, and myval should be
* set appropriately.
*/
if ((ret = __db_ret(dbp, pp, ndx, myval, &dbc->rdata.data,
&dbc->rdata.ulen)) != 0)
return (ret);
/*
* In case we sent a temporary off to db_ret, set the real
* return values.
*/
val->data = myval->data;
val->size = myval->size;
F_SET(val, DB_DBT_ISSET);
return (0);
}
static int
__ham_overwrite(dbc, nval, flags)
DBC *dbc;
DBT *nval;
u_int32_t flags;
{
DB *dbp;
HASH_CURSOR *hcp;
DBT *myval, tmp_val, tmp_val2;
void *newrec;
u_int8_t *hk, *p;
u_int32_t len, nondup_size;
db_indx_t newsize;
int ret;
dbp = dbc->dbp;
hcp = (HASH_CURSOR *)dbc->internal;
if (F_ISSET(hcp, H_ISDUP)) {
/*
* This is an overwrite of a duplicate. We should never
* be off-page at this point.
*/
DB_ASSERT(hcp->opd == NULL);
/* On page dups */
if (F_ISSET(nval, DB_DBT_PARTIAL)) {
/*
* We're going to have to get the current item, then
* construct the record, do any padding and do a
* replace.
*/
memset(&tmp_val, 0, sizeof(tmp_val));
if ((ret =
__ham_dup_return (dbc, &tmp_val, DB_CURRENT)) != 0)
return (ret);
/* Figure out new size. */
nondup_size = tmp_val.size;
newsize = nondup_size;
/*
* Three cases:
* 1. strictly append (may need to allocate space
* for pad bytes; really gross).
* 2. overwrite some and append.
* 3. strictly overwrite.
*/
if (nval->doff > nondup_size)
newsize +=
(nval->doff - nondup_size + nval->size);
else if (nval->doff + nval->dlen > nondup_size)
newsize += nval->size -
(nondup_size - nval->doff);
else
newsize += nval->size - nval->dlen;
/*
* Make sure that the new size doesn't put us over
* the onpage duplicate size in which case we need
* to convert to off-page duplicates.
*/
if (ISBIG(hcp, hcp->dup_tlen - nondup_size + newsize)) {
if ((ret = __ham_dup_convert(dbc)) != 0)
return (ret);
return (hcp->opd->c_am_put(hcp->opd,
NULL, nval, flags, NULL));
}
if ((ret = __os_malloc(dbp->dbenv,
DUP_SIZE(newsize), NULL, &newrec)) != 0)
return (ret);
memset(&tmp_val2, 0, sizeof(tmp_val2));
F_SET(&tmp_val2, DB_DBT_PARTIAL);
/* Construct the record. */
p = newrec;
/* Initial size. */
memcpy(p, &newsize, sizeof(db_indx_t));
p += sizeof(db_indx_t);
/* First part of original record. */
len = nval->doff > tmp_val.size
? tmp_val.size : nval->doff;
memcpy(p, tmp_val.data, len);
p += len;
if (nval->doff > tmp_val.size) {
/* Padding */
memset(p, 0, nval->doff - tmp_val.size);
p += nval->doff - tmp_val.size;
}
/* New bytes */
memcpy(p, nval->data, nval->size);
p += nval->size;
/* End of original record (if there is any) */
if (nval->doff + nval->dlen < tmp_val.size) {
len = tmp_val.size - nval->doff - nval->dlen;
memcpy(p, (u_int8_t *)tmp_val.data +
nval->doff + nval->dlen, len);
p += len;
}
/* Final size. */
memcpy(p, &newsize, sizeof(db_indx_t));
/*
* Make sure that the caller isn't corrupting
* the sort order.
*/
if (dbp->dup_compare != NULL) {
tmp_val2.data =
(u_int8_t *)newrec + sizeof(db_indx_t);
tmp_val2.size = newsize;
if (dbp->dup_compare(
dbp, &tmp_val, &tmp_val2) != 0) {
(void)__os_free(newrec,
DUP_SIZE(newsize));
return (__db_duperr(dbp, flags));
}
}
tmp_val2.data = newrec;
tmp_val2.size = DUP_SIZE(newsize);
tmp_val2.doff = hcp->dup_off;
tmp_val2.dlen = DUP_SIZE(hcp->dup_len);
ret = __ham_replpair(dbc, &tmp_val2, 0);
(void)__os_free(newrec, DUP_SIZE(newsize));
/* Update cursor */
if (ret != 0)
return (ret);
if (newsize > nondup_size)
hcp->dup_tlen += (newsize - nondup_size);
else
hcp->dup_tlen -= (nondup_size - newsize);
hcp->dup_len = DUP_SIZE(newsize);
return (0);
} else {
/* Check whether we need to convert to off page. */
if (ISBIG(hcp,
hcp->dup_tlen - hcp->dup_len + nval->size)) {
if ((ret = __ham_dup_convert(dbc)) != 0)
return (ret);
return (hcp->opd->c_am_put(hcp->opd,
NULL, nval, flags, NULL));
}
/* Make sure we maintain sort order. */
if (dbp->dup_compare != NULL) {
tmp_val2.data =
HKEYDATA_DATA(H_PAIRDATA(hcp->page,
hcp->indx)) + hcp->dup_off +
sizeof(db_indx_t);
tmp_val2.size = hcp->dup_len;
if (dbp->dup_compare(dbp, nval, &tmp_val2) != 0)
return (EINVAL);
}
/* Overwriting a complete duplicate. */
if ((ret =
__ham_make_dup(dbp->dbenv, nval,
&tmp_val, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
return (ret);
/* Now fix what we are replacing. */
tmp_val.doff = hcp->dup_off;
tmp_val.dlen = DUP_SIZE(hcp->dup_len);
/* Update cursor */
if (nval->size > hcp->dup_len)
hcp->dup_tlen += (nval->size - hcp->dup_len);
else
hcp->dup_tlen -= (hcp->dup_len - nval->size);
hcp->dup_len = DUP_SIZE(nval->size);
}
myval = &tmp_val;
} else if (!F_ISSET(nval, DB_DBT_PARTIAL)) {
/* Put/overwrite */
memcpy(&tmp_val, nval, sizeof(*nval));
F_SET(&tmp_val, DB_DBT_PARTIAL);
tmp_val.doff = 0;
hk = H_PAIRDATA(hcp->page, hcp->indx);
if (HPAGE_PTYPE(hk) == H_OFFPAGE)
memcpy(&tmp_val.dlen,
HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
else
tmp_val.dlen = LEN_HDATA(hcp->page,
hcp->hdr->dbmeta.pagesize, hcp->indx);
myval = &tmp_val;
} else
/* Regular partial put */
myval = nval;
return (__ham_replpair(dbc, myval, 0));
}
/*
* Given a key and a cursor, sets the cursor to the page/ndx on which
* the key resides. If the key is found, the cursor H_OK flag is set
* and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set.
* If the key is not found, the H_OK flag is not set. If the sought
* field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields
* are set indicating where an add might take place. If it is 0,
* non of the cursor pointer field are valid.
*/
static int
__ham_lookup(dbc, key, sought, mode, pgnop)
DBC *dbc;
const DBT *key;
u_int32_t sought;
db_lockmode_t mode;
db_pgno_t *pgnop;
{
DB *dbp;
HASH_CURSOR *hcp;
db_pgno_t pgno;
u_int32_t tlen;
int match, ret;
u_int8_t *hk, *dk;
dbp = dbc->dbp;
hcp = (HASH_CURSOR *)dbc->internal;
/*
* Set up cursor so that we're looking for space to add an item
* as we cycle through the pages looking for the key.
*/
if ((ret = __ham_item_reset(dbc)) != 0)
return (ret);
hcp->seek_size = sought;
hcp->bucket = __ham_call_hash(dbc, (u_int8_t *)key->data, key->size);
hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
while (1) {
*pgnop = PGNO_INVALID;
if ((ret = __ham_item_next(dbc, mode, pgnop)) != 0)
return (ret);
if (F_ISSET(hcp, H_NOMORE))
break;
hk = H_PAIRKEY(hcp->page, hcp->indx);
switch (HPAGE_PTYPE(hk)) {
case H_OFFPAGE:
memcpy(&tlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
if (tlen == key->size) {
memcpy(&pgno,
HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
if ((ret = __db_moff(dbp,
key, pgno, tlen, NULL, &match)) != 0)
return (ret);
if (match == 0)
goto found_key;
}
break;
case H_KEYDATA:
if (key->size ==
LEN_HKEY(hcp->page, dbp->pgsize, hcp->indx) &&
memcmp(key->data,
HKEYDATA_DATA(hk), key->size) == 0) {
/* Found the key, check for data type. */
found_key: F_SET(hcp, H_OK);
dk = H_PAIRDATA(hcp->page, hcp->indx);
if (HPAGE_PTYPE(dk) == H_OFFDUP)
memcpy(pgnop, HOFFDUP_PGNO(dk),
sizeof(db_pgno_t));
return (0);
}
break;
case H_DUPLICATE:
case H_OFFDUP:
/*
* These are errors because keys are never
* duplicated, only data items are.
*/
return (__db_pgfmt(dbp, PGNO(hcp->page)));
}
}
/*
* Item was not found.
*/
if (sought != 0)
return (ret);
return (ret);
}
/*
* __ham_init_dbt --
* Initialize a dbt using some possibly already allocated storage
* for items.
*
* PUBLIC: int __ham_init_dbt __P((DB_ENV *,
* PUBLIC: DBT *, u_int32_t, void **, u_int32_t *));
*/
int
__ham_init_dbt(dbenv, dbt, size, bufp, sizep)
DB_ENV *dbenv;
DBT *dbt;
u_int32_t size;
void **bufp;
u_int32_t *sizep;
{
int ret;
memset(dbt, 0, sizeof(*dbt));
if (*sizep < size) {
if ((ret = __os_realloc(dbenv, size, NULL, bufp)) != 0) {
*sizep = 0;
return (ret);
}
*sizep = size;
}
dbt->data = *bufp;
dbt->size = size;
return (0);
}
/*
* Adjust the cursor after an insert or delete. The cursor passed is
* the one that was operated upon; we just need to check any of the
* others.
*
* len indicates the length of the item added/deleted
* add indicates if the item indicated by the cursor has just been
* added (add == 1) or deleted (add == 0).
* dup indicates if the addition occurred into a duplicate set.
*
* PUBLIC: int __ham_c_update
* PUBLIC: __P((DBC *, u_int32_t, int, int));
*/
int
__ham_c_update(dbc, len, add, is_dup)
DBC *dbc;
u_int32_t len;
int add, is_dup;
{
DB *dbp, *ldbp;
DBC *cp;
DB_ENV *dbenv;
DB_LSN lsn;
DB_TXN *my_txn;
HASH_CURSOR *hcp, *lcp;
int found, ret;
u_int32_t order;
dbp = dbc->dbp;
dbenv = dbp->dbenv;
hcp = (HASH_CURSOR *)dbc->internal;
/*
* Adjustment will only be logged if this is a subtransaction.
* Only subtransactions can abort and effect their parent
* transactions cursors.
*/
my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
found = 0;
MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
/*
* Calcuate the order of this deleted record.
* This will be one grater than any cursor that is pointing
* at this record and already marked as deleted.
*/
order = 0;
if (!add) {
order = 1;
for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
ldbp = LIST_NEXT(ldbp, dblistlinks)) {
MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL;
cp = TAILQ_NEXT(cp, links)) {
if (cp == dbc || cp->dbtype != DB_HASH)
continue;
lcp = (HASH_CURSOR *)cp->internal;
if (F_ISSET(lcp, H_DELETED) &&
hcp->pgno == lcp->pgno &&
hcp->indx == lcp->indx &&
order <= lcp->order &&
(!is_dup || hcp->dup_off == lcp->dup_off))
order = lcp->order +1;
}
MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
}
hcp->order = order;
}
for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
ldbp = LIST_NEXT(ldbp, dblistlinks)) {
MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL;
cp = TAILQ_NEXT(cp, links)) {
if (cp == dbc || cp->dbtype != DB_HASH)
continue;
lcp = (HASH_CURSOR *)cp->internal;
if (lcp->pgno != hcp->pgno || lcp->indx == NDX_INVALID)
continue;
if (my_txn != NULL && cp->txn != my_txn)
found = 1;
if (!is_dup) {
if (add) {
/*
* This routine is not called to add
* non-dup records which are always put
* at the end. It is only called from
* recovery in this case and the
* cursor will be marked deleted.
* We are "undeleting" so unmark all
* cursors with the same order.
*/
if (lcp->indx == hcp->indx
&& F_ISSET(lcp, H_DELETED)) {
if (lcp->order == hcp->order)
F_CLR(lcp, H_DELETED);
else if (lcp->order >
hcp->order) {
/*
* If we've moved this cursor's
* index, split its order
* number--i.e., decrement it by
* enough so that the lowest
* cursor moved has order 1.
* cp_arg->order is the split
* point, so decrement by one
* less than that.
*/
lcp->order -=
(hcp->order - 1);
lcp->indx += 2;
}
} else if (lcp->indx >= hcp->indx)
lcp->indx += 2;
} else {
if (lcp->indx > hcp->indx) {
lcp->indx -= 2;
if (lcp->indx == hcp->indx
&& F_ISSET(lcp, H_DELETED))
lcp->order += order;
} else if (lcp->indx == hcp->indx
&& !F_ISSET(lcp, H_DELETED)) {
F_SET(lcp, H_DELETED);
lcp->order = order;
}
}
} else if (lcp->indx == hcp->indx) {
/*
* Handle duplicates. This routine is
* only called for on page dups.
* Off page dups are handled by btree/rtree
* code.
*/
if (add) {
lcp->dup_tlen += len;
if (lcp->dup_off == hcp->dup_off
&& F_ISSET(hcp, H_DELETED)
&& F_ISSET(lcp, H_DELETED)) {
/* Abort of a delete. */
if (lcp->order == hcp->order)
F_CLR(lcp, H_DELETED);
else if (lcp->order >
hcp->order) {
lcp->order -=
(hcp->order -1);
lcp->dup_off += len;
}
} else if (lcp->dup_off >= hcp->dup_off)
lcp->dup_off += len;
} else {
lcp->dup_tlen -= len;
if (lcp->dup_off > hcp->dup_off) {
lcp->dup_off -= len;
if (lcp->dup_off == hcp->dup_off
&& F_ISSET(lcp, H_DELETED))
lcp->order += order;
} else if (lcp->dup_off ==
hcp->dup_off &&
!F_ISSET(lcp, H_DELETED)) {
F_SET(lcp, H_DELETED);
lcp->order = order;
}
}
}
}
MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
}
MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
if (found != 0 && DB_LOGGING(dbc)) {
if ((ret = __ham_curadj_log(dbenv,
my_txn, &lsn, 0, dbp->log_fileid, hcp->pgno,
hcp->indx, len, hcp->dup_off, add, is_dup, order)) != 0)
return (ret);
}
return (0);
}
/*
* __ham_get_clist --
*
* Get a list of cursors either on a particular bucket or on a particular
* page and index combination. The former is so that we can update
* cursors on a split. The latter is so we can update cursors when we
* move items off page.
*
* PUBLIC: int __ham_get_clist __P((DB *,
* PUBLIC: db_pgno_t, u_int32_t, DBC ***));
*/
int
__ham_get_clist(dbp, bucket, indx, listp)
DB *dbp;
db_pgno_t bucket;
u_int32_t indx;
DBC ***listp;
{
DB *ldbp;
DBC *cp;
DB_ENV *dbenv;
int nalloc, nused, ret;
/*
* Assume that finding anything is the exception, so optimize for
* the case where there aren't any.
*/
nalloc = nused = 0;
*listp = NULL;
dbenv = dbp->dbenv;
MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
ldbp = LIST_NEXT(ldbp, dblistlinks)) {
MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL;
cp = TAILQ_NEXT(cp, links))
if (cp->dbtype == DB_HASH &&
((indx == NDX_INVALID &&
((HASH_CURSOR *)(cp->internal))->bucket
== bucket) || (indx != NDX_INVALID &&
cp->internal->pgno == bucket &&
cp->internal->indx == indx))) {
if (nused >= nalloc) {
nalloc += 10;
if ((ret = __os_realloc(dbp->dbenv,
nalloc * sizeof(HASH_CURSOR *),
NULL, listp)) != 0)
return (ret);
}
(*listp)[nused++] = cp;
}
MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
}
MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
if (listp != NULL) {
if (nused >= nalloc) {
nalloc++;
if ((ret = __os_realloc(dbp->dbenv,
nalloc * sizeof(HASH_CURSOR *), NULL, listp)) != 0)
return (ret);
}
(*listp)[nused] = NULL;
}
return (0);
}
static int
__ham_del_dups(orig_dbc, key)
DBC *orig_dbc;
DBT *key;
{
DBC *dbc;
DBT data, lkey;
int ret, t_ret;
/* Allocate a cursor. */
if ((ret = orig_dbc->c_dup(orig_dbc, &dbc, 0)) != 0)
return (ret);
/*
* Walk a cursor through the key/data pairs, deleting as we go. Set
* the DB_DBT_USERMEM flag, as this might be a threaded application
* and the flags checking will catch us. We don't actually want the
* keys or data, so request a partial of length 0.
*/
memset(&lkey, 0, sizeof(lkey));
F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL);
memset(&data, 0, sizeof(data));
F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
/* Walk through the set of key/data pairs, deleting as we go. */
if ((ret = dbc->c_get(dbc, key, &data, DB_SET)) != 0) {
if (ret == DB_NOTFOUND)
ret = 0;
goto err;
}
for (;;) {
if ((ret = dbc->c_del(dbc, 0)) != 0)
goto err;
if ((ret = dbc->c_get(dbc, &lkey, &data, DB_NEXT_DUP)) != 0) {
if (ret == DB_NOTFOUND) {
ret = 0;
break;
}
goto err;
}
}
err: /*
* Discard the cursor. This will cause the underlying off-page dup
* tree to go away as well as the actual entry on the page.
*/
if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
static int
__ham_c_writelock(dbc)
DBC *dbc;
{
HASH_CURSOR *hcp;
DB_LOCK tmp_lock;
int ret;
/*
* All we need do is acquire the lock and let the off-page
* dup tree do its thing.
*/
if (!STD_LOCKING(dbc))
return (0);
hcp = (HASH_CURSOR *)dbc->internal;
if ((hcp->lock.off == LOCK_INVALID || hcp->lock_mode == DB_LOCK_READ)) {
tmp_lock = hcp->lock;
if ((ret = __ham_lock_bucket(dbc, DB_LOCK_WRITE)) != 0)
return (ret);
if (tmp_lock.off != LOCK_INVALID &&
(ret = lock_put(dbc->dbp->dbenv, &tmp_lock)) != 0)
return (ret);
}
return (0);
}
/*
* __ham_c_chgpg --
*
* Adjust the cursors after moving an item from one page to another.
* If the old_index is NDX_INVALID, that means that we copied the
* page wholesale and we're leaving indices intact and just changing
* the page number.
*
* PUBLIC: int __ham_c_chgpg
* PUBLIC: __P((DBC *, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
*/
int
__ham_c_chgpg(dbc, old_pgno, old_index, new_pgno, new_index)
DBC *dbc;
db_pgno_t old_pgno, new_pgno;
u_int32_t old_index, new_index;
{
DB *dbp, *ldbp;
DB_ENV *dbenv;
DB_LSN lsn;
DB_TXN *my_txn;
DBC *cp;
HASH_CURSOR *hcp;
int found, ret;
dbp = dbc->dbp;
dbenv = dbp->dbenv;
my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
found = 0;
MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
ldbp = LIST_NEXT(ldbp, dblistlinks)) {
MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL;
cp = TAILQ_NEXT(cp, links)) {
if (cp == dbc || cp->dbtype != DB_HASH)
continue;
hcp = (HASH_CURSOR *)cp->internal;
if (hcp->pgno == old_pgno) {
if (old_index == NDX_INVALID) {
hcp->pgno = new_pgno;
} else if (hcp->indx == old_index) {
hcp->pgno = new_pgno;
hcp->indx = new_index;
} else
continue;
if (my_txn != NULL && cp->txn != my_txn)
found = 1;
}
}
MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
}
MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
if (found != 0 && DB_LOGGING(dbc)) {
if ((ret = __ham_chgpg_log(dbenv,
my_txn, &lsn, 0, dbp->log_fileid, DB_HAM_CHGPG,
old_pgno, new_pgno, old_index, new_index)) != 0)
return (ret);
}
return (0);
}