mariadb/storage/bdb/os/os_rw.c
2005-12-05 10:27:46 -08:00

331 lines
8 KiB
C

/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1997-2005
* Sleepycat Software. All rights reserved.
*
* $Id: os_rw.c,v 12.5 2005/08/10 15:47:26 bostic Exp $
*/
#include "db_config.h"
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#endif
#include "db_int.h"
#ifdef HAVE_FILESYSTEM_NOTZERO
static int __os_zerofill __P((DB_ENV *, DB_FH *));
#endif
static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
/*
 * __os_io --
 *	Transfer one page (pagesize bytes) between buf and page number pgno
 *	of the file behind fhp.  op selects DB_IO_READ or DB_IO_WRITE; any
 *	other value yields EINVAL.
 *
 *	Where pread/pwrite are available the transfer is first attempted as
 *	a single positional call at byte offset pgno * pagesize.  That
 *	attempt is skipped when a j_read/j_write replacement is installed
 *	or, for writes, when __os_fs_notzero() is true.  If the positional
 *	call does not move exactly pagesize bytes, the request is redone on
 *	the seek-then-read/write path below, under the handle's mutex.
 *
 *	On success returns 0 with the byte count stored through niop.
 *
 * PUBLIC: int __os_io __P((DB_ENV *,
 * PUBLIC:     int, DB_FH *, db_pgno_t, u_int32_t, u_int8_t *, size_t *));
 */
int
__os_io(dbenv, op, fhp, pgno, pagesize, buf, niop)
	DB_ENV *dbenv;
	int op;
	DB_FH *fhp;
	db_pgno_t pgno;
	u_int32_t pagesize;
	u_int8_t *buf;
	size_t *niop;
{
#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
	off_t where;
	ssize_t moved;
#endif
	int ret;

	/* The handle must be open and backed by a real descriptor. */
	DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
	/* Byte position of the requested page. */
	where = (off_t)pgno * pagesize;

	if (op == DB_IO_READ) {
		/* A replacement read() forces the seek-based path. */
		if (DB_GLOBAL(j_read) != NULL)
			goto slow;
		if (DB_GLOBAL(j_pread) != NULL)
			moved = DB_GLOBAL(j_pread)(
			    fhp->fd, buf, pagesize, where);
		else
			moved = pread(fhp->fd, buf, pagesize, where);
	} else if (op == DB_IO_WRITE) {
		/* A replacement write() forces the seek-based path. */
		if (DB_GLOBAL(j_write) != NULL)
			goto slow;
#ifdef HAVE_FILESYSTEM_NOTZERO
		/* __os_write does the zero-filling; pwrite would bypass it. */
		if (__os_fs_notzero())
			goto slow;
#endif
		if (DB_GLOBAL(j_pwrite) != NULL)
			moved = DB_GLOBAL(j_pwrite)(
			    fhp->fd, buf, pagesize, where);
		else
			moved = pwrite(fhp->fd, buf, pagesize, where);
	} else
		return (EINVAL);

	/* A full-page transfer is complete; anything else is redone below. */
	if (moved == (ssize_t)pagesize) {
		*niop = pagesize;
		return (0);
	}
slow:
#endif
	/* The seek and the transfer must not be split by another thread. */
	MUTEX_LOCK(dbenv, fhp->mtx_fh);

	ret = __os_seek(dbenv, fhp, pagesize, pgno, 0, 0, DB_OS_SEEK_SET);
	if (ret == 0) {
		if (op == DB_IO_READ)
			ret = __os_read(dbenv, fhp, buf, pagesize, niop);
		else if (op == DB_IO_WRITE)
			ret = __os_write(dbenv, fhp, buf, pagesize, niop);
		else
			ret = EINVAL;
	}

	MUTEX_UNLOCK(dbenv, fhp->mtx_fh);
	return (ret);
}
/*
 * __os_read --
 *	Read up to len bytes from the file handle's current position into
 *	addr.
 *
 *	If a j_read replacement is installed it is called exactly once;
 *	*nrp is set to len, and a return other than len is reported as the
 *	current errno value.
 *
 *	Otherwise read() is called repeatedly until len bytes have arrived,
 *	read() returns 0, or an error survives RETRY_CHK.  *nrp is then the
 *	number of bytes actually stored in addr, which may be less than len.
 *
 *	Returns 0 on success, or the error value after logging it through
 *	__db_err.
 *
 * PUBLIC: int __os_read __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_read(dbenv, fhp, addr, len, nrp)
	DB_ENV *dbenv;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nrp;
{
	size_t offset;
	ssize_t nr;
	int ret;
	u_int8_t *taddr;

	ret = 0;

	/* Check for illegal usage. */
	DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

	if (DB_GLOBAL(j_read) != NULL) {
		*nrp = len;
		if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
			ret = __os_get_errno();
			__db_err(dbenv, "read: %#lx, %lu: %s",
			    P_TO_ULONG(addr), (u_long)len, strerror(ret));
		}
		return (ret);
	}

	/*
	 * Advance the byte count by the full value read() returned: the
	 * pointer moves by nr, so the counter must not be narrowed to 32
	 * bits or the two would disagree after a very large transfer.
	 */
	for (taddr = addr, offset = 0;
	    offset < len; taddr += nr, offset += (size_t)nr) {
		RETRY_CHK(((nr = read(
		    fhp->fd, taddr, len - offset)) < 0 ? 1 : 0), ret);
		/* Stop when read() returns 0 or on an unrecoverable error. */
		if (nr == 0 || ret != 0)
			break;
	}
	*nrp = (size_t)(taddr - (u_int8_t *)addr);

	/*
	 * Cast the whole difference, not just len: "%lu" requires an
	 * unsigned long argument, and (u_long)len - offset has type size_t
	 * wherever size_t is the wider of the two.
	 */
	if (ret != 0)
		__db_err(dbenv, "read: %#lx, %lu: %s",
		    P_TO_ULONG(taddr), (u_long)(len - offset), strerror(ret));
	return (ret);
}
/*
 * __os_write --
 *	Write len bytes from addr to the file handle at its current
 *	position, storing the count through nwp.
 *
 *	When built with HAVE_FILESYSTEM_NOTZERO and __os_fs_notzero() is
 *	true, __os_zerofill is run first and its failure is returned
 *	without attempting the write.  The data itself is always written
 *	by __os_physwrite, whose return value is passed back unchanged.
 *
 * PUBLIC: int __os_write __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_write(dbenv, fhp, addr, len, nwp)
	DB_ENV *dbenv;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nwp;
{
#ifdef HAVE_FILESYSTEM_NOTZERO
	int ret;
#endif

	/* The handle must be open and backed by a real descriptor. */
	DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

#ifdef HAVE_FILESYSTEM_NOTZERO
	/* Fill any gap before the write position before writing. */
	if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0)
		return (ret);
#endif

	return (__os_physwrite(dbenv, fhp, addr, len, nwp));
}
/*
 * __os_physwrite --
 *	Physical write to a file handle: push len bytes from addr to the
 *	descriptor at its current position, with no zero-fill handling.
 *
 *	If a j_write replacement is installed it is called exactly once and
 *	a return other than len is reported as the current errno value.
 *	Otherwise write() is repeated until len bytes have been accepted or
 *	an error survives RETRY_CHK.
 *
 *	*nwp is always set to len, including on failure, so callers must
 *	check the return value before relying on it.
 *
 *	Returns 0 on success, or the error value after logging it through
 *	__db_err.  PANIC_CHECK may also return early.
 */
static int
__os_physwrite(dbenv, fhp, addr, len, nwp)
	DB_ENV *dbenv;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nwp;
{
	size_t offset;
	ssize_t nw;
	int ret;
	u_int8_t *taddr;

	ret = 0;

#if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
	/*
	 * Diagnostic builds only: verify the current file position is not
	 * beyond the end of the file, i.e. this write leaves no hole.
	 */
	if (__os_fs_notzero()) {
		struct stat sb;
		off_t cur_off;

		DB_ASSERT(fstat(fhp->fd, &sb) != -1 &&
		    (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
		    cur_off <= sb.st_size);
	}
#endif

	/*
	 * Make a last "panic" check.  Imagine a thread of control running in
	 * Berkeley DB, going to sleep.  Another thread of control decides to
	 * run recovery because the environment is broken.  The first thing
	 * recovery does is panic the existing environment, but we only check
	 * the panic flag when crossing the public API.  If the sleeping thread
	 * wakes up and writes something, we could have two threads of control
	 * writing the log files at the same time.  So, before writing, make a
	 * last panic check.  Obviously, there's still a window, but it's very,
	 * very small.
	 */
	PANIC_CHECK(dbenv);

	if (DB_GLOBAL(j_write) != NULL) {
		*nwp = len;
		if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
			ret = __os_get_errno();
			__db_err(dbenv, "write: %#lx, %lu: %s",
			    P_TO_ULONG(addr), (u_long)len, strerror(ret));
		}
		return (ret);
	}

	/*
	 * Advance the byte count by the full value write() returned: the
	 * pointer moves by nw, so the counter must not be narrowed to 32
	 * bits or the two would disagree after a very large transfer.
	 *
	 * NOTE(review): a write() that returns 0 without an error makes no
	 * progress and nothing here leaves the loop in that case.
	 */
	for (taddr = addr, offset = 0;
	    offset < len; taddr += nw, offset += (size_t)nw) {
		RETRY_CHK(((nw = write(
		    fhp->fd, taddr, len - offset)) < 0 ? 1 : 0), ret);
		if (ret != 0)
			break;
	}
	*nwp = len;

	/*
	 * Cast the whole difference, not just len: "%lu" requires an
	 * unsigned long argument, and (u_long)len - offset has type size_t
	 * wherever size_t is the wider of the two.
	 */
	if (ret != 0)
		__db_err(dbenv, "write: %#lx, %lu: %s",
		    P_TO_ULONG(taddr), (u_long)(len - offset), strerror(ret));
	return (ret);
}
#ifdef HAVE_FILESYSTEM_NOTZERO
/*
 * __os_zerofill --
 *	Extend the file with zero bytes up to the handle's next write
 *	position, then leave the handle positioned there.
 *
 *	On some systems, pages allocated by writing past end-of-file are not
 *	zeroed, and recovery could theoretically be fooled by a page that
 *	shows up containing garbage.  To avoid that, the gap is written out
 *	explicitly and flushed.  The flush matters: without it, a later page
 *	allocation could reach the disk before this one, and a crash at the
 *	wrong moment would leave this page as one that was "allocated" only
 *	by a write beyond it.
 *
 *	Returns 0 when the file is already long enough or the fill succeeds,
 *	otherwise the first error from the stat, allocation, seek, write or
 *	sync calls.
 */
static int
__os_zerofill(dbenv, fhp)
	DB_ENV *dbenv;
	DB_FH *fhp;
{
	off_t file_end, target;
	size_t chunk, written;
	u_int32_t bytes, mbytes;
	int heap_buf, ret, wrote_any;
	u_int8_t stack_zeros[8 * 1024], *zeros;

	/* Byte offset at which the caller is about to write. */
	target = (off_t)fhp->pgno * fhp->pgsize + fhp->offset;

	/* Current file size, reported as megabytes plus remaining bytes. */
	ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL);
	if (ret != 0)
		return (ret);
	file_end = (off_t)mbytes * MEGABYTE + bytes;

	/* Nothing to fill if the file already reaches the write offset. */
	if (file_end >= target)
		return (0);

	/*
	 * Pick the source of zeros: a zeroed heap buffer when the gap is
	 * large, the on-stack buffer otherwise.
	 */
#undef	ZF_LARGE_WRITE
#define	ZF_LARGE_WRITE	(64 * 1024)
	if (target - file_end > ZF_LARGE_WRITE) {
		ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &zeros);
		if (ret != 0)
			return (ret);
		heap_buf = 1;
		chunk = ZF_LARGE_WRITE;
	} else {
		memset(stack_zeros, 0, sizeof(stack_zeros));
		heap_buf = 0;
		zeros = stack_zeros;
		chunk = sizeof(stack_zeros);
	}

	/* Position the handle at the current end of the file. */
	ret = __os_seek(
	    dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);
	if (ret != 0)
		goto err;

	/*
	 * Hash is the only access method that allocates groups of pages, and
	 * it treats the existence of the last page in a group as proof that
	 * the whole group is OK.  So: write everything except the final
	 * piece, flush that to disk, and only then write the final piece
	 * (which is flushed after the loop).
	 */
	wrote_any = 0;
	while (file_end < target) {
		if (target - file_end <= chunk) {
			/* Final piece: trim it to what is left. */
			chunk = (size_t)(target - file_end);
			/* Flush the earlier pieces before writing it. */
			if (wrote_any && (ret = __os_fsync(dbenv, fhp)) != 0)
				goto err;
		}
		ret = __os_physwrite(dbenv, fhp, zeros, chunk, &written);
		if (ret != 0)
			goto err;
		file_end += chunk;
		wrote_any = 1;
	}
	if ((ret = __os_fsync(dbenv, fhp)) != 0)
		goto err;

	/* Put the handle back at the offset the caller will write to. */
	mbytes = (u_int32_t)(target / MEGABYTE);
	bytes = (u_int32_t)(target % MEGABYTE);
	ret = __os_seek(
	    dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);

err:	if (heap_buf)
		__os_free(dbenv, zeros);
	return (ret);
}
#endif