mariadb/storage/bdb/os/os_rw.c

/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1997-2005
 *	Sleepycat Software.  All rights reserved.
 *
 * $Id: os_rw.c,v 12.5 2005/08/10 15:47:26 bostic Exp $
 */

#include "db_config.h"

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <sys/stat.h>

#include <string.h>
#endif

#include "db_int.h"

#ifdef HAVE_FILESYSTEM_NOTZERO
static int __os_zerofill __P((DB_ENV *, DB_FH *));
#endif
static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));

/*
 * __os_io --
 *	Do an I/O.
 *
 * PUBLIC: int __os_io __P((DB_ENV *,
 * PUBLIC:     int, DB_FH *, db_pgno_t, u_int32_t, u_int8_t *, size_t *));
 */
int
__os_io(dbenv, op, fhp, pgno, pagesize, buf, niop)
	DB_ENV *dbenv;
	int op;
	DB_FH *fhp;
	db_pgno_t pgno;
	u_int32_t pagesize;
	u_int8_t *buf;
	size_t *niop;
{
#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
	ssize_t nio;
#endif
	int ret;

	/* Check for illegal usage. */
	DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
	switch (op) {
	case DB_IO_READ:
		if (DB_GLOBAL(j_read) != NULL)
			goto slow;
		nio = DB_GLOBAL(j_pread) != NULL ? DB_GLOBAL(j_pread)
			(fhp->fd, buf, pagesize, (off_t)pgno * pagesize) :
			pread(fhp->fd, buf, pagesize, (off_t)pgno * pagesize);
		break;
	case DB_IO_WRITE:
		if (DB_GLOBAL(j_write) != NULL)
			goto slow;
#ifdef HAVE_FILESYSTEM_NOTZERO
		if (__os_fs_notzero())
			goto slow;
#endif
		nio = DB_GLOBAL(j_pwrite) != NULL ? DB_GLOBAL(j_pwrite)
			(fhp->fd, buf, pagesize, (off_t)pgno * pagesize) :
			pwrite(fhp->fd, buf, pagesize, (off_t)pgno * pagesize);
		break;
	default:
		return (EINVAL);
	}
	if (nio == (ssize_t)pagesize) {
		*niop = pagesize;
		return (0);
	}
slow:
#endif
	MUTEX_LOCK(dbenv, fhp->mtx_fh);

	if ((ret = __os_seek(dbenv, fhp,
	    pagesize, pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
		goto err;
	switch (op) {
	case DB_IO_READ:
		ret = __os_read(dbenv, fhp, buf, pagesize, niop);
		break;
	case DB_IO_WRITE:
		ret = __os_write(dbenv, fhp, buf, pagesize, niop);
		break;
	default:
		ret = EINVAL;
		break;
	}

err:	MUTEX_UNLOCK(dbenv, fhp->mtx_fh);

	return (ret);

}

/*
 * __os_read --
 *	Read from a file handle.
 *
 * PUBLIC: int __os_read __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_read(dbenv, fhp, addr, len, nrp)
	DB_ENV *dbenv;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nrp;
{
	size_t offset;
	ssize_t nr;
	int ret;
	u_int8_t *taddr;

	ret = 0;

	/* Check for illegal usage. */
	DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

	if (DB_GLOBAL(j_read) != NULL) {
		*nrp = len;
		if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
			ret = __os_get_errno();
			__db_err(dbenv, "read: %#lx, %lu: %s",
			    P_TO_ULONG(addr), (u_long)len, strerror(ret));
		}
		return (ret);
	}

	for (taddr = addr, offset = 0;
	    offset < len; taddr += nr, offset += (u_int32_t)nr) {
		RETRY_CHK(((nr = read(
		    fhp->fd, taddr, len - offset)) < 0 ? 1 : 0), ret);
		if (nr == 0 || ret != 0)
			break;
	}
	*nrp = (size_t)(taddr - (u_int8_t *)addr);
	if (ret != 0)
		__db_err(dbenv, "read: %#lx, %lu: %s",
		    P_TO_ULONG(taddr), (u_long)len - offset, strerror(ret));
	return (ret);
}

/*
 * __os_write --
 *	Write to a file handle.
 *
 * PUBLIC: int __os_write __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_write(dbenv, fhp, addr, len, nwp)
	DB_ENV *dbenv;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nwp;
{
	/* Check for illegal usage. */
	DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

#ifdef HAVE_FILESYSTEM_NOTZERO
	/* Zero-fill as necessary. */
	if (__os_fs_notzero()) {
		int ret;
		if ((ret = __os_zerofill(dbenv, fhp)) != 0)
			return (ret);
	}
#endif
	return (__os_physwrite(dbenv, fhp, addr, len, nwp));
}

/*
 * __os_physwrite --
 *	Physical write to a file handle.
 */
static int
__os_physwrite(dbenv, fhp, addr, len, nwp)
	DB_ENV *dbenv;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nwp;
{
	size_t offset;
	ssize_t nw;
	int ret;
	u_int8_t *taddr;

	ret = 0;

#if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
	if (__os_fs_notzero()) {
		struct stat sb;
		off_t cur_off;

		DB_ASSERT(fstat(fhp->fd, &sb) != -1 &&
		    (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
		    cur_off <= sb.st_size);
	}
#endif

	/*
	 * Make a last "panic" check.  Imagine a thread of control running in
	 * Berkeley DB, going to sleep.  Another thread of control decides to
	 * run recovery because the environment is broken.  The first thing
	 * recovery does is panic the existing environment, but we only check
	 * the panic flag when crossing the public API.  If the sleeping thread
	 * wakes up and writes something, we could have two threads of control
	 * writing the log files at the same time.  So, before writing, make a
	 * last panic check.  Obviously, there's still a window, but it's very,
	 * very small.
	 */
	PANIC_CHECK(dbenv);

	if (DB_GLOBAL(j_write) != NULL) {
		*nwp = len;
		if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
			ret = __os_get_errno();
			__db_err(dbenv, "write: %#lx, %lu: %s",
			    P_TO_ULONG(addr), (u_long)len, strerror(ret));
		}
		return (ret);
	}

	for (taddr = addr, offset = 0;
	    offset < len; taddr += nw, offset += (u_int32_t)nw) {
		RETRY_CHK(((nw = write(
		    fhp->fd, taddr, len - offset)) < 0 ? 1 : 0), ret);
		if (ret != 0)
			break;
	}
	*nwp = len;
	if (ret != 0)
		__db_err(dbenv, "write: %#lx, %lu: %s",
		    P_TO_ULONG(taddr), (u_long)len - offset, strerror(ret));
	return (ret);
}

#ifdef HAVE_FILESYSTEM_NOTZERO
/*
 * __os_zerofill --
 *	Zero out bytes in the file.
 *
 *	Pages allocated by writing pages past end-of-file are not zeroed,
 *	on some systems.  Recovery could theoretically be fooled by a page
 *	showing up that contained garbage.  In order to avoid this, we
 *	have to write the pages out to disk, and flush them.  The reason
 *	for the flush is because if we don't sync, the allocation of another
 *	page subsequent to this one might reach the disk first, and if we
 *	crashed at the right moment, leave us with this page as the one
 *	allocated by writing a page past it in the file.
 */
static int
__os_zerofill(dbenv, fhp)
	DB_ENV *dbenv;
	DB_FH *fhp;
{
	off_t stat_offset, write_offset;
	size_t blen, nw;
	u_int32_t bytes, mbytes;
	int group_sync, need_free, ret;
	u_int8_t buf[8 * 1024], *bp;

	/* Calculate the byte offset of the next write. */
	write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset;

	/* Stat the file. */
	if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
		return (ret);
	stat_offset = (off_t)mbytes * MEGABYTE + bytes;

	/* Check if the file is large enough. */
	if (stat_offset >= write_offset)
		return (0);

	/* Get a large buffer if we're writing lots of data. */
#undef	ZF_LARGE_WRITE
#define	ZF_LARGE_WRITE	(64 * 1024)
	if (write_offset - stat_offset > ZF_LARGE_WRITE) {
		if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0)
			    return (ret);
		blen = ZF_LARGE_WRITE;
		need_free = 1;
	} else {
		bp = buf;
		blen = sizeof(buf);
		need_free = 0;
		memset(buf, 0, sizeof(buf));
	}

	/* Seek to the current end of the file. */
	if ((ret = __os_seek(
	    dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
		goto err;

	/*
	 * Hash is the only access method that allocates groups of pages.  Hash
	 * uses the existence of the last page in a group to signify the entire
	 * group is OK; so, write all the pages but the last one in the group,
	 * flush them to disk, then write the last one to disk and flush it.
	 */
	for (group_sync = 0; stat_offset < write_offset; group_sync = 1) {
		if (write_offset - stat_offset <= blen) {
			blen = (size_t)(write_offset - stat_offset);
			if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0)
				goto err;
		}
		if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0)
			goto err;
		stat_offset += blen;
	}
	if ((ret = __os_fsync(dbenv, fhp)) != 0)
		goto err;

	/* Seek back to where we started. */
	mbytes = (u_int32_t)(write_offset / MEGABYTE);
	bytes = (u_int32_t)(write_offset % MEGABYTE);
	ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);

err:	if (need_free)
		__os_free(dbenv, bp);
	return (ret);
}
#endif