srv0srv.h Support raw disk partitions as data files

srv0start.c	Support raw disk partitions as data files
srv0srv.c	Support raw disk partitions as data files
row0purge.c	< 4 GB rows, doublewrite, hang fixes
row0row.c	< 4 GB rows, doublewrite, hang fixes
row0sel.c	< 4 GB rows, doublewrite, hang fixes
row0uins.c	< 4 GB rows, doublewrite, hang fixes
row0umod.c	< 4 GB rows, doublewrite, hang fixes
row0undo.c	< 4 GB rows, doublewrite, hang fixes
row0upd.c	< 4 GB rows, doublewrite, hang fixes
srv0srv.c	< 4 GB rows, doublewrite, hang fixes
srv0start.c	< 4 GB rows, doublewrite, hang fixes
sync0rw.c	< 4 GB rows, doublewrite, hang fixes
sync0sync.c	< 4 GB rows, doublewrite, hang fixes
trx0purge.c	< 4 GB rows, doublewrite, hang fixes
trx0rec.c	< 4 GB rows, doublewrite, hang fixes
trx0sys.c	< 4 GB rows, doublewrite, hang fixes
btr0btr.c	< 4 GB rows, doublewrite, hang fixes
btr0cur.c	< 4 GB rows, doublewrite, hang fixes
buf0buf.c	< 4 GB rows, doublewrite, hang fixes
buf0flu.c	< 4 GB rows, doublewrite, hang fixes
buf0rea.c	< 4 GB rows, doublewrite, hang fixes
data0data.c	< 4 GB rows, doublewrite, hang fixes
fil0fil.c	< 4 GB rows, doublewrite, hang fixes
fsp0fsp.c	< 4 GB rows, doublewrite, hang fixes
ibuf0ibuf.c	< 4 GB rows, doublewrite, hang fixes
lock0lock.c	< 4 GB rows, doublewrite, hang fixes
log0log.c	< 4 GB rows, doublewrite, hang fixes
log0recv.c	< 4 GB rows, doublewrite, hang fixes
os0file.c	< 4 GB rows, doublewrite, hang fixes
page0cur.c	< 4 GB rows, doublewrite, hang fixes
pars0pars.c	< 4 GB rows, doublewrite, hang fixes
rem0cmp.c	< 4 GB rows, doublewrite, hang fixes
rem0rec.c	< 4 GB rows, doublewrite, hang fixes
row0ins.c	< 4 GB rows, doublewrite, hang fixes
row0mysql.c	< 4 GB rows, doublewrite, hang fixes
univ.i  	< 4 GB rows, doublewrite, hang fixes
data0data.ic	< 4 GB rows, doublewrite, hang fixes
mach0data.ic	< 4 GB rows, doublewrite, hang fixes
rem0rec.ic	< 4 GB rows, doublewrite, hang fixes
row0upd.ic	< 4 GB rows, doublewrite, hang fixes
trx0rec.ic	< 4 GB rows, doublewrite, hang fixes
rem0cmp.h	< 4 GB rows, doublewrite, hang fixes
rem0rec.h	< 4 GB rows, doublewrite, hang fixes
row0ins.h	< 4 GB rows, doublewrite, hang fixes
row0mysql.h	< 4 GB rows, doublewrite, hang fixes
row0row.h	< 4 GB rows, doublewrite, hang fixes
row0upd.h	< 4 GB rows, doublewrite, hang fixes
srv0srv.h	< 4 GB rows, doublewrite, hang fixes
sync0sync.h	< 4 GB rows, doublewrite, hang fixes
trx0rec.h	< 4 GB rows, doublewrite, hang fixes
trx0sys.h	< 4 GB rows, doublewrite, hang fixes
trx0types.h	< 4 GB rows, doublewrite, hang fixes
trx0undo.h	< 4 GB rows, doublewrite, hang fixes
ut0dbg.h	< 4 GB rows, doublewrite, hang fixes
ut0ut.h 	< 4 GB rows, doublewrite, hang fixes
btr0btr.h	< 4 GB rows, doublewrite, hang fixes
btr0cur.h	< 4 GB rows, doublewrite, hang fixes
buf0buf.h	< 4 GB rows, doublewrite, hang fixes
buf0flu.h	< 4 GB rows, doublewrite, hang fixes
data0data.h	< 4 GB rows, doublewrite, hang fixes
dict0mem.h	< 4 GB rows, doublewrite, hang fixes
fil0fil.h	< 4 GB rows, doublewrite, hang fixes
fsp0fsp.h	< 4 GB rows, doublewrite, hang fixes
os0file.h	< 4 GB rows, doublewrite, hang fixes
This commit is contained in:
heikki@donna.mysql.fi 2001-08-04 19:36:14 +03:00
commit 94db78ce61
62 changed files with 3146 additions and 517 deletions

View file

@ -216,14 +216,44 @@ buf_calc_page_checksum(
/* out: checksum */
byte* page) /* in: buffer page */
{
ulint checksum;
ulint checksum;
checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+ ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA
- FIL_PAGE_END_LSN);
checksum = checksum & 0xFFFFFFFF;
checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+ ut_fold_binary(page + FIL_PAGE_DATA,
UNIV_PAGE_SIZE - FIL_PAGE_DATA
- FIL_PAGE_END_LSN);
checksum = checksum & 0xFFFFFFFF;
return(checksum);
return(checksum);
}
/************************************************************************
Checks if a page is corrupt. */
ibool
buf_page_is_corrupted(
/*==================*/
/* out: TRUE if corrupted */
byte* read_buf) /* in: a database page */
{
ulint checksum;
checksum = buf_calc_page_checksum(read_buf);
if ((mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
!= mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN + 4))
|| (checksum != mach_read_from_4(read_buf
+ UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN)
&& mach_read_from_4(read_buf + FIL_PAGE_LSN)
!= mach_read_from_4(read_buf
+ UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN))) {
return(TRUE);
}
return(FALSE);
}
/************************************************************************
@ -1265,34 +1295,22 @@ buf_page_io_complete(
dulint id;
dict_index_t* index;
ulint io_type;
ulint checksum;
ut_ad(block);
io_type = block->io_fix;
if (io_type == BUF_IO_READ) {
checksum = buf_calc_page_checksum(block->frame);
/* From version 3.23.38 up we store the page checksum
to the 4 upper bytes of the page end lsn field */
if ((mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
!= mach_read_from_4(block->frame + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN + 4))
|| (checksum != mach_read_from_4(block->frame
+ UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN)
&& mach_read_from_4(block->frame + FIL_PAGE_LSN)
!= mach_read_from_4(block->frame
+ UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN))) {
fprintf(stderr,
if (buf_page_is_corrupted(block->frame)) {
fprintf(stderr,
"InnoDB: Database page corruption or a failed\n"
"InnoDB: file read of page %lu.\n", block->offset);
fprintf(stderr,
fprintf(stderr,
"InnoDB: You may have to recover from a backup.\n");
exit(1);
exit(1);
}
if (recv_recovery_is_on()) {
@ -1601,11 +1619,28 @@ void
buf_print_io(void)
/*==============*/
{
ulint size;
ut_ad(buf_pool);
mutex_enter(&(buf_pool->mutex));
size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;
printf("pages read %lu, created %lu, written %lu\n",
mutex_enter(&(buf_pool->mutex));
printf("LRU list length %lu \n", UT_LIST_GET_LEN(buf_pool->LRU));
printf("Free list length %lu \n", UT_LIST_GET_LEN(buf_pool->free));
printf("Flush list length %lu \n",
UT_LIST_GET_LEN(buf_pool->flush_list));
printf("Buffer pool size in pages %lu\n", size);
printf("Pending reads %lu \n", buf_pool->n_pend_reads);
printf("Pending writes: LRU %lu, flush list %lu, single page %lu\n",
buf_pool->n_flush[BUF_FLUSH_LRU],
buf_pool->n_flush[BUF_FLUSH_LIST],
buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
printf("Pages read %lu, created %lu, written %lu\n",
buf_pool->n_pages_read, buf_pool->n_pages_created,
buf_pool->n_pages_written);
mutex_exit(&(buf_pool->mutex));

View file

@ -1,7 +1,7 @@
/******************************************************
The database buffer buf_pool flush algorithm
(c) 1995 Innobase Oy
(c) 1995-2001 Innobase Oy
Created 11/11/1995 Heikki Tuuri
*******************************************************/
@ -15,7 +15,6 @@ Created 11/11/1995 Heikki Tuuri
#include "ut0byte.h"
#include "ut0lst.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
@ -195,9 +194,145 @@ buf_flush_write_complete(
}
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio we must
call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
buf_block_t* block;
ulint len;
ulint i;
if (trx_doublewrite == NULL) {
os_aio_simulated_wake_handler_threads();
return;
}
mutex_enter(&(trx_doublewrite->mutex));
/* Write first to doublewrite buffer blocks. We use synchronous
aio and thus know that file write has been completed when the
control returns. */
if (trx_doublewrite->first_free == 0) {
mutex_exit(&(trx_doublewrite->mutex));
return;
}
if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
} else {
len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
}
fil_io(OS_FILE_WRITE,
TRUE, TRX_SYS_SPACE,
trx_doublewrite->block1, 0, len,
(void*)trx_doublewrite->write_buf, NULL);
if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
len = (trx_doublewrite->first_free
- TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;
fil_io(OS_FILE_WRITE,
TRUE, TRX_SYS_SPACE,
trx_doublewrite->block2, 0, len,
(void*)(trx_doublewrite->write_buf
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE),
NULL);
}
/* Now flush the doublewrite buffer data to disk */
fil_flush(TRX_SYS_SPACE);
/* We know that the writes have been flushed to disk now
and in recovery we will find them in the doublewrite buffer
blocks. Next do the writes to the intended positions. */
for (i = 0; i < trx_doublewrite->first_free; i++) {
block = trx_doublewrite->buf_block_arr[i];
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
}
/* Wake possible simulated aio thread to actually post the
writes to the operating system */
os_aio_simulated_wake_handler_threads();
/* Wait that all async writes to tablespaces have been posted to
the OS */
os_aio_wait_until_no_pending_writes();
/* Now we flush the data to disk (for example, with fsync) */
fil_flush_file_spaces(FIL_TABLESPACE);
/* We can now reuse the doublewrite memory buffer: */
trx_doublewrite->first_free = 0;
mutex_exit(&(trx_doublewrite->mutex));
}
/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
buf_block_t* block) /* in: buffer block to write */
{
try_again:
mutex_enter(&(trx_doublewrite->mutex));
if (trx_doublewrite->first_free
>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
mutex_exit(&(trx_doublewrite->mutex));
buf_flush_buffered_writes();
goto try_again;
}
ut_memcpy(trx_doublewrite->write_buf
+ UNIV_PAGE_SIZE * trx_doublewrite->first_free,
block->frame, UNIV_PAGE_SIZE);
trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;
trx_doublewrite->first_free++;
if (trx_doublewrite->first_free
>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
mutex_exit(&(trx_doublewrite->mutex));
buf_flush_buffered_writes();
return;
}
mutex_exit(&(trx_doublewrite->mutex));
}
/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
@ -222,15 +357,24 @@ buf_flush_write_block_low(
mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
block->newest_modification);
/* Write to the page the space id and page number */
mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space);
mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset);
/* We overwrite the first 4 bytes of the end lsn field to store
a page checksum */
mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
buf_calc_page_checksum(block->frame));
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
if (!trx_doublewrite) {
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
} else {
buf_flush_post_to_doublewrite_buf(block);
}
}
/************************************************************************
@ -251,14 +395,14 @@ buf_flush_try_page(
buf_block_t* block;
ibool locked;
ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)
|| (flush_type == BUF_FLUSH_SINGLE_PAGE));
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
|| flush_type == BUF_FLUSH_SINGLE_PAGE);
mutex_enter(&(buf_pool->mutex));
block = buf_page_hash_get(space, offset);
if ((flush_type == BUF_FLUSH_LIST)
if (flush_type == BUF_FLUSH_LIST
&& block && buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@ -286,7 +430,7 @@ buf_flush_try_page(
mutex_exit(&(buf_pool->mutex));
if (!locked) {
os_aio_simulated_wake_handler_threads();
buf_flush_buffered_writes();
rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
}
@ -300,7 +444,7 @@ buf_flush_try_page(
return(1);
} else if ((flush_type == BUF_FLUSH_LRU) && block
} else if (flush_type == BUF_FLUSH_LRU && block
&& buf_flush_ready_for_flush(block, flush_type)) {
/* VERY IMPORTANT:
@ -328,7 +472,7 @@ buf_flush_try_page(
return(1);
} else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block
} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
&& buf_flush_ready_for_flush(block, flush_type)) {
block->io_fix = BUF_IO_WRITE;
@ -385,6 +529,14 @@ buf_flush_try_neighbors(
/* If there is little space, it is better not to flush any
block except from the end of the LRU list */
low = offset;
high = offset + 1;
} else if (flush_type == BUF_FLUSH_LIST) {
/* Since semaphore waits require us to flush the
doublewrite buffer to disk, it is best that the
search area is just the page itself, to minimize
chances for semaphore waits */
low = offset;
high = offset + 1;
}
@ -418,13 +570,6 @@ buf_flush_try_neighbors(
mutex_exit(&(buf_pool->mutex));
/* In simulated aio we wake up the i/o-handler threads now that
we have posted a batch of writes: */
/* printf("Flush count %lu ; Waking i/o handlers\n", count); */
os_aio_simulated_wake_handler_threads();
return(count);
}
@ -565,13 +710,15 @@ buf_flush_batch(
mutex_exit(&(buf_pool->mutex));
if (buf_debug_prints && (page_count > 0)) {
buf_flush_buffered_writes();
if (buf_debug_prints && page_count > 0) {
if (flush_type == BUF_FLUSH_LRU) {
printf("To flush %lu pages in LRU flush\n",
printf("Flushed %lu pages in LRU flush\n",
page_count);
} else if (flush_type == BUF_FLUSH_LIST) {
printf("To flush %lu pages in flush list flush\n",
page_count, flush_type);
printf("Flushed %lu pages in flush list flush\n",
page_count);
} else {
ut_error;
}

View file

@ -49,7 +49,9 @@ ulint
buf_read_page_low(
/*==============*/
/* out: 1 if a read request was queued, 0 if the page
already resided in buf_pool */
already resided in buf_pool or if the page is in
the doublewrite buffer blocks in which case it is never
read into the pool */
ibool sync, /* in: TRUE if synchronous aio is desired */
ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
@ -63,6 +65,16 @@ buf_read_page_low(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
if (trx_doublewrite && space == TRX_SYS_SPACE
&& ( (offset >= trx_doublewrite->block1
&& offset < trx_doublewrite->block1
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
|| (offset >= trx_doublewrite->block2
&& offset < trx_doublewrite->block2
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
return(0);
}
#ifdef UNIV_LOG_DEBUG
if (space % 2 == 1) {
/* We are updating a replicate space while holding the