mariadb/storage/maria/ma_control_file.c
unknown e27890cab0 WL#3072 Maria recovery
* create page cache before initializing engine and not after, because
Maria's recovery needs a page cache
* make the creation of a bitmap page more crash-resistant
* bugfix (see ma_blockrec.c)
* back to old way: create an 8k bitmap page when creating table
* preparations for the UNDO phase: recreate TRNs
* preparations for Checkpoint: list of dirty pages, testing
of rec_lsn to know if page should be skipped during Recovery
(unused in this patch as no Checkpoint module pushed yet)
* maria_chk tags repaired table with a special LSN
* reworking all around in ma_recovery.c (less duplication)


mysys/my_realloc.c:
  noted an issue in my_realloc()
sql/mysqld.cc:
  page cache needs to be created before engines are initialized,
  because Maria's initialization may do a recovery which needs
  the page cache.
storage/maria/ha_maria.cc:
  update to new prototype
storage/maria/ma_bitmap.c:
  when creating the first bitmap page we used chsize to 8192 bytes then 
  pwrite (overwrite) the last 2 bytes (8191-8192). If crash between
  the two operations, this leaves a bitmap page full without its end
  marker. A later recovery may try to read this page and find it
  exists and misses a marker and conclude it's corrupted and fail.
  Changing the chsize to only 8190 bytes: recovery will then find
  the page is too short and recreate it entirely.
storage/maria/ma_blockrec.c:
  Fix for a bug: when executing a REDO, if the data page is created,
  data_file_length was increased before _ma_bitmap_set():
  _ma_bitmap_set() called _ma_read_bitmap_page() which, due to the
  increased data_file_length, expected to find a bitmap page on disk
  with a correct end marker; if the bitmap page didn't exist already
  in fact, this failed. Fixed by increasing data_file_length only after
  _ma_read_bitmap_page() has created the new bitmap page correctly.
  This bug could happen every time a REDO is about creating a new
  bitmap page.
storage/maria/ma_check.c:
  empty data file has a bitmap page
storage/maria/ma_control_file.c:
  useless parameter to ma_control_file_create_or_open(), just
  test if this is recovery.
storage/maria/ma_control_file.h:
  new prototype
storage/maria/ma_create.c:
  Back to how it was before: maria_create() creates an 8k bitmap page.
  Thus (bugfix) data_file_length needs to reflect this instead of being 0.
storage/maria/ma_loghandler.c:
  as ma_test1 and ma_test2 now use real transactions and not
  dummy_transaction_object, REDO for INSERT/UPDATE/DELETE are always
  about real transactions, can assert this.
  A function for Recovery to assign a short id to a table.
storage/maria/ma_loghandler.h:
  new function
storage/maria/ma_loghandler_lsn.h:
  maria_chk tags repaired tables with this LSN
storage/maria/ma_open.c:
  * enforce that DMLs on transactional tables use real transactions
  and not dummy_transaction_object.
  * test if table was repaired with maria_chk (which has to be
  seen as an import of an external table into the server), test
  validity of create_rename_lsn (header corruption detection)
  * comments.
storage/maria/ma_recovery.c:
  * preparations for the UNDO phase: recreate TRNs
  * preparations for Checkpoint: list of dirty pages, testing
  of rec_lsn to know if page should be skipped during Recovery
  (unused in this patch as no Checkpoint module pushed yet)
  * reworking all around (less duplication)
storage/maria/ma_recovery.h:
  a parameter to say if the UNDO phase should be skipped
storage/maria/maria_chk.c:
  tag repaired tables with a special LSN
storage/maria/maria_read_log.c:
  * update to new prototype
  * no UNDO phase in maria_read_log for now
storage/maria/trnman.c:
  * a function for Recovery to create a transaction (TRN), needed
  in the UNDO phase
  * a function for Recovery to grab an existing transaction, needed
  in the UNDO phase (rollback all existing transactions)
storage/maria/trnman_public.h:
  new functions
2007-08-29 16:43:01 +02:00

325 lines
11 KiB
C

/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
/*
WL#3234 Maria control file
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
#include "maria_def.h"
/* Here is the implementation of this module */
/*
  A control file contains 3 objects: magic string, LSN of last checkpoint,
  number of last log. A checksum covering the last two objects is stored
  right after the magic string.
*/
/*
  Layout of the file, as offsets/sizes in bytes. Each object starts where the
  previous one ends:
    magic string | checksum | LSN of last checkpoint | number of last log
  Total size should be < sector size for atomic write operation.
*/
/* Magic string identifying a control file (the trailing NUL is not stored) */
#define CONTROL_FILE_MAGIC_STRING "\xfe\xfe\xc\1MACF"
#define CONTROL_FILE_MAGIC_STRING_OFFSET 0
#define CONTROL_FILE_MAGIC_STRING_SIZE (sizeof(CONTROL_FILE_MAGIC_STRING)-1)
/* Checksum of all bytes from CONTROL_FILE_LSN_OFFSET to the end of the file */
#define CONTROL_FILE_CHECKSUM_OFFSET (CONTROL_FILE_MAGIC_STRING_OFFSET + CONTROL_FILE_MAGIC_STRING_SIZE)
#define CONTROL_FILE_CHECKSUM_SIZE 4
/*
  LSN of the last checkpoint; its size is asserted to be 3+4 bytes in
  ma_control_file_create_or_open()
*/
#define CONTROL_FILE_LSN_OFFSET (CONTROL_FILE_CHECKSUM_OFFSET + CONTROL_FILE_CHECKSUM_SIZE)
#define CONTROL_FILE_LSN_SIZE LSN_STORE_SIZE
/* Number of the last log file */
#define CONTROL_FILE_FILENO_OFFSET (CONTROL_FILE_LSN_OFFSET + CONTROL_FILE_LSN_SIZE)
#define CONTROL_FILE_FILENO_SIZE 4
/* Size of the whole control file */
#define CONTROL_FILE_SIZE (CONTROL_FILE_FILENO_OFFSET + CONTROL_FILE_FILENO_SIZE)
/*
  This module owns these two vars: they mirror what was last read from, or
  successfully written to, the control file.
*/
/**
This LSN serves for the two-checkpoint rule, and also to find the
checkpoint record when doing a recovery.
*/
LSN last_checkpoint_lsn= LSN_IMPOSSIBLE;
uint32 last_logno= FILENO_IMPOSSIBLE;
/**
@brief If log's lock should be asserted when writing to control file.
Can be re-used by any function which needs to be thread-safe except when
it is called at startup.
*/
my_bool maria_multi_threaded= FALSE;
/** @brief if currently doing a recovery */
my_bool maria_in_recovery= FALSE;
/*
Control file is less than 512 bytes (a disk sector),
to be as atomic as possible
*/
/* Descriptor of the open control file; negative when the file is closed */
static int control_file_fd= -1;
/**
  @brief Initialize control file subsystem

  Looks for the control file. If there is none and we are not in recovery,
  creates it and initializes it with "undefined" values (LSN_IMPOSSIBLE,
  FILENO_IMPOSSIBLE). If it is present, checks its size, magic string and
  checksum, then reads it to find out last checkpoint's LSN and last log,
  and updates the last_checkpoint_lsn and last_logno global variables.
  Does nothing if the control file is already open.

  Called at engine's start.

  @note
  The format of the control file is:
  CONTROL_FILE_MAGIC_STRING_SIZE bytes: magic string
  4 bytes: checksum of the following bytes
  CONTROL_FILE_LSN_SIZE (3+4) bytes: LSN of last checkpoint (number of the
                                     log and offset in this log)
  4 bytes: number of last log

  @note If in recovery, file is not created

  @return Operation status
    @retval 0      OK (file is open)
    @retval other  Error, one of the CONTROL_FILE_* error codes (in which
                   case the file is left closed)
*/
CONTROL_FILE_ERROR ma_control_file_create_or_open()
{
  char buffer[CONTROL_FILE_SIZE];
  char name[FN_REFLEN];
  MY_STAT stat_buff;
  my_bool create_file;
  int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR;
  /* Code returned through the "err" label unless a more precise one is set */
  int error= CONTROL_FILE_UNKNOWN_ERROR;
  DBUG_ENTER("ma_control_file_create_or_open");

  /*
    If you change sizes in the #defines, you at least have to change the
    "*store" and "*korr" calls in this file, and can even create backward
    compatibility problems. Beware!
  */
  DBUG_ASSERT(CONTROL_FILE_LSN_SIZE == (3+4));
  DBUG_ASSERT(CONTROL_FILE_FILENO_SIZE == 4);

  if (control_file_fd >= 0) /* already open */
    DBUG_RETURN(0);

  if (fn_format(name, CONTROL_FILE_BASE_NAME,
                maria_data_root, "", MYF(MY_WME)) == NullS)
    DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);

  /* my_access() returns non-zero when the file cannot be found */
  create_file= test(my_access(name,F_OK));

  if (create_file)
  {
    /* in a recovery, we expect to find a control file */
    if (maria_in_recovery)
      DBUG_RETURN(CONTROL_FILE_MISSING);
    if ((control_file_fd= my_create(name, 0,
                                    open_flags, MYF(MY_SYNC_DIR))) < 0)
      DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);

    /*
      To be safer we should make sure that there are no logs or data/index
      files around (indeed it could be that the control file alone was deleted
      or not restored, and we should not go on with life at this point).

      TODO: For now we trust (this is alpha version), but for beta it would
      be great to verify.

      We could have a tool which can rebuild the control file, by reading the
      directory of logs, finding the newest log, reading it to find last
      checkpoint... Slow but can save your db. For this to be possible, we
      must always write to the control file right after writing the checkpoint
      log record, and do nothing in between (i.e. the checkpoint must be
      usable as soon as it has been written to the log).
    */

    /*
      Init the file with these "undefined" values.
      ma_control_file_write_and_force() returns a plain 0/1 status, not a
      CONTROL_FILE_ERROR, so a failure is reported as
      CONTROL_FILE_UNKNOWN_ERROR; going through "err" also closes the
      descriptor, as promised to the caller. The file itself is not removed.
    */
    if (ma_control_file_write_and_force(LSN_IMPOSSIBLE,
                                        FILENO_IMPOSSIBLE,
                                        CONTROL_FILE_UPDATE_ALL))
      goto err;
    DBUG_RETURN(0);
  }

  /* Otherwise, file exists */

  if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0)
    goto err;

  if (my_stat(name, &stat_buff, MYF(MY_WME)) == NULL)
    goto err;

  if ((uint)stat_buff.st_size < CONTROL_FILE_SIZE)
  {
    /*
      Given that normally we write only a sector and it's atomic, the only
      possibility for a file to be of too short size is if we crashed at the
      very first startup, between file creation and file write. Quite unlikely
      (and can be made even more unlikely by doing this: create a temp file,
      write it, and then rename it to be the control file).
      What's more likely is if someone forgot to restore the control file,
      just did a "touch control" to try to get Maria to start, or if the
      disk/filesystem has a problem.
      So let's be rigid.
    */
    /*
      TODO: store a message "too small file" somewhere, so that it goes to
      MySQL's error log at startup.
    */
    error= CONTROL_FILE_TOO_SMALL;
    goto err;
  }

  if ((uint)stat_buff.st_size > CONTROL_FILE_SIZE)
  {
    /* TODO: store "too big file" message */
    error= CONTROL_FILE_TOO_BIG;
    goto err;
  }

  /* With MY_FNABP, my_read() returns non-zero unless all bytes were read */
  if (my_read(control_file_fd, buffer, CONTROL_FILE_SIZE,
              MYF(MY_FNABP | MY_WME)))
    goto err;

  if (memcmp(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET,
             CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE))
  {
    /* TODO: store message "bad magic string" somewhere */
    error= CONTROL_FILE_BAD_MAGIC_STRING;
    goto err;
  }

  /* The stored checksum covers everything after the checksum field */
  if (my_checksum(0, buffer + CONTROL_FILE_LSN_OFFSET,
                  CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET) !=
      uint4korr(buffer + CONTROL_FILE_CHECKSUM_OFFSET))
  {
    /* TODO: store message "checksum mismatch" somewhere */
    error= CONTROL_FILE_BAD_CHECKSUM;
    goto err;
  }

  last_checkpoint_lsn= lsn_korr(buffer + CONTROL_FILE_LSN_OFFSET);
  last_logno= uint4korr(buffer + CONTROL_FILE_FILENO_OFFSET);

  DBUG_RETURN(0);

err:
  /* Closes the file if it is open; harmless if it is not */
  ma_control_file_end();
  DBUG_RETURN(error);
}
/*
  Durably store a new checkpoint LSN and/or a new last log number into the
  control file, then mirror what was stored into the last_checkpoint_lsn and
  last_logno global variables.

  Called when we have created a new log (after syncing this log's creation)
  and when we have written a checkpoint (after syncing this log record).
  Variables last_checkpoint_lsn and last_logno must be protected by caller
  using log's lock, unless this function is called at startup.

  SYNOPSIS
    ma_control_file_write_and_force()
    checkpoint_lsn       LSN of last checkpoint
    logno                last log file number
    objs_to_write        which of the arguments should be used as new values
                         (for example, CONTROL_FILE_UPDATE_ONLY_LSN will not
                         write the logno argument to the control file and will
                         not update the last_logno global variable); can be:
                         CONTROL_FILE_UPDATE_ALL
                         CONTROL_FILE_UPDATE_ONLY_LSN
                         CONTROL_FILE_UPDATE_ONLY_LOGNO.

  NOTE
    The whole file image is built in memory and written with one single
    my_pwrite(), to be as atomic as possible. An object which is not
    selected by objs_to_write is rewritten with its current global value.

  RETURN
    0 - OK
    1 - Error (write or sync failed; global variables are left untouched)
*/
int ma_control_file_write_and_force(const LSN checkpoint_lsn, uint32 logno,
                                    uint objs_to_write)
{
  char buffer[CONTROL_FILE_SIZE];
  my_bool store_new_lsn= FALSE, store_new_logno= FALSE;
  LSN lsn_to_store;
  uint32 logno_to_store;
  uint32 sum;
  DBUG_ENTER("ma_control_file_write_and_force");

  DBUG_ASSERT(control_file_fd >= 0); /* must be open */
#ifndef DBUG_OFF
  if (maria_multi_threaded)
    translog_lock_assert_owner();
#endif

  /* Decide which of the caller's values replace the current ones */
  switch (objs_to_write) {
  case CONTROL_FILE_UPDATE_ONLY_LSN:
    store_new_lsn= TRUE;
    break;
  case CONTROL_FILE_UPDATE_ONLY_LOGNO:
    store_new_logno= TRUE;
    break;
  case CONTROL_FILE_UPDATE_ALL:
    store_new_lsn= store_new_logno= TRUE;
    break;
  default: /* incorrect value of objs_to_write */
    DBUG_ASSERT(0);
    break;
  }

  /* Not updating an object means storing its old value again */
  lsn_to_store= store_new_lsn ? checkpoint_lsn : last_checkpoint_lsn;
  logno_to_store= store_new_logno ? logno : last_logno;

  /* Build the file image: magic string, then payload, then its checksum */
  memcpy(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET,
         CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE);
  lsn_store(buffer + CONTROL_FILE_LSN_OFFSET, lsn_to_store);
  int4store(buffer + CONTROL_FILE_FILENO_OFFSET, logno_to_store);

  sum= (uint32) my_checksum(0, buffer + CONTROL_FILE_LSN_OFFSET,
                            CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET);
  int4store(buffer + CONTROL_FILE_CHECKSUM_OFFSET, sum);

  /* One write of the whole image at offset 0, then force it to disk */
  if (my_pwrite(control_file_fd, buffer, sizeof(buffer),
                0, MYF(MY_FNABP | MY_WME)))
    DBUG_RETURN(1);
  if (my_sync(control_file_fd, MYF(MY_WME)))
    DBUG_RETURN(1);

  /* The file is durable: now the in-memory copies may follow */
  if (store_new_lsn)
    last_checkpoint_lsn= checkpoint_lsn;
  if (store_new_logno)
    last_logno= logno;

  DBUG_RETURN(0);
}
/*
  Free resources taken by control file subsystem

  SYNOPSIS
    ma_control_file_end()

  NOTE
    Does nothing if the control file is not open.

  RETURN
    0      - file was already closed
    other  - what my_close() returned for the control file
*/
int ma_control_file_end()
{
  int close_error= 0;
  DBUG_ENTER("ma_control_file_end");

  if (control_file_fd >= 0) /* something to close */
  {
    close_error= my_close(control_file_fd, MYF(MY_WME));
    /*
      As my_close() frees structures even if close() fails, we do the same,
      i.e. we mark the file as closed in all cases.
    */
    control_file_fd= -1;
    /*
      As this module owns these variables, closing the module forbids access
      to them (just a safety):
    */
    last_checkpoint_lsn= LSN_IMPOSSIBLE;
    last_logno= FILENO_IMPOSSIBLE;
  }

  DBUG_RETURN(close_error);
}