mirror of
https://github.com/MariaDB/server.git
synced 2025-01-17 04:22:27 +01:00
a41ac15b96
1) on Mac OS X >=10.3, fcntl() is recommended over fsync (from the man page: "[With fsync()] the disk drive may also re-order the data so that later writes may be present while earlier writes are not. Applications such as databases that require a strict ordering of writes should use F_FULLFSYNC to ensure their data is written in the order they expect"). I have seen two other pieces of software changing from fsync to F_FULLFSYNC on Mac OS X. 2) to make a file creation/deletion/renaming durable on Linux (at least ext2 as I have tested) (see "man fsync"), a fsync() on the directory is needed: new functions to do that, and a flag MY_SYNC_DIR to do it in my_create/my_delete/my_rename. 3) now using this directory syncing when creating he frm if opt_sync_frm, and for Maria's control file when it is created. include/my_sys.h: new flag to my_create/my_delete/my_rename, which asks to sync the directory after the operation is done (currently does nothing except on Linux) libmysql/CMakeLists.txt: my_create() now depends on my_sync() so my_sync is needed for libmysql libmysql/Makefile.shared: my_create() now depends on my_sync() so my_sync is needed for libmysql mysys/my_create.c: my_create() can now sync the directory if asked for mysys/my_delete.c: my_delete() can now sync the directory if asked for mysys/my_open.c: it was a bug that my_close() is done on fd but a positive fd would still be returned, by my_register_filename(). mysys/my_rename.c: my_rename() can now sync the two directories (the one of "from" and the one of "to") if asked for. mysys/my_sync.c: On recent Mac OS X, fcntl(F_FULLFSYNC) is recommended over fsync() (see "man fsync" on Mac OS X 10.3). my_sync_dir(): to sync a directory after a file creation/deletion/ renaming; can be called directly or via MY_SYNC_DIR in my_create/ my_delete/my_rename(). No-op except on Linux (see "man fsync" on Linux). my_sync_dir_from_file(): same as above, just more practical when the caller has a file name but no directory name ready. Should the #warning even be a #error? I mean do we want to release binaries which don't guarantee any durability? sql/log.cc: a TODO for the future. sql/unireg.cc: If we sync the frm it makes sense to also sync its creation in the directory. storage/maria/ma_control_file.c: control file is vital, try to make it to disk
337 lines
11 KiB
C
337 lines
11 KiB
C
/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
|
|
|
|
/*
|
|
WL#3234 Maria control file
|
|
First version written by Guilhem Bichot on 2006-04-27.
|
|
Does not compile yet.
|
|
*/
|
|
|
|
#include "maria_def.h"
|
|
#include "ma_control_file.h"
|
|
|
|
/* Here is the implementation of this module */
|
|
|
|
/*
|
|
a control file contains 3 objects: magic string, LSN of last checkpoint,
|
|
number of last log.
|
|
*/
|
|
|
|
/* total size should be < sector size for atomic write operation */
|
|
#define CONTROL_FILE_MAGIC_STRING "MACF"
|
|
#define CONTROL_FILE_MAGIC_STRING_OFFSET 0
|
|
#define CONTROL_FILE_MAGIC_STRING_SIZE (sizeof(CONTROL_FILE_MAGIC_STRING)-1)
|
|
#define CONTROL_FILE_CHECKSUM_OFFSET (CONTROL_FILE_MAGIC_STRING_OFFSET + CONTROL_FILE_MAGIC_STRING_SIZE)
|
|
#define CONTROL_FILE_CHECKSUM_SIZE 1
|
|
#define CONTROL_FILE_LSN_OFFSET (CONTROL_FILE_CHECKSUM_OFFSET + CONTROL_FILE_CHECKSUM_SIZE)
|
|
#define CONTROL_FILE_LSN_SIZE (4+4)
|
|
#define CONTROL_FILE_FILENO_OFFSET (CONTROL_FILE_LSN_OFFSET + CONTROL_FILE_LSN_SIZE)
|
|
#define CONTROL_FILE_FILENO_SIZE 4
|
|
#define CONTROL_FILE_SIZE (CONTROL_FILE_FILENO_OFFSET + CONTROL_FILE_FILENO_SIZE)
|
|
|
|
/*
|
|
This module owns these two vars.
|
|
uint32 is always atomically updated, but LSN is 8 bytes, we will need
|
|
provisions to ensure that it's updated atomically in
|
|
ma_control_file_write_and_force(). Probably the log mutex could be
|
|
used. TODO.
|
|
*/
|
|
LSN last_checkpoint_lsn;
|
|
uint32 last_logno;
|
|
|
|
|
|
/*
|
|
Control file is less then 512 bytes (a disk sector),
|
|
to be as atomic as possible
|
|
*/
|
|
static int control_file_fd= -1;
|
|
|
|
static void lsn8store(char *buffer, const LSN *lsn)
|
|
{
|
|
int4store(buffer, lsn->file_no);
|
|
int4store(buffer + CONTROL_FILE_FILENO_SIZE, lsn->rec_offset);
|
|
}
|
|
|
|
static LSN lsn8korr(char *buffer)
|
|
{
|
|
LSN tmp;
|
|
tmp.file_no= uint4korr(buffer);
|
|
tmp.rec_offset= uint4korr(buffer + CONTROL_FILE_FILENO_SIZE);
|
|
return tmp;
|
|
}
|
|
|
|
static char simple_checksum(char *buffer, uint size)
|
|
{
|
|
/* TODO: improve this sum if we want */
|
|
char s= 0;
|
|
uint i;
|
|
for (i= 0; i<size; i++)
|
|
s+= buffer[i];
|
|
return s;
|
|
}
|
|
|
|
/*
|
|
Initialize control file subsystem
|
|
|
|
SYNOPSIS
|
|
ma_control_file_create_or_open()
|
|
|
|
Looks for the control file. If absent, it's a fresh start, creates file.
|
|
If present, reads it to find out last checkpoint's LSN and last log, updates
|
|
the last_checkpoint_lsn and last_logno global variables.
|
|
Called at engine's start.
|
|
|
|
The format of the control file is:
|
|
4 bytes: magic string
|
|
1 byte: checksum of the following bytes
|
|
4 bytes: number of log where last checkpoint is
|
|
4 bytes: offset in log where last checkpoint is
|
|
4 bytes: number of last log
|
|
|
|
RETURN
|
|
0 - OK
|
|
1 - Error (in which case the file is left closed)
|
|
*/
|
|
CONTROL_FILE_ERROR ma_control_file_create_or_open()
|
|
{
|
|
char buffer[CONTROL_FILE_SIZE];
|
|
char name[FN_REFLEN];
|
|
MY_STAT stat_buff;
|
|
my_bool create_file;
|
|
int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR;
|
|
int error= CONTROL_FILE_UNKNOWN_ERROR;
|
|
DBUG_ENTER("ma_control_file_create_or_open");
|
|
|
|
/*
|
|
If you change sizes in the #defines, you at least have to change the
|
|
"*store" and "*korr" calls in this file, and can even create backward
|
|
compatibility problems. Beware!
|
|
*/
|
|
DBUG_ASSERT(CONTROL_FILE_LSN_SIZE == (4+4));
|
|
DBUG_ASSERT(CONTROL_FILE_FILENO_SIZE == 4);
|
|
|
|
if (control_file_fd >= 0) /* already open */
|
|
DBUG_RETURN(0);
|
|
|
|
if (fn_format(name, CONTROL_FILE_BASE_NAME,
|
|
maria_data_root, "", MYF(MY_WME)) == NullS)
|
|
DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
|
|
|
|
create_file= test(my_access(name,F_OK));
|
|
|
|
if (create_file)
|
|
{
|
|
if ((control_file_fd= my_create(name, 0,
|
|
open_flags, MYF(MY_SYNC_DIR))) < 0)
|
|
DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
|
|
|
|
/*
|
|
To be safer we should make sure that there are no logs or data/index
|
|
files around (indeed it could be that the control file alone was deleted
|
|
or not restored, and we should not go on with life at this point).
|
|
|
|
TODO: For now we trust (this is alpha version), but for beta if would
|
|
be great to verify.
|
|
|
|
We could have a tool which can rebuild the control file, by reading the
|
|
directory of logs, finding the newest log, reading it to find last
|
|
checkpoint... Slow but can save your db. For this to be possible, we
|
|
must always write to the control file right after writing the checkpoint
|
|
log record, and do nothing in between (i.e. the checkpoint must be
|
|
usable as soon as it has been written to the log).
|
|
*/
|
|
|
|
LSN imposs_lsn= CONTROL_FILE_IMPOSSIBLE_LSN;
|
|
uint32 imposs_logno= CONTROL_FILE_IMPOSSIBLE_FILENO;
|
|
|
|
/* init the file with these "undefined" values */
|
|
DBUG_RETURN(ma_control_file_write_and_force(&imposs_lsn, imposs_logno,
|
|
CONTROL_FILE_UPDATE_ALL));
|
|
}
|
|
|
|
/* Otherwise, file exists */
|
|
|
|
if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0)
|
|
goto err;
|
|
|
|
if (my_stat(name, &stat_buff, MYF(MY_WME)) == NULL)
|
|
goto err;
|
|
|
|
if ((uint)stat_buff.st_size < CONTROL_FILE_SIZE)
|
|
{
|
|
/*
|
|
Given that normally we write only a sector and it's atomic, the only
|
|
possibility for a file to be of too short size is if we crashed at the
|
|
very first startup, between file creation and file write. Quite unlikely
|
|
(and can be made even more unlikely by doing this: create a temp file,
|
|
write it, and then rename it to be the control file).
|
|
What's more likely is if someone forgot to restore the control file,
|
|
just did a "touch control" to try to get Maria to start, or if the
|
|
disk/filesystem has a problem.
|
|
So let's be rigid.
|
|
*/
|
|
/*
|
|
TODO: store a message "too small file" somewhere, so that it goes to
|
|
MySQL's error log at startup.
|
|
*/
|
|
error= CONTROL_FILE_TOO_SMALL;
|
|
goto err;
|
|
}
|
|
|
|
if ((uint)stat_buff.st_size > CONTROL_FILE_SIZE)
|
|
{
|
|
/* TODO: store "too big file" message */
|
|
error= CONTROL_FILE_TOO_BIG;
|
|
goto err;
|
|
}
|
|
|
|
if (my_read(control_file_fd, buffer, CONTROL_FILE_SIZE,
|
|
MYF(MY_FNABP | MY_WME)))
|
|
goto err;
|
|
if (memcmp(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET,
|
|
CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE))
|
|
{
|
|
/* TODO: store message "bad magic string" somewhere */
|
|
error= CONTROL_FILE_BAD_MAGIC_STRING;
|
|
goto err;
|
|
}
|
|
if (simple_checksum(buffer + CONTROL_FILE_LSN_OFFSET,
|
|
CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET) !=
|
|
buffer[CONTROL_FILE_CHECKSUM_OFFSET])
|
|
{
|
|
/* TODO: store message "checksum mismatch" somewhere */
|
|
error= CONTROL_FILE_BAD_CHECKSUM;
|
|
goto err;
|
|
}
|
|
last_checkpoint_lsn= lsn8korr(buffer + CONTROL_FILE_LSN_OFFSET);
|
|
last_logno= uint4korr(buffer + CONTROL_FILE_FILENO_OFFSET);
|
|
|
|
DBUG_RETURN(0);
|
|
err:
|
|
ma_control_file_end();
|
|
DBUG_RETURN(error);
|
|
}
|
|
|
|
|
|
/*
|
|
Write information durably to the control file; stores this information into
|
|
the last_checkpoint_lsn and last_logno global variables.
|
|
Called when we have created a new log (after syncing this log's creation)
|
|
and when we have written a checkpoint (after syncing this log record).
|
|
|
|
SYNOPSIS
|
|
ma_control_file_write_and_force()
|
|
checkpoint_lsn LSN of last checkpoint
|
|
logno last log file number
|
|
objs_to_write which of the arguments should be used as new values
|
|
(for example, CONTROL_FILE_UPDATE_ONLY_LSN will not
|
|
write the logno argument to the control file and will
|
|
not update the last_logno global variable); can be:
|
|
CONTROL_FILE_UPDATE_ALL
|
|
CONTROL_FILE_UPDATE_ONLY_LSN
|
|
CONTROL_FILE_UPDATE_ONLY_LOGNO.
|
|
|
|
NOTE
|
|
We always want to do one single my_pwrite() here to be as atomic as
|
|
possible.
|
|
|
|
RETURN
|
|
0 - OK
|
|
1 - Error
|
|
*/
|
|
|
|
int ma_control_file_write_and_force(const LSN *checkpoint_lsn, uint32 logno,
|
|
uint objs_to_write)
|
|
{
|
|
char buffer[CONTROL_FILE_SIZE];
|
|
my_bool update_checkpoint_lsn= FALSE, update_logno= FALSE;
|
|
DBUG_ENTER("ma_control_file_write_and_force");
|
|
|
|
DBUG_ASSERT(control_file_fd >= 0); /* must be open */
|
|
|
|
memcpy(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET,
|
|
CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE);
|
|
|
|
/* TODO: you need some protection to be able to read last_* global vars */
|
|
|
|
if (objs_to_write == CONTROL_FILE_UPDATE_ONLY_LSN)
|
|
update_checkpoint_lsn= TRUE;
|
|
else if (objs_to_write == CONTROL_FILE_UPDATE_ONLY_LOGNO)
|
|
update_logno= TRUE;
|
|
else if (objs_to_write == CONTROL_FILE_UPDATE_ALL)
|
|
update_checkpoint_lsn= update_logno= TRUE;
|
|
else /* incorrect value of objs_to_write */
|
|
DBUG_ASSERT(0);
|
|
|
|
if (update_checkpoint_lsn)
|
|
lsn8store(buffer + CONTROL_FILE_LSN_OFFSET, checkpoint_lsn);
|
|
else /* store old value == change nothing */
|
|
lsn8store(buffer + CONTROL_FILE_LSN_OFFSET, &last_checkpoint_lsn);
|
|
|
|
if (update_logno)
|
|
int4store(buffer + CONTROL_FILE_FILENO_OFFSET, logno);
|
|
else
|
|
int4store(buffer + CONTROL_FILE_FILENO_OFFSET, last_logno);
|
|
|
|
buffer[CONTROL_FILE_CHECKSUM_OFFSET]=
|
|
simple_checksum(buffer + CONTROL_FILE_LSN_OFFSET,
|
|
CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET);
|
|
|
|
if (my_pwrite(control_file_fd, buffer, sizeof(buffer),
|
|
0, MYF(MY_FNABP | MY_WME)) ||
|
|
my_sync(control_file_fd, MYF(MY_WME)))
|
|
DBUG_RETURN(1);
|
|
|
|
/* TODO: you need some protection to be able to write last_* global vars */
|
|
if (update_checkpoint_lsn)
|
|
last_checkpoint_lsn= *checkpoint_lsn;
|
|
if (update_logno)
|
|
last_logno= logno;
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
|
|
/*
|
|
Free resources taken by control file subsystem
|
|
|
|
SYNOPSIS
|
|
ma_control_file_end()
|
|
*/
|
|
|
|
int ma_control_file_end()
|
|
{
|
|
int close_error;
|
|
DBUG_ENTER("ma_control_file_end");
|
|
|
|
if (control_file_fd < 0) /* already closed */
|
|
DBUG_RETURN(0);
|
|
|
|
close_error= my_close(control_file_fd, MYF(MY_WME));
|
|
/*
|
|
As my_close() frees structures even if close() fails, we do the same,
|
|
i.e. we mark the file as closed in all cases.
|
|
*/
|
|
control_file_fd= -1;
|
|
/*
|
|
As this module owns these variables, closing the module forbids access to
|
|
them (just a safety):
|
|
*/
|
|
last_checkpoint_lsn= CONTROL_FILE_IMPOSSIBLE_LSN;
|
|
last_logno= CONTROL_FILE_IMPOSSIBLE_FILENO;
|
|
|
|
DBUG_RETURN(close_error);
|
|
}
|