mirror of
https://github.com/MariaDB/server.git
synced 2025-01-16 03:52:35 +01:00
WL#3072 Maria Recovery. Making DDLs durable in Maria:
Sync table files after CREATE (of a non-temporary table), DROP, RENAME and TRUNCATE; also sync directories and symlinks (for the first three commands). Comments for future log records. In ma_rename(), if the rename of the index file works and then the rename of the data file fails, try to undo the rename of the index file to leave a consistent state.
mysys/my_symlink.c: sync the directory after creation of a symbolic link in it, if asked.
mysys/my_sync.c: comment. Fix for when the file's name has no directory in it.
storage/maria/ma_create.c: sync files, links and directories when creating a non-temporary table. Optimizations of the above to reduce syncs in the common cases:
* if the index file and the data file have the exact same paths (regular and link), sync the directories (of regular and link) only once, after creating the last file (the data file).
* don't sync the data file if we didn't write to it (always true in our builds).
storage/maria/ma_delete_all.c: sync files after truncating a table.
storage/maria/ma_delete_table.c: sync files, symbolic links and directories after dropping a table.
storage/maria/ma_extra.c: a function which wraps the sync of the index file and the sync of the data file.
storage/maria/ma_locking.c: use the wrapper function.
storage/maria/ma_rename.c: sync files, symbolic links and directories after renaming a table. If the rename of the index file works and then the rename of the data file fails, try to undo the rename of the index file to leave a consistent state. That is just a try; it may fail...
storage/maria/ma_test3.c: warning to not pay attention to this test.
storage/maria/maria_def.h: declaration for the function added to ma_extra.c.
This commit is contained in:
parent
adfba203ff
commit
de6f550ec7
10 changed files with 134 additions and 24 deletions
|
@ -85,6 +85,8 @@ int my_symlink(const char *content, const char *linkname, myf MyFlags)
|
|||
if (MyFlags & MY_WME)
|
||||
my_error(EE_CANT_SYMLINK, MYF(0), linkname, content, errno);
|
||||
}
|
||||
else if ((MyFlags & MY_SYNC_DIR) && my_sync_dir_by_file(linkname, MyFlags))
|
||||
result= -1;
|
||||
DBUG_RETURN(result);
|
||||
#endif /* HAVE_READLINK */
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ int my_sync(File fd, myf my_flags)
|
|||
#if defined(F_FULLFSYNC)
|
||||
/*
|
||||
In Mac OS X >= 10.3 this call is safer than fsync() (it forces the
|
||||
disk's cache).
|
||||
disk's cache and guarantees ordered writes).
|
||||
*/
|
||||
if (!(res= fcntl(fd, F_FULLFSYNC, 0)))
|
||||
break; /* ok */
|
||||
|
@ -89,6 +89,7 @@ int my_sync(File fd, myf my_flags)
|
|||
} /* my_sync */
|
||||
|
||||
|
||||
static const char cur_dir_name[]= {FN_CURLIB, 0};
|
||||
/*
|
||||
Force directory information to disk.
|
||||
|
||||
|
@ -107,11 +108,14 @@ int my_sync_dir(const char *dir_name, myf my_flags)
|
|||
DBUG_PRINT("my",("Dir: '%s' my_flags: %d", dir_name, my_flags));
|
||||
File dir_fd;
|
||||
int res= 0;
|
||||
const char *correct_dir_name;
|
||||
/* Sometimes the path does not contain an explicit directory */
|
||||
correct_dir_name= (dir_name[0] == 0) ? cur_dir_name : dir_name;
|
||||
/*
|
||||
Syncing a dir may give EINVAL on tmpfs on Linux, which is ok.
|
||||
EIO on the other hand is very important. Hence MY_IGNORE_BADFD.
|
||||
*/
|
||||
if ((dir_fd= my_open(dir_name, O_RDONLY, MYF(my_flags))) >= 0)
|
||||
if ((dir_fd= my_open(correct_dir_name, O_RDONLY, MYF(my_flags))) >= 0)
|
||||
{
|
||||
if (my_sync(dir_fd, MYF(my_flags | MY_IGNORE_BADFD)))
|
||||
res= 2;
|
||||
|
|
|
@ -60,6 +60,8 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
ulong *rec_per_key_part;
|
||||
my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MARIA_MAX_KEY_BLOCK_SIZE];
|
||||
MARIA_CREATE_INFO tmp_create_info;
|
||||
my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */
|
||||
myf sync_dir= MY_SYNC_DIR;
|
||||
DBUG_ENTER("maria_create");
|
||||
DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u",
|
||||
keys, columns, uniques, flags));
|
||||
|
@ -560,7 +562,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
|
||||
/* max_data_file_length and max_key_file_length are recalculated on open */
|
||||
if (options & HA_OPTION_TMP_TABLE)
|
||||
{
|
||||
tmp_table= TRUE;
|
||||
sync_dir= 0;
|
||||
share.base.max_data_file_length=(my_off_t) ci->data_file_length;
|
||||
}
|
||||
|
||||
share.base.min_block_length=
|
||||
(share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH &&
|
||||
|
@ -576,7 +582,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
{
|
||||
char *iext= strrchr(ci->index_file_name, '.');
|
||||
int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
|
||||
if (options & HA_OPTION_TMP_TABLE)
|
||||
if (tmp_table)
|
||||
{
|
||||
char *path;
|
||||
/* chop off the table name, temporary tables use generated name */
|
||||
|
@ -597,8 +603,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
/*
|
||||
Don't create the table if the link or file exists to ensure that one
|
||||
doesn't accidentally destroy another table.
|
||||
Don't sync dir now if the data file has the same path.
|
||||
*/
|
||||
create_flag=0;
|
||||
create_flag=
|
||||
(ci->data_file_name &&
|
||||
!strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -607,8 +616,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
(flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) |
|
||||
MY_APPEND_EXT);
|
||||
linkname_ptr=0;
|
||||
/* Replace the current file */
|
||||
create_flag=MY_DELETE_OLD;
|
||||
/*
|
||||
Replace the current file.
|
||||
Don't sync dir now if the data file has the same path.
|
||||
*/
|
||||
create_flag= MY_DELETE_OLD | (!ci->data_file_name ? 0 : sync_dir);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -627,7 +639,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
}
|
||||
|
||||
if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
|
||||
MYF(MY_WME | create_flag))) < 0)
|
||||
MYF(MY_WME|create_flag))) < 0)
|
||||
goto err;
|
||||
errpos=1;
|
||||
|
||||
|
@ -653,7 +665,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
char *dext= strrchr(ci->data_file_name, '.');
|
||||
int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT);
|
||||
|
||||
if (options & HA_OPTION_TMP_TABLE)
|
||||
if (tmp_table)
|
||||
{
|
||||
char *path;
|
||||
/* chop off the table name, temporary tables use generated name */
|
||||
|
@ -682,7 +694,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
}
|
||||
if ((dfile=
|
||||
my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
|
||||
MYF(MY_WME | create_flag))) < 0)
|
||||
MYF(MY_WME | create_flag | sync_dir))) < 0)
|
||||
goto err;
|
||||
}
|
||||
errpos=3;
|
||||
|
@ -802,12 +814,18 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0)))
|
||||
goto err;
|
||||
|
||||
if (!tmp_table && my_sync(file, MYF(0)))
|
||||
goto err;
|
||||
|
||||
if (! (flags & HA_DONT_TOUCH_DATA))
|
||||
{
|
||||
#ifdef USE_RELOC
|
||||
if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0)))
|
||||
goto err;
|
||||
if (!tmp_table && my_sync(dfile, MYF(0)))
|
||||
goto err;
|
||||
#endif
|
||||
/* if !USE_RELOC, there was no write to the file, no need to sync it */
|
||||
errpos=2;
|
||||
if (my_close(dfile,MYF(0)))
|
||||
goto err;
|
||||
|
@ -816,6 +834,19 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
|
|||
pthread_mutex_unlock(&THR_LOCK_maria);
|
||||
if (my_close(file,MYF(0)))
|
||||
goto err;
|
||||
/*
|
||||
RECOVERYTODO
|
||||
Write a log record describing the CREATE operation (just the file
|
||||
names, link names, and the full header's content).
|
||||
For this record to be of any use for Recovery, we need the upper
|
||||
MySQL layer to be crash-safe, which it is not now (that would require work
|
||||
using the ddl_log of sql/sql_table.cc); when it is, we should reconsider
|
||||
the moment of writing this log record (before or after op, under
|
||||
THR_LOCK_maria or not...), how to use it in Recovery, and force the log.
|
||||
For now this record is just informative.
|
||||
If operation failed earlier, we clean up in "err:" and the MySQL layer
|
||||
will clean up the frm, so we needn't write anything to the log.
|
||||
*/
|
||||
my_free((char*) rec_per_key_part,MYF(0));
|
||||
DBUG_RETURN(0);
|
||||
|
||||
|
@ -831,14 +862,14 @@ err:
|
|||
if (! (flags & HA_DONT_TOUCH_DATA))
|
||||
my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT,
|
||||
MY_UNPACK_FILENAME | MY_APPEND_EXT),
|
||||
MYF(0));
|
||||
MYF(sync_dir));
|
||||
/* fall through */
|
||||
case 1:
|
||||
VOID(my_close(file,MYF(0)));
|
||||
if (! (flags & HA_DONT_TOUCH_DATA))
|
||||
my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT,
|
||||
MY_UNPACK_FILENAME | MY_APPEND_EXT),
|
||||
MYF(0));
|
||||
MYF(sync_dir));
|
||||
}
|
||||
my_free((char*) rec_per_key_part, MYF(0));
|
||||
DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */
|
||||
|
|
|
@ -30,6 +30,7 @@ int maria_delete_all_rows(MARIA_HA *info)
|
|||
{
|
||||
DBUG_RETURN(my_errno=EACCES);
|
||||
}
|
||||
/* LOCKTODO take X-lock on table here */
|
||||
if (_ma_readinfo(info,F_WRLCK,1))
|
||||
DBUG_RETURN(my_errno);
|
||||
if (_ma_mark_file_changed(info))
|
||||
|
@ -53,9 +54,23 @@ int maria_delete_all_rows(MARIA_HA *info)
|
|||
since it was locked then there may be key blocks in the key cache
|
||||
*/
|
||||
flush_key_blocks(share->key_cache, share->kfile, FLUSH_IGNORE_CHANGED);
|
||||
/*
|
||||
RECOVERYTODO Log the two chsize and header modifications and force the
|
||||
log. So that if crash between the two chsize, we finish the work at
|
||||
Recovery. For this scenario:
|
||||
"TRUNCATE TABLE t1; DROP TABLE t1; RENAME TABLE t2 to t1; crash;"
|
||||
Recovery mustn't truncate the new t1, so the log records of TRUNCATE
|
||||
should be applied only if t1 exists and its ZeroDirtyPagesLSN is smaller
|
||||
than the records'. See more comments below.
|
||||
*/
|
||||
if (my_chsize(info->dfile, 0, 0, MYF(MY_WME)) ||
|
||||
my_chsize(share->kfile, share->base.keystart, 0, MYF(MY_WME)) )
|
||||
goto err;
|
||||
/*
|
||||
RECOVERYTODO Consider updating ZeroDirtyPagesLSN here. It is
|
||||
not a necessity (it is one only in RENAME commands) but an optional
|
||||
optimization which will allow some REDO skipping at Recovery.
|
||||
*/
|
||||
VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
|
||||
#ifdef HAVE_MMAP
|
||||
/* Resize mmaped area */
|
||||
|
@ -63,14 +78,25 @@ int maria_delete_all_rows(MARIA_HA *info)
|
|||
_ma_remap_file(info, (my_off_t)0);
|
||||
rw_unlock(&info->s->mmap_lock);
|
||||
#endif
|
||||
/*
|
||||
RECOVERYTODO Until we have the TRUNCATE log record and take it into
|
||||
account for log-low-water-mark calculation and use it in Recovery, we need
|
||||
to sync.
|
||||
*/
|
||||
if (_ma_sync_table_files(info))
|
||||
goto err;
|
||||
allow_break(); /* Allow SIGHUP & SIGINT */
|
||||
DBUG_RETURN(0);
|
||||
|
||||
err:
|
||||
{
|
||||
int save_errno=my_errno;
|
||||
/* RECOVERYTODO log the header modifications */
|
||||
VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
|
||||
info->update|=HA_STATE_WRITTEN; /* Buffer changed */
|
||||
/* RECOVERYTODO until we log above we have to sync */
|
||||
if (_ma_sync_table_files(info) && !save_errno)
|
||||
save_errno= my_errno;
|
||||
allow_break(); /* Allow SIGHUP & SIGINT */
|
||||
DBUG_RETURN(my_errno=save_errno);
|
||||
}
|
||||
|
|
|
@ -31,6 +31,7 @@ int maria_delete_table(const char *name)
|
|||
#ifdef EXTRA_DEBUG
|
||||
_ma_check_table_is_closed(name,"delete");
|
||||
#endif
|
||||
/* LOCKTODO take X-lock on table here */
|
||||
#ifdef USE_RAID
|
||||
{
|
||||
MARIA_HA *info;
|
||||
|
@ -59,12 +60,22 @@ int maria_delete_table(const char *name)
|
|||
#endif /* USE_RAID */
|
||||
|
||||
fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
|
||||
if (my_delete_with_symlink(from, MYF(MY_WME)))
|
||||
/*
|
||||
RECOVERYTODO log the two deletes below.
|
||||
Then do the file deletions.
|
||||
For this log record to be of any use for Recovery, we need the upper MySQL
|
||||
layer to be crash-safe in DDLs; when it is we should reconsider the moment
|
||||
of writing this log record, how to use it in Recovery, and force the log.
|
||||
For now this record is only informative.
|
||||
*/
|
||||
if (my_delete_with_symlink(from, MYF(MY_WME | MY_SYNC_DIR)))
|
||||
DBUG_RETURN(my_errno);
|
||||
fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
|
||||
#ifdef USE_RAID
|
||||
if (raid_type)
|
||||
DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME)) ? my_errno : 0);
|
||||
DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | MY_SYNC_DIR)) ?
|
||||
my_errno : 0);
|
||||
#endif
|
||||
DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME)) ? my_errno : 0);
|
||||
DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | MY_SYNC_DIR)) ?
|
||||
my_errno : 0);
|
||||
}
|
||||
|
|
|
@ -316,9 +316,7 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function, void *extra_arg
|
|||
if (share->not_flushed)
|
||||
{
|
||||
share->not_flushed=0;
|
||||
if (my_sync(share->kfile, MYF(0)))
|
||||
error= my_errno;
|
||||
if (my_sync(info->dfile, MYF(0)))
|
||||
if (_ma_sync_table_files(info))
|
||||
error= my_errno;
|
||||
if (error)
|
||||
{
|
||||
|
@ -439,3 +437,10 @@ int maria_reset(MARIA_HA *info)
|
|||
HA_STATE_PREV_FOUND);
|
||||
DBUG_RETURN(error);
|
||||
}
|
||||
|
||||
|
||||
/*
  Sync the files of a table: calls my_sync() on the data file
  (info->dfile) first, then on the index file (info->s->kfile).

  The two calls are joined with ||, so evaluation short-circuits: if
  syncing the data file returns non-zero, the index file is NOT synced
  by this call.

  Returns 0 if both my_sync() calls returned 0, 1 otherwise.
  NOTE(review): callers read my_errno after a non-zero return; presumably
  my_sync() sets it on failure - confirm against mysys/my_sync.c.
*/
int _ma_sync_table_files(const MARIA_HA *info)
|
||||
{
|
||||
return (my_sync(info->dfile, MYF(0)) ||
|
||||
my_sync(info->s->kfile, MYF(0)));
|
||||
}
|
||||
|
|
|
@ -103,9 +103,7 @@ int maria_lock_database(MARIA_HA *info, int lock_type)
|
|||
share->changed=0;
|
||||
if (maria_flush)
|
||||
{
|
||||
if (my_sync(share->kfile, MYF(0)))
|
||||
error= my_errno;
|
||||
if (my_sync(info->dfile, MYF(0)))
|
||||
if (_ma_sync_table_files(info))
|
||||
error= my_errno;
|
||||
}
|
||||
else
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
int maria_rename(const char *old_name, const char *new_name)
|
||||
{
|
||||
char from[FN_REFLEN],to[FN_REFLEN];
|
||||
int data_file_rename_error;
|
||||
#ifdef USE_RAID
|
||||
uint raid_type=0,raid_chunks=0;
|
||||
#endif
|
||||
|
@ -32,6 +33,7 @@ int maria_rename(const char *old_name, const char *new_name)
|
|||
_ma_check_table_is_closed(old_name,"rename old_table");
|
||||
_ma_check_table_is_closed(new_name,"rename new table2");
|
||||
#endif
|
||||
/* LOCKTODO take X-lock on table here */
|
||||
#ifdef USE_RAID
|
||||
{
|
||||
MARIA_HA *info;
|
||||
|
@ -48,14 +50,40 @@ int maria_rename(const char *old_name, const char *new_name)
|
|||
|
||||
fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
|
||||
fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
|
||||
if (my_rename_with_symlink(from, to, MYF(MY_WME)))
|
||||
/*
|
||||
RECOVERYTODO log the two renames below. Update
|
||||
ZeroDirtyPagesLSN of the table on disk (=> sync the files), this is
|
||||
needed so that Recovery does not pick a wrong table.
|
||||
Then do the file renames.
|
||||
For this log record to be of any use for Recovery, we need the upper MySQL
|
||||
layer to be crash-safe in DDLs; when it is we should reconsider the moment
|
||||
of writing this log record, how to use it in Recovery, and force the log.
|
||||
For now this record is only informative. But ZeroDirtyPagesLSN is
|
||||
critically needed!
|
||||
*/
|
||||
if (my_rename_with_symlink(from, to, MYF(MY_WME | MY_SYNC_DIR)))
|
||||
DBUG_RETURN(my_errno);
|
||||
fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
|
||||
fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
|
||||
#ifdef USE_RAID
|
||||
if (raid_type)
|
||||
DBUG_RETURN(my_raid_rename(from, to, raid_chunks, MYF(MY_WME)) ? my_errno :
|
||||
0);
|
||||
data_file_rename_error= my_raid_rename(from, to, raid_chunks,
|
||||
MYF(MY_WME | MY_SYNC_DIR));
|
||||
else
|
||||
#endif
|
||||
DBUG_RETURN(my_rename_with_symlink(from, to,MYF(MY_WME)) ? my_errno : 0);
|
||||
data_file_rename_error=
|
||||
my_rename_with_symlink(from, to, MYF(MY_WME | MY_SYNC_DIR));
|
||||
if (data_file_rename_error)
|
||||
{
|
||||
/*
|
||||
now we have a renamed index file and a non-renamed data file, try to
|
||||
undo the rename of the index file.
|
||||
*/
|
||||
data_file_rename_error= my_errno;
|
||||
fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
|
||||
fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
|
||||
my_rename_with_symlink(to, from, MYF(MY_WME | MY_SYNC_DIR));
|
||||
}
|
||||
DBUG_RETURN(data_file_rename_error);
|
||||
|
||||
}
|
||||
|
|
|
@ -65,6 +65,10 @@ int main(int argc,char **argv)
|
|||
MY_INIT(argv[0]);
|
||||
get_options(argc,argv);
|
||||
|
||||
fprintf(stderr, "WARNING! this program is to test 'external locking'"
|
||||
" (when several processes share a table through file locking)"
|
||||
" which is not supported by Maria at all; expect errors."
|
||||
" We may soon remove this program.\n");
|
||||
maria_init();
|
||||
bzero((char*) keyinfo,sizeof(keyinfo));
|
||||
bzero((char*) recinfo,sizeof(recinfo));
|
||||
|
|
|
@ -741,3 +741,4 @@ int _ma_flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file);
|
|||
int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param);
|
||||
int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
|
||||
ulong);
|
||||
int _ma_sync_table_files(const MARIA_HA *info);
|
||||
|
|
Loading…
Reference in a new issue