WL#3072 Maria Recovery. Making DDLs durable in Maria:

Sync table files after CREATE (of non-temp table), DROP, RENAME,
TRUNCATE; sync directories and symlinks (for the first three commands).
Comments for future log records.
In ma_rename(), if rename of index works and then rename of data fails,
try to undo the rename of the index to leave a consistent state.


mysys/my_symlink.c:
  sync directory after creation of a symbolic link in it, if asked
mysys/my_sync.c:
  comment. Fix for when the file's name has no directory in it.
storage/maria/ma_create.c:
  sync files and links and dirs when creating a non-temporary table.
  Optimizations of the above to reduce syncs in the common cases:
  * if index file and data file have the exact same paths (regular
  and link), sync the directories (of regular and link) only once
  after creating the last file (the data file).
  * don't sync the data file if we didn't write to it (always true
  in our builds).
storage/maria/ma_delete_all.c:
  sync files after truncating a table
storage/maria/ma_delete_table.c:
  sync files and symbolic links and dirs after dropping a table
storage/maria/ma_extra.c:
  a function which wraps the sync of the index file and the sync of the
  data file.
storage/maria/ma_locking.c:
  using a wrapper function
storage/maria/ma_rename.c:
  sync files and symbolic links and dirs after renaming a table.
  If rename of index works and then rename of data fails, try to undo
  the rename of the index to leave a consistent state. That is just a
  try, it may fail...
storage/maria/ma_test3.c:
  warning to not pay attention to this test.
storage/maria/maria_def.h:
  declaration for the function added to ma_extra.c
This commit is contained in:
unknown 2006-11-27 22:01:29 +01:00
parent adfba203ff
commit de6f550ec7
10 changed files with 134 additions and 24 deletions

View file

@ -85,6 +85,8 @@ int my_symlink(const char *content, const char *linkname, myf MyFlags)
if (MyFlags & MY_WME)
my_error(EE_CANT_SYMLINK, MYF(0), linkname, content, errno);
}
else if ((MyFlags & MY_SYNC_DIR) && my_sync_dir_by_file(linkname, MyFlags))
result= -1;
DBUG_RETURN(result);
#endif /* HAVE_READLINK */
}

View file

@ -52,7 +52,7 @@ int my_sync(File fd, myf my_flags)
#if defined(F_FULLFSYNC)
/*
In Mac OS X >= 10.3 this call is safer than fsync() (it forces the
disk's cache).
disk's cache and guarantees ordered writes).
*/
if (!(res= fcntl(fd, F_FULLFSYNC, 0)))
break; /* ok */
@ -89,6 +89,7 @@ int my_sync(File fd, myf my_flags)
} /* my_sync */
static const char cur_dir_name[]= {FN_CURLIB, 0};
/*
Force directory information to disk.
@ -107,11 +108,14 @@ int my_sync_dir(const char *dir_name, myf my_flags)
DBUG_PRINT("my",("Dir: '%s' my_flags: %d", dir_name, my_flags));
File dir_fd;
int res= 0;
const char *correct_dir_name;
/* Sometimes the path does not contain an explicit directory */
correct_dir_name= (dir_name[0] == 0) ? cur_dir_name : dir_name;
/*
Syncing a dir may give EINVAL on tmpfs on Linux, which is ok.
EIO on the other hand is very important. Hence MY_IGNORE_BADFD.
*/
if ((dir_fd= my_open(dir_name, O_RDONLY, MYF(my_flags))) >= 0)
if ((dir_fd= my_open(correct_dir_name, O_RDONLY, MYF(my_flags))) >= 0)
{
if (my_sync(dir_fd, MYF(my_flags | MY_IGNORE_BADFD)))
res= 2;

View file

@ -60,6 +60,8 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
ulong *rec_per_key_part;
my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MARIA_MAX_KEY_BLOCK_SIZE];
MARIA_CREATE_INFO tmp_create_info;
my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */
myf sync_dir= MY_SYNC_DIR;
DBUG_ENTER("maria_create");
DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u",
keys, columns, uniques, flags));
@ -560,7 +562,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
/* max_data_file_length and max_key_file_length are recalculated on open */
if (options & HA_OPTION_TMP_TABLE)
{
tmp_table= TRUE;
sync_dir= 0;
share.base.max_data_file_length=(my_off_t) ci->data_file_length;
}
share.base.min_block_length=
(share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH &&
@ -576,7 +582,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
{
char *iext= strrchr(ci->index_file_name, '.');
int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
if (options & HA_OPTION_TMP_TABLE)
if (tmp_table)
{
char *path;
/* chop off the table name, temporary tables use generated name */
@ -597,8 +603,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
/*
Don't create the table if the link or file exists to ensure that one
doesn't accidentally destroy another table.
Don't sync dir now if the data file has the same path.
*/
create_flag=0;
create_flag=
(ci->data_file_name &&
!strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir;
}
else
{
@ -607,8 +616,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
(flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) |
MY_APPEND_EXT);
linkname_ptr=0;
/* Replace the current file */
create_flag=MY_DELETE_OLD;
/*
Replace the current file.
Don't sync dir now if the data file has the same path.
*/
create_flag= MY_DELETE_OLD | (!ci->data_file_name ? 0 : sync_dir);
}
/*
@ -627,7 +639,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
}
if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
MYF(MY_WME | create_flag))) < 0)
MYF(MY_WME|create_flag))) < 0)
goto err;
errpos=1;
@ -653,7 +665,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
char *dext= strrchr(ci->data_file_name, '.');
int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT);
if (options & HA_OPTION_TMP_TABLE)
if (tmp_table)
{
char *path;
/* chop off the table name, temporary tables use generated name */
@ -682,7 +694,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
}
if ((dfile=
my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
MYF(MY_WME | create_flag))) < 0)
MYF(MY_WME | create_flag | sync_dir))) < 0)
goto err;
}
errpos=3;
@ -802,12 +814,18 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0)))
goto err;
if (!tmp_table && my_sync(file, MYF(0)))
goto err;
if (! (flags & HA_DONT_TOUCH_DATA))
{
#ifdef USE_RELOC
if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0)))
goto err;
if (!tmp_table && my_sync(dfile, MYF(0)))
goto err;
#endif
/* if !USE_RELOC, there was no write to the file, no need to sync it */
errpos=2;
if (my_close(dfile,MYF(0)))
goto err;
@ -816,6 +834,19 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs,
pthread_mutex_unlock(&THR_LOCK_maria);
if (my_close(file,MYF(0)))
goto err;
/*
RECOVERYTODO
Write a log record describing the CREATE operation (just the file
names, link names, and the full header's content).
For this record to be of any use for Recovery, we need the upper
MySQL layer to be crash-safe, which it is not now (that would require work
using the ddl_log of sql/sql_table.cc); when it is, we should reconsider
the moment of writing this log record (before or after op, under
THR_LOCK_maria or not...), how to use it in Recovery, and force the log.
For now this record is just informative.
If operation failed earlier, we clean up in "err:" and the MySQL layer
will clean up the frm, so we needn't write anything to the log.
*/
my_free((char*) rec_per_key_part,MYF(0));
DBUG_RETURN(0);
@ -831,14 +862,14 @@ err:
if (! (flags & HA_DONT_TOUCH_DATA))
my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT,
MY_UNPACK_FILENAME | MY_APPEND_EXT),
MYF(0));
MYF(sync_dir));
/* fall through */
case 1:
VOID(my_close(file,MYF(0)));
if (! (flags & HA_DONT_TOUCH_DATA))
my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT,
MY_UNPACK_FILENAME | MY_APPEND_EXT),
MYF(0));
MYF(sync_dir));
}
my_free((char*) rec_per_key_part, MYF(0));
DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */

View file

@ -30,6 +30,7 @@ int maria_delete_all_rows(MARIA_HA *info)
{
DBUG_RETURN(my_errno=EACCES);
}
/* LOCKTODO take X-lock on table here */
if (_ma_readinfo(info,F_WRLCK,1))
DBUG_RETURN(my_errno);
if (_ma_mark_file_changed(info))
@ -53,9 +54,23 @@ int maria_delete_all_rows(MARIA_HA *info)
since it was locked then there may be key blocks in the key cache
*/
flush_key_blocks(share->key_cache, share->kfile, FLUSH_IGNORE_CHANGED);
/*
RECOVERYTODO Log the two chsize and header modifications and force the
log. So that if crash between the two chsize, we finish the work at
Recovery. For this scenario:
"TRUNCATE TABLE t1; DROP TABLE t1; RENAME TABLE t2 to t1; crash;"
Recovery mustn't truncate the new t1, so the log records of TRUNCATE
should be applied only if t1 exists and its ZeroDirtyPagesLSN is smaller
than the records'. See more comments below.
*/
if (my_chsize(info->dfile, 0, 0, MYF(MY_WME)) ||
my_chsize(share->kfile, share->base.keystart, 0, MYF(MY_WME)) )
goto err;
/*
RECOVERYTODO Consider updating ZeroDirtyPagesLSN here. It is
not a necessity (it is one only in RENAME commands) but an optional
optimization which will allow some REDO skipping at Recovery.
*/
VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
#ifdef HAVE_MMAP
/* Resize mmaped area */
@ -63,14 +78,25 @@ int maria_delete_all_rows(MARIA_HA *info)
_ma_remap_file(info, (my_off_t)0);
rw_unlock(&info->s->mmap_lock);
#endif
/*
RECOVERYTODO Until we have the TRUNCATE log record and take it into
account for log-low-water-mark calculation and use it in Recovery, we need
to sync.
*/
if (_ma_sync_table_files(info))
goto err;
allow_break(); /* Allow SIGHUP & SIGINT */
DBUG_RETURN(0);
err:
{
int save_errno=my_errno;
/* RECOVERYTODO log the header modifications */
VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
info->update|=HA_STATE_WRITTEN; /* Buffer changed */
/* RECOVERYTODO until we log above we have to sync */
if (_ma_sync_table_files(info) && !save_errno)
save_errno= my_errno;
allow_break(); /* Allow SIGHUP & SIGINT */
DBUG_RETURN(my_errno=save_errno);
}

View file

@ -31,6 +31,7 @@ int maria_delete_table(const char *name)
#ifdef EXTRA_DEBUG
_ma_check_table_is_closed(name,"delete");
#endif
/* LOCKTODO take X-lock on table here */
#ifdef USE_RAID
{
MARIA_HA *info;
@ -59,12 +60,22 @@ int maria_delete_table(const char *name)
#endif /* USE_RAID */
fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
if (my_delete_with_symlink(from, MYF(MY_WME)))
/*
RECOVERYTODO log the two deletes below.
Then do the file deletions.
For this log record to be of any use for Recovery, we need the upper MySQL
layer to be crash-safe in DDLs; when it is we should reconsider the moment
of writing this log record, how to use it in Recovery, and force the log.
For now this record is only informative.
*/
if (my_delete_with_symlink(from, MYF(MY_WME | MY_SYNC_DIR)))
DBUG_RETURN(my_errno);
fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
#ifdef USE_RAID
if (raid_type)
DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME)) ? my_errno : 0);
DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | MY_SYNC_DIR)) ?
my_errno : 0);
#endif
DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME)) ? my_errno : 0);
DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | MY_SYNC_DIR)) ?
my_errno : 0);
}

View file

@ -316,9 +316,7 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function, void *extra_arg
if (share->not_flushed)
{
share->not_flushed=0;
if (my_sync(share->kfile, MYF(0)))
error= my_errno;
if (my_sync(info->dfile, MYF(0)))
if (_ma_sync_table_files(info))
error= my_errno;
if (error)
{
@ -439,3 +437,10 @@ int maria_reset(MARIA_HA *info)
HA_STATE_PREV_FOUND);
DBUG_RETURN(error);
}
/*
  Force the data file and the index file of a table to disk.

  SYNOPSIS
    _ma_sync_table_files()
    info                 Table handler; info->dfile is the data file
                         descriptor and info->s->kfile the index file
                         descriptor.

  NOTES
    Both files are always synced, even if the first sync fails, so that a
    failure on the data file does not leave the index file unsynced
    (best-effort durability).
    NOTE(review): callers read my_errno after a non-zero return; this
    presumably relies on my_sync() setting my_errno on failure and leaving
    it untouched on success - confirm against mysys/my_sync.c.

  RETURN
    0      both syncs succeeded
    != 0   at least one sync failed
*/

int _ma_sync_table_files(const MARIA_HA *info)
{
  int data_file_error= my_sync(info->dfile, MYF(0));
  int index_file_error= my_sync(info->s->kfile, MYF(0));
  return (data_file_error || index_file_error);
}

View file

@ -103,9 +103,7 @@ int maria_lock_database(MARIA_HA *info, int lock_type)
share->changed=0;
if (maria_flush)
{
if (my_sync(share->kfile, MYF(0)))
error= my_errno;
if (my_sync(info->dfile, MYF(0)))
if (_ma_sync_table_files(info))
error= my_errno;
}
else

View file

@ -23,6 +23,7 @@
int maria_rename(const char *old_name, const char *new_name)
{
char from[FN_REFLEN],to[FN_REFLEN];
int data_file_rename_error;
#ifdef USE_RAID
uint raid_type=0,raid_chunks=0;
#endif
@ -32,6 +33,7 @@ int maria_rename(const char *old_name, const char *new_name)
_ma_check_table_is_closed(old_name,"rename old_table");
_ma_check_table_is_closed(new_name,"rename new table2");
#endif
/* LOCKTODO take X-lock on table here */
#ifdef USE_RAID
{
MARIA_HA *info;
@ -48,14 +50,40 @@ int maria_rename(const char *old_name, const char *new_name)
fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
if (my_rename_with_symlink(from, to, MYF(MY_WME)))
/*
RECOVERYTODO log the two renames below. Update
ZeroDirtyPagesLSN of the table on disk (=> sync the files), this is
needed so that Recovery does not pick a wrong table.
Then do the file renames.
For this log record to be of any use for Recovery, we need the upper MySQL
layer to be crash-safe in DDLs; when it is we should reconsider the moment
of writing this log record, how to use it in Recovery, and force the log.
For now this record is only informative. But ZeroDirtyPagesLSN is
critically needed!
*/
if (my_rename_with_symlink(from, to, MYF(MY_WME | MY_SYNC_DIR)))
DBUG_RETURN(my_errno);
fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
#ifdef USE_RAID
if (raid_type)
DBUG_RETURN(my_raid_rename(from, to, raid_chunks, MYF(MY_WME)) ? my_errno :
0);
data_file_rename_error= my_raid_rename(from, to, raid_chunks,
MYF(MY_WME | MY_SYNC_DIR));
else
#endif
DBUG_RETURN(my_rename_with_symlink(from, to,MYF(MY_WME)) ? my_errno : 0);
data_file_rename_error=
my_rename_with_symlink(from, to, MYF(MY_WME | MY_SYNC_DIR));
if (data_file_rename_error)
{
/*
now we have a renamed index file and a non-renamed data file, try to
undo the rename of the index file.
*/
data_file_rename_error= my_errno;
fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
my_rename_with_symlink(to, from, MYF(MY_WME | MY_SYNC_DIR));
}
DBUG_RETURN(data_file_rename_error);
}

View file

@ -65,6 +65,10 @@ int main(int argc,char **argv)
MY_INIT(argv[0]);
get_options(argc,argv);
fprintf(stderr, "WARNING! this program is to test 'external locking'"
" (when several processes share a table through file locking)"
" which is not supported by Maria at all; expect errors."
" We may soon remove this program.\n");
maria_init();
bzero((char*) keyinfo,sizeof(keyinfo));
bzero((char*) recinfo,sizeof(recinfo));

View file

@ -741,3 +741,4 @@ int _ma_flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file);
int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param);
int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
ulong);
int _ma_sync_table_files(const MARIA_HA *info);