diff --git a/mysys/my_symlink.c b/mysys/my_symlink.c index 7be3fcd36f0..b3d68992578 100644 --- a/mysys/my_symlink.c +++ b/mysys/my_symlink.c @@ -85,6 +85,8 @@ int my_symlink(const char *content, const char *linkname, myf MyFlags) if (MyFlags & MY_WME) my_error(EE_CANT_SYMLINK, MYF(0), linkname, content, errno); } + else if ((MyFlags & MY_SYNC_DIR) && my_sync_dir_by_file(linkname, MyFlags)) + result= -1; DBUG_RETURN(result); #endif /* HAVE_READLINK */ } diff --git a/mysys/my_sync.c b/mysys/my_sync.c index ada2ea84414..26bee5a293f 100644 --- a/mysys/my_sync.c +++ b/mysys/my_sync.c @@ -52,7 +52,7 @@ int my_sync(File fd, myf my_flags) #if defined(F_FULLFSYNC) /* In Mac OS X >= 10.3 this call is safer than fsync() (it forces the - disk's cache). + disk's cache and guarantees ordered writes). */ if (!(res= fcntl(fd, F_FULLFSYNC, 0))) break; /* ok */ @@ -89,6 +89,7 @@ int my_sync(File fd, myf my_flags) } /* my_sync */ +static const char cur_dir_name[]= {FN_CURLIB, 0}; /* Force directory information to disk. @@ -107,11 +108,14 @@ int my_sync_dir(const char *dir_name, myf my_flags) DBUG_PRINT("my",("Dir: '%s' my_flags: %d", dir_name, my_flags)); File dir_fd; int res= 0; + const char *correct_dir_name; + /* Sometimes the path does not contain an explicit directory */ + correct_dir_name= (dir_name[0] == 0) ? cur_dir_name : dir_name; /* Syncing a dir may give EINVAL on tmpfs on Linux, which is ok. EIO on the other hand is very important. Hence MY_IGNORE_BADFD. 
*/ - if ((dir_fd= my_open(dir_name, O_RDONLY, MYF(my_flags))) >= 0) + if ((dir_fd= my_open(correct_dir_name, O_RDONLY, MYF(my_flags))) >= 0) { if (my_sync(dir_fd, MYF(my_flags | MY_IGNORE_BADFD))) res= 2; diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c index 5926bba9406..76942e3d5e8 100644 --- a/storage/maria/ma_create.c +++ b/storage/maria/ma_create.c @@ -60,6 +60,8 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, ulong *rec_per_key_part; my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MARIA_MAX_KEY_BLOCK_SIZE]; MARIA_CREATE_INFO tmp_create_info; + my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */ + myf sync_dir= MY_SYNC_DIR; DBUG_ENTER("maria_create"); DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u", keys, columns, uniques, flags)); @@ -560,7 +562,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, /* max_data_file_length and max_key_file_length are recalculated on open */ if (options & HA_OPTION_TMP_TABLE) + { + tmp_table= TRUE; + sync_dir= 0; share.base.max_data_file_length=(my_off_t) ci->data_file_length; + } share.base.min_block_length= (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH && @@ -576,7 +582,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, { char *iext= strrchr(ci->index_file_name, '.'); int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT); - if (options & HA_OPTION_TMP_TABLE) + if (tmp_table) { char *path; /* chop off the table name, tempory tables use generated name */ @@ -597,8 +603,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, /* Don't create the table if the link or file exists to ensure that one doesn't accidently destroy another table. + Don't sync dir now if the data file has the same path. */ - create_flag=0; + create_flag= + (ci->data_file_name && + !strcmp(ci->index_file_name, ci->data_file_name)) ? 
0 : sync_dir; } else { @@ -607,8 +616,11 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, (flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) | MY_APPEND_EXT); linkname_ptr=0; - /* Replace the current file */ - create_flag=MY_DELETE_OLD; + /* + Replace the current file. + Don't sync dir now if the data file has the same path. + */ + create_flag= MY_DELETE_OLD | (!ci->data_file_name ? 0 : sync_dir); } /* @@ -627,7 +639,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, } if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode, - MYF(MY_WME | create_flag))) < 0) + MYF(MY_WME|create_flag))) < 0) goto err; errpos=1; @@ -653,7 +665,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, char *dext= strrchr(ci->data_file_name, '.'); int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT); - if (options & HA_OPTION_TMP_TABLE) + if (tmp_table) { char *path; /* chop off the table name, tempory tables use generated name */ @@ -682,7 +694,7 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, } if ((dfile= my_create_with_symlink(linkname_ptr, filename, 0, create_mode, - MYF(MY_WME | create_flag))) < 0) + MYF(MY_WME | create_flag | sync_dir))) < 0) goto err; } errpos=3; @@ -802,12 +814,18 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0))) goto err; + if (!tmp_table && my_sync(file, MYF(0))) + goto err; + if (! 
(flags & HA_DONT_TOUCH_DATA)) { #ifdef USE_RELOC if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0))) goto err; + if (!tmp_table && my_sync(dfile, MYF(0))) + goto err; #endif + /* if !USE_RELOC, there was no write to the file, no need to sync it */ errpos=2; if (my_close(dfile,MYF(0))) goto err; @@ -816,6 +834,19 @@ int maria_create(const char *name,uint keys,MARIA_KEYDEF *keydefs, pthread_mutex_unlock(&THR_LOCK_maria); if (my_close(file,MYF(0))) goto err; + /* + RECOVERYTODO + Write a log record describing the CREATE operation (just the file + names, link names, and the full header's content). + For this record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe, which it is not now (that would require work + using the ddl_log of sql/sql_table.cc); when it is, we should reconsider + the moment of writing this log record (before or after op, under + THR_LOCK_maria or not...), how to use it in Recovery, and force the log. + For now this record is just informative. + If operation failed earlier, we clean up in "err:" and the MySQL layer + will clean up the frm, so we needn't write anything to the log. + */ my_free((char*) rec_per_key_part,MYF(0)); DBUG_RETURN(0); @@ -831,14 +862,14 @@ err: if (! (flags & HA_DONT_TOUCH_DATA)) my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT, MY_UNPACK_FILENAME | MY_APPEND_EXT), - MYF(0)); + MYF(sync_dir)); /* fall through */ case 1: VOID(my_close(file,MYF(0))); if (! 
(flags & HA_DONT_TOUCH_DATA)) my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT, MY_UNPACK_FILENAME | MY_APPEND_EXT), - MYF(0)); + MYF(sync_dir)); } my_free((char*) rec_per_key_part, MYF(0)); DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */ diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c index b16d82ed9f7..fccd29b15f1 100644 --- a/storage/maria/ma_delete_all.c +++ b/storage/maria/ma_delete_all.c @@ -30,6 +30,7 @@ int maria_delete_all_rows(MARIA_HA *info) { DBUG_RETURN(my_errno=EACCES); } + /* LOCKTODO take X-lock on table here */ if (_ma_readinfo(info,F_WRLCK,1)) DBUG_RETURN(my_errno); if (_ma_mark_file_changed(info)) @@ -53,9 +54,23 @@ int maria_delete_all_rows(MARIA_HA *info) since it was locked then there may be key blocks in the key cache */ flush_key_blocks(share->key_cache, share->kfile, FLUSH_IGNORE_CHANGED); + /* + RECOVERYTODO Log the two chsize and header modifications and force the + log. So that if crash between the two chsize, we finish the work at + Recovery. For this scenario: + "TRUNCATE TABLE t1; DROP TABLE t1; RENAME TABLE t2 to t1; crash;" + Recovery mustn't truncate the new t1, so the log records of TRUNCATE + should be applied only if t1 exists and its ZeroDirtyPagesLSN is smaller + than the records'. See more comments below. + */ if (my_chsize(info->dfile, 0, 0, MYF(MY_WME)) || my_chsize(share->kfile, share->base.keystart, 0, MYF(MY_WME)) ) goto err; + /* + RECOVERYTODO Consider updating ZeroDirtyPagesLSN here. It is + not a necessity (it is one only in RENAME commands) but an optional + optimization which will allow some REDO skipping at Recovery. 
+ */ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); #ifdef HAVE_MMAP /* Resize mmaped area */ @@ -63,14 +78,25 @@ int maria_delete_all_rows(MARIA_HA *info) _ma_remap_file(info, (my_off_t)0); rw_unlock(&info->s->mmap_lock); #endif + /* + RECOVERYTODO Until we have the TRUNCATE log record and take it into + account for log-low-water-mark calculation and use it in Recovery, we need + to sync. + */ + if (_ma_sync_table_files(info)) + goto err; allow_break(); /* Allow SIGHUP & SIGINT */ DBUG_RETURN(0); err: { int save_errno=my_errno; + /* RECOVERYTODO log the header modifications */ VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); info->update|=HA_STATE_WRITTEN; /* Buffer changed */ + /* RECOVERYTODO until we log above we have to sync */ + if (_ma_sync_table_files(info) && !save_errno) + save_errno= my_errno; allow_break(); /* Allow SIGHUP & SIGINT */ DBUG_RETURN(my_errno=save_errno); } diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c index dd781a93fc4..5c7b4337b20 100644 --- a/storage/maria/ma_delete_table.c +++ b/storage/maria/ma_delete_table.c @@ -31,6 +31,7 @@ int maria_delete_table(const char *name) #ifdef EXTRA_DEBUG _ma_check_table_is_closed(name,"delete"); #endif + /* LOCKTODO take X-lock on table here */ #ifdef USE_RAID { MARIA_HA *info; @@ -59,12 +60,22 @@ int maria_delete_table(const char *name) #endif /* USE_RAID */ fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); - if (my_delete_with_symlink(from, MYF(MY_WME))) + /* + RECOVERYTODO log the two deletes below. + Then do the file deletions. + For this log record to be of any use for Recovery, we need the upper MySQL + layer to be crash-safe in DDLs; when it is we should reconsider the moment + of writing this log record, how to use it in Recovery, and force the log. + For now this record is only informative. 
+ */ + if (my_delete_with_symlink(from, MYF(MY_WME | MY_SYNC_DIR))) DBUG_RETURN(my_errno); fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); #ifdef USE_RAID if (raid_type) - DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME)) ? my_errno : 0); + DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | MY_SYNC_DIR)) ? + my_errno : 0); #endif - DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME)) ? my_errno : 0); + DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | MY_SYNC_DIR)) ? + my_errno : 0); } diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c index 57e540242b9..1f649a00753 100644 --- a/storage/maria/ma_extra.c +++ b/storage/maria/ma_extra.c @@ -316,9 +316,7 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function, void *extra_arg if (share->not_flushed) { share->not_flushed=0; - if (my_sync(share->kfile, MYF(0))) - error= my_errno; - if (my_sync(info->dfile, MYF(0))) + if (_ma_sync_table_files(info)) error= my_errno; if (error) { @@ -439,3 +437,10 @@ int maria_reset(MARIA_HA *info) HA_STATE_PREV_FOUND); DBUG_RETURN(error); } + + +int _ma_sync_table_files(const MARIA_HA *info) +{ + return (my_sync(info->dfile, MYF(0)) || + my_sync(info->s->kfile, MYF(0))); +} diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c index 5689d57f2a5..848fb7e9682 100644 --- a/storage/maria/ma_locking.c +++ b/storage/maria/ma_locking.c @@ -103,9 +103,7 @@ int maria_lock_database(MARIA_HA *info, int lock_type) share->changed=0; if (maria_flush) { - if (my_sync(share->kfile, MYF(0))) - error= my_errno; - if (my_sync(info->dfile, MYF(0))) + if (_ma_sync_table_files(info)) error= my_errno; } else diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c index 5f65cd2b213..5d89cc063d7 100644 --- a/storage/maria/ma_rename.c +++ b/storage/maria/ma_rename.c @@ -23,6 +23,7 @@ int maria_rename(const char *old_name, const char *new_name) { char from[FN_REFLEN],to[FN_REFLEN]; + int data_file_rename_error; #ifdef 
USE_RAID uint raid_type=0,raid_chunks=0; #endif @@ -32,6 +33,7 @@ int maria_rename(const char *old_name, const char *new_name) _ma_check_table_is_closed(old_name,"rename old_table"); _ma_check_table_is_closed(new_name,"rename new table2"); #endif + /* LOCKTODO take X-lock on table here */ #ifdef USE_RAID { MARIA_HA *info; @@ -48,14 +50,40 @@ int maria_rename(const char *old_name, const char *new_name) fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); - if (my_rename_with_symlink(from, to, MYF(MY_WME))) + /* + RECOVERYTODO log the two renames below. Update + ZeroDirtyPagesLSN of the table on disk (=> sync the files), this is + needed so that Recovery does not pick a wrong table. + Then do the file renames. + For this log record to be of any use for Recovery, we need the upper MySQL + layer to be crash-safe in DDLs; when it is we should reconsider the moment + of writing this log record, how to use it in Recovery, and force the log. + For now this record is only informative. But ZeroDirtyPagesLSN is + critically needed! + */ + if (my_rename_with_symlink(from, to, MYF(MY_WME | MY_SYNC_DIR))) DBUG_RETURN(my_errno); fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); #ifdef USE_RAID if (raid_type) - DBUG_RETURN(my_raid_rename(from, to, raid_chunks, MYF(MY_WME)) ? my_errno : - 0); + data_file_rename_error= my_raid_rename(from, to, raid_chunks, + MYF(MY_WME | MY_SYNC_DIR)); + else #endif - DBUG_RETURN(my_rename_with_symlink(from, to,MYF(MY_WME)) ? my_errno : 0); + data_file_rename_error= + my_rename_with_symlink(from, to, MYF(MY_WME | MY_SYNC_DIR)); + if (data_file_rename_error) + { + /* + now we have a renamed index file and a non-renamed data file, try to + undo the rename of the index file. 
+ */ + data_file_rename_error= my_errno; + fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + my_rename_with_symlink(to, from, MYF(MY_WME | MY_SYNC_DIR)); + } + DBUG_RETURN(data_file_rename_error); + } diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c index 96b896b03c6..2f205c33b12 100644 --- a/storage/maria/ma_test3.c +++ b/storage/maria/ma_test3.c @@ -65,6 +65,10 @@ int main(int argc,char **argv) MY_INIT(argv[0]); get_options(argc,argv); + fprintf(stderr, "WARNING! this program is to test 'external locking'" + " (when several processes share a table through file locking)" + " which is not supported by Maria at all; expect errors." + " We may soon remove this program.\n"); maria_init(); bzero((char*) keyinfo,sizeof(keyinfo)); bzero((char*) recinfo,sizeof(recinfo)); diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h index 506bdbc71ca..62c50187888 100644 --- a/storage/maria/maria_def.h +++ b/storage/maria/maria_def.h @@ -741,3 +741,4 @@ int _ma_flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file); int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param); int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, ulong); +int _ma_sync_table_files(const MARIA_HA *info);