MDEV-29015/MDEV-29260/MDEV-34938: os_file_get_size() WSL work-around

When MariaDB Server is run in a container under
Windows Subsystem for Linux, the fstat(2) system calls that InnoDB
invokes in os_file_set_size() or os_file_get_size() fail if the file
was renamed earlier while its file handle was open. This affects at
least ALTER TABLE and OPTIMIZE TABLE.

os_file_get_size(): Invoke lseek(2) instead of fstat(2). We do not mind
if the file pointer is moving to the end of the file, because InnoDB
exclusively invokes positioned reads and writes, or in some rare cases,
appends to an existing file.

os_file_set_size(): Invoke os_file_get_size() instead of fstat(2).
Define the POSIX and Windows versions separately. Formerly, the
Windows version was called os_file_change_size_win32().

fil_node_t::read_page0(): Use os_file_get_size() to determine the
size, and do not crash on error.

fil_node_t::read_metadata(): Remove the non-Windows stat* parameter
and always invoke fstat(2) outside Windows, but do tolerate errors.
Because fstat(2) is more likely to fail than lseek(2), and this is
not time critical code, we can afford the extra lseek(2) system call.

Reviewed by: Vladislav Vaintroub
This commit is contained in:
Marko Mäkelä 2024-10-24 16:08:56 +03:00
parent 3cd706b107
commit decdd4bf49
4 changed files with 159 additions and 265 deletions

View file

@ -352,7 +352,7 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
this->size += size;
UT_LIST_ADD_LAST(chain, node);
if (node->is_open()) {
node->find_metadata(node->handle);
node->find_metadata();
n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
if (++fil_system.n_open >= srv_max_n_open_files) {
reacquire();
@ -1197,7 +1197,7 @@ err_exit:
if (create_new_db)
{
node->find_metadata(node->handle);
node->find_metadata();
continue;
}
if (skip_read)

View file

@ -1060,17 +1060,12 @@ struct fil_node_t final
return(handle != OS_FILE_CLOSED);
}
/** Read the first page of a data file.
@return whether the page was found valid */
bool read_page0();
/** Read the first page of a data file.
@return whether the page was found valid */
bool read_page0() noexcept;
/** Determine some file metadata when creating or reading the file.
@param file the file that is being created, or OS_FILE_CLOSED */
void find_metadata(os_file_t file = OS_FILE_CLOSED
#ifndef _WIN32
, struct stat* statbuf = NULL
#endif
);
/** Determine some file metadata when creating or reading the file. */
void find_metadata() noexcept;
/** Close the file handle. */
void close();

View file

@ -905,35 +905,13 @@ os_file_get_size(
const char* filename)
MY_ATTRIBUTE((warn_unused_result));
/** Gets a file size.
@param[in] file handle to a file
@return file size, or (os_offset_t) -1 on failure */
os_offset_t
os_file_get_size(
os_file_t file)
MY_ATTRIBUTE((warn_unused_result));
/** Extend a file.
On Windows, extending a file allocates blocks for the file,
unless the file is sparse.
On Unix, we will extend the file with ftruncate(), if
file needs to be sparse. Otherwise posix_fallocate() is used
when available, and if not, binary zeroes are added to the end
of file.
@param[in] name file name
@param[in] file file handle
@param[in] size desired file size
@param[in] sparse whether to create a sparse file (no preallocating)
@return whether the operation succeeded */
bool
os_file_set_size(
const char* name,
os_file_t file,
os_offset_t size,
bool is_sparse = false)
/** Determine the logical size of a file.
This may change the current write position of the file to the end of the file.
(Not currently a problem; InnoDB typically uses positioned I/O.)
@param file handle to an open file
@return file size, in octets
@retval -1 on failure */
os_offset_t os_file_get_size(os_file_t file) noexcept
MY_ATTRIBUTE((warn_unused_result));
/** Truncates a file at its current position.
@ -1189,11 +1167,25 @@ If file is normal, file system allocates storage.
@param[in] size size to preserve in bytes
@return true if success */
bool
os_file_change_size_win32(
os_file_set_size(
const char* pathname,
os_file_t file,
os_offset_t size);
/** Four-argument convenience overload of os_file_set_size().
The trailing bool argument is accepted but ignored; the call is simply
forwarded to the three-argument os_file_set_size() declared above.
@param[in]	name	file name
@param[in]	file	file handle
@param[in]	size	desired file size in bytes
@return whether the operation succeeded */
inline bool
os_file_set_size(const char* name, os_file_t file, os_offset_t size, bool)
{
return os_file_set_size(name, file, size);
}
#else
/** Extend a file by appending NUL.
@param[in] name file name
@param[in] file file handle
@param[in] size desired file size
@param[in] sparse whether to create a sparse file with ftruncate()
@return whether the operation succeeded */
bool os_file_set_size(const char *name, os_file_t file, os_offset_t size,
bool is_sparse= false) noexcept;
#endif /*_WIN32 */
/** Free storage space associated with a section of the file.

View file

@ -46,9 +46,6 @@ Created 10/21/1995 Heikki Tuuri
#include "srv0start.h"
#include "fil0fil.h"
#include "fsp0fsp.h"
#ifdef HAVE_LINUX_UNISTD_H
#include "unistd.h"
#endif
#include "os0event.h"
#include "os0thread.h"
@ -74,10 +71,12 @@ Created 10/21/1995 Heikki Tuuri
#ifdef _WIN32
#include <winioctl.h>
#else
#include <unistd.h>
// my_test_if_atomic_write()
#include <my_sys.h>
#endif
#include "log.h"
#include "buf0dblwr.h"
#include <thread>
@ -1505,16 +1504,9 @@ bool os_file_close_func(os_file_t file)
return false;
}
/** Gets a file size.
@param[in] file handle to an open file
@return file size, or (os_offset_t) -1 on failure */
os_offset_t
os_file_get_size(os_file_t file)
os_offset_t os_file_get_size(os_file_t file) noexcept
{
struct stat statbuf;
if (fstat(file, &statbuf)) return os_offset_t(-1);
MSAN_STAT_WORKAROUND(&statbuf);
return statbuf.st_size;
return lseek(file, 0, SEEK_END);
}
/** Gets a file size.
@ -1653,6 +1645,110 @@ os_file_set_eof(
return(!ftruncate(fileno(file), ftell(file)));
}
/** Extend a file to at least the requested size (POSIX implementation).
If is_sparse holds, the file is resized with ftruncate() and nothing is
preallocated. Otherwise fallocate() or posix_fallocate() is used when
available; if preallocation is unavailable or reports EINVAL or
EOPNOTSUPP, the missing part of the file is filled by writing zeroes
and the file is flushed.
@param[in]	name		file name (used for diagnostics and writes)
@param[in]	file		file handle
@param[in]	size		desired file size; must be a multiple of 4096
@param[in]	is_sparse	whether to create a sparse file with ftruncate()
@return whether the operation succeeded
@retval false if the current file size could not be determined */
bool os_file_set_size(const char *name, os_file_t file, os_offset_t size,
		      bool is_sparse) noexcept
{
	ut_ad(!(size & 4095));

	if (is_sparse) {
		bool success = !ftruncate(file, size);
		if (!success) {
			sql_print_error("InnoDB: ftruncate of file %s"
					" to %llu bytes failed with error %d",
					name, size, errno);
		}
		return success;
	}

# ifdef HAVE_POSIX_FALLOCATE
	int err;
	os_offset_t current_size;
	do {
		current_size = os_file_get_size(file);
		if (current_size == os_offset_t(-1)) {
			err = errno;
		} else {
			if (current_size >= size) {
				return true;
			}
			/* Start the allocation at a 4096-byte boundary. */
			current_size &= ~4095ULL;
#  ifdef __linux__
			if (!fallocate(file, 0, current_size,
				       size - current_size)) {
				err = 0;
				break;
			}
			/* fallocate() reports the error via errno */
			err = errno;
#  else
			/* posix_fallocate() returns the error number */
			err = posix_fallocate(file, current_size,
					      size - current_size);
#  endif
		}
	} while (err == EINTR
		 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);

	switch (err) {
	case 0:
		return true;
	default:
		sql_print_error("InnoDB: preallocating %llu"
				" bytes for file %s failed with error %d",
				size, name, err);
		/* fall through */
	case EINTR:
		errno = err;
		return false;
	case EINVAL:
	case EOPNOTSUPP:
		/* fall back to the code below */
		break;
	}
# else /* HAVE_POSIX_FALLOCATE */
	os_offset_t current_size = os_file_get_size(file);
# endif /* HAVE_POSIX_FALLOCATE */

	/* os_file_get_size() returns os_offset_t(-1) on failure. Without
	this check, the masked value would compare as >= size and the
	function would wrongly report success without extending the file. */
	if (current_size == os_offset_t(-1)) {
		return false;
	}

	current_size &= ~4095ULL;

	if (current_size >= size) {
		return true;
	}

	/* Write up to 1 megabyte at a time. */
	ulint buf_size = std::min<ulint>(64,
					 ulint(size >> srv_page_size_shift))
		<< srv_page_size_shift;

	/* Align the buffer for possible raw i/o */
	byte* buf = static_cast<byte*>(aligned_malloc(buf_size,
						       srv_page_size));
	/* Write buffer full of zeros */
	memset(buf, 0, buf_size);

	/* Stop early if a shutdown has progressed past its initial phase;
	in that case current_size < size and false is returned below. */
	while (current_size < size
	       && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
		ulint n_bytes;

		if (size - current_size < (os_offset_t) buf_size) {
			n_bytes = (ulint) (size - current_size);
		} else {
			n_bytes = buf_size;
		}

		if (os_file_write(IORequestWrite, name,
				  file, buf, current_size, n_bytes) !=
		    DB_SUCCESS) {
			break;
		}

		current_size += n_bytes;
	}

	aligned_free(buf);

	return current_size >= size && os_file_flush(file);
}
#else /* !_WIN32 */
#include <WinIoCtl.h>
@ -2556,21 +2652,12 @@ bool os_file_close_func(os_file_t file)
return true;
}
/** Gets a file size.
@param[in] file Handle to a file
@return file size, or (os_offset_t) -1 on failure */
os_offset_t
os_file_get_size(
os_file_t file)
os_offset_t os_file_get_size(os_file_t file) noexcept
{
DWORD high;
DWORD low = GetFileSize(file, &high);
if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
return((os_offset_t) -1);
}
return(os_offset_t(low | (os_offset_t(high) << 32)));
DWORD high, low= GetFileSize(file, &high);
if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR)
return os_offset_t(-1);
return os_offset_t{low} | os_offset_t{high} << 32;
}
/** Gets a file size.
@ -2712,24 +2799,8 @@ bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
}
/**
Change file size on Windows.
If file is extended, the bytes between old and new EOF
are zeros.
If file is sparse, "virtual" block is added at the end of
allocated area.
If file is normal, file system allocates storage.
@param[in] pathname file path
@param[in] file file handle
@param[in] size size to preserve in bytes
@return true if success */
bool
os_file_change_size_win32(
os_file_set_size(
const char* pathname,
os_file_t file,
os_offset_t size)
@ -3184,149 +3255,6 @@ IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
#endif /* _WIN32 */
}
/** Extend a file.
On Windows, the call is delegated to os_file_change_size_win32().
On other systems, the file is extended with ftruncate() if it
needs to be sparse. Otherwise fallocate() or posix_fallocate() is
used when available, and if that is unavailable or reports EINVAL
or EOPNOTSUPP, binary zeroes are written to the end of the file.
@param[in]	name		file name
@param[in]	file		file handle
@param[in]	size		desired file size (multiple of 4096)
@param[in]	is_sparse	whether to create a sparse file (no preallocating)
@return whether the operation succeeded */
bool
os_file_set_size(
const char* name,
os_file_t file,
os_offset_t size,
bool is_sparse)
{
ut_ad(!(size & 4095));
#ifdef _WIN32
/* On Windows, changing file size works well and as expected for both
sparse and normal files. This return is unconditional, so the rest of
the function body is not reached on Windows. */
return os_file_change_size_win32(name, file, size);
#else
struct stat statbuf;
if (is_sparse) {
/* A sparse file is only resized; no blocks are preallocated. */
bool success = !ftruncate(file, size);
if (!success) {
ib::error() << "ftruncate of file " << name << " to "
<< size << " bytes failed with error "
<< errno;
}
return(success);
}
# ifdef HAVE_POSIX_FALLOCATE
int err;
/* Retry the preallocation for as long as it is interrupted (EINTR)
and the shutdown state is at most SRV_SHUTDOWN_INITIATED. */
do {
if (fstat(file, &statbuf)) {
err = errno;
} else {
MSAN_STAT_WORKAROUND(&statbuf);
os_offset_t current_size = statbuf.st_size;
if (current_size >= size) {
return true;
}
/* Start the allocation at a 4096-byte boundary. */
current_size &= ~4095ULL;
# ifdef __linux__
if (!fallocate(file, 0, current_size,
size - current_size)) {
err = 0;
break;
}
/* fallocate() reports the error via errno */
err = errno;
# else
/* posix_fallocate() returns the error number */
err = posix_fallocate(file, current_size,
size - current_size);
# endif
}
} while (err == EINTR
&& srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
switch (err) {
case 0:
return true;
default:
ib::error() << "preallocating "
<< size << " bytes for file " << name
<< " failed with error " << err;
/* fall through */
case EINTR:
/* Restore errno for the caller before reporting failure. */
errno = err;
return false;
case EINVAL:
case EOPNOTSUPP:
/* fall back to the code below */
break;
}
# endif /* HAVE_POSIX_FALLOCATE */
#endif /* _WIN32*/
#ifdef _WIN32
/* NOTE(review): the unconditional return in the first _WIN32 section
above means this branch is compiled but never executed. */
os_offset_t current_size = os_file_get_size(file);
FILE_STORAGE_INFO info;
if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
sizeof info)) {
if (info.LogicalBytesPerSector) {
current_size &= ~os_offset_t(info.LogicalBytesPerSector
- 1);
}
}
#else
/* Fallback: determine the current size, rounded down to a multiple
of 4096 bytes, and fill the remainder with zeroes. */
if (fstat(file, &statbuf)) {
return false;
}
os_offset_t current_size = statbuf.st_size & ~4095ULL;
#endif
if (current_size >= size) {
return true;
}
/* Write up to 1 megabyte at a time. */
ulint buf_size = ut_min(ulint(64),
ulint(size >> srv_page_size_shift))
<< srv_page_size_shift;
/* Align the buffer for possible raw i/o */
byte* buf = static_cast<byte*>(aligned_malloc(buf_size,
srv_page_size));
/* Write buffer full of zeros */
memset(buf, 0, buf_size);
/* Append zero-filled chunks until the target size is reached, a write
fails, or the shutdown state exceeds SRV_SHUTDOWN_INITIATED. */
while (current_size < size
&& srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
ulint n_bytes;
/* The last chunk may be shorter than the buffer. */
if (size - current_size < (os_offset_t) buf_size) {
n_bytes = (ulint) (size - current_size);
} else {
n_bytes = buf_size;
}
if (os_file_write(IORequestWrite, name,
file, buf, current_size, n_bytes) !=
DB_SUCCESS) {
break;
}
current_size += n_bytes;
}
aligned_free(buf);
/* Success requires that the full size was written and that the
flush succeeded. */
return(current_size >= size && os_file_flush(file));
}
/** Truncate a file to a specified size in bytes.
@param[in] pathname file path
@param[in] file file to be truncated
@ -3351,7 +3279,7 @@ os_file_truncate(
}
#ifdef _WIN32
return(os_file_change_size_win32(pathname, file, size));
return os_file_set_size(pathname, file, size);
#else /* _WIN32 */
return(os_file_truncate_posix(pathname, file, size));
#endif /* _WIN32 */
@ -4096,18 +4024,10 @@ static bool is_file_on_ssd(HANDLE handle, char *file_path)
#endif
/** Determine some file metadata when creating or reading the file.
@param file the file that is being created, or OS_FILE_CLOSED */
void fil_node_t::find_metadata(os_file_t file
#ifndef _WIN32
, struct stat* statbuf
#endif
)
void fil_node_t::find_metadata() noexcept
{
if (file == OS_FILE_CLOSED) {
file = handle;
ut_ad(is_open());
}
ut_ad(is_open());
os_file_t file = handle;
#ifdef _WIN32 /* FIXME: make this unconditional */
if (space->punch_hole) {
@ -4139,20 +4059,17 @@ void fil_node_t::find_metadata(os_file_t file
block_size = 512;
}
#else
struct stat sbuf;
if (!statbuf && !fstat(file, &sbuf)) {
MSAN_STAT_WORKAROUND(&sbuf);
statbuf = &sbuf;
}
if (statbuf) {
block_size = statbuf->st_blksize;
}
on_ssd = space->atomic_write_supported
on_ssd = space->atomic_write_supported;
struct stat statbuf;
if (!fstat(file, &statbuf)) {
MSAN_STAT_WORKAROUND(&statbuf);
block_size = statbuf.st_blksize;
# ifdef __linux__
|| (statbuf && fil_system.is_ssd(statbuf->st_dev))
on_ssd = on_ssd || fil_system.is_ssd(statbuf.st_dev);
# endif
;
}
#endif
if (!space->atomic_write_supported) {
space->atomic_write_supported = atomic_write
&& srv_use_atomic_writes
@ -4176,21 +4093,14 @@ void fil_node_t::find_metadata(os_file_t file
/** Read the first page of a data file.
@return whether the page was found valid */
bool fil_node_t::read_page0()
bool fil_node_t::read_page0() noexcept
{
ut_ad(mutex_own(&fil_system.mutex));
const unsigned psize = space->physical_size();
#ifndef _WIN32
struct stat statbuf;
if (fstat(handle, &statbuf)) {
os_offset_t size_bytes = os_file_get_size(handle);
if (size_bytes == os_offset_t(-1)) {
return false;
}
MSAN_STAT_WORKAROUND(&statbuf);
os_offset_t size_bytes = statbuf.st_size;
#else
os_offset_t size_bytes = os_file_get_size(handle);
ut_a(size_bytes != (os_offset_t) -1);
#endif
const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
if (size_bytes < min_size) {
@ -4258,11 +4168,8 @@ invalid:
return false;
}
#ifdef __linux__
find_metadata(handle, &statbuf);
#else
find_metadata();
#endif
/* Truncate the size to a multiple of extent size. */
ulint mask = psize * FSP_EXTENT_SIZE - 1;