mariadb/mysys/my_virtual_mem.c
Marko Mäkelä 56e0be34bc MDEV-36780: InnoDB buffer pool reserves all assigned memory
In commit b6923420f3 (MDEV-29445)
we started to specify the MAP_POPULATE flag for allocating the
InnoDB buffer pool. This would cause a lot of time to be spent
on __mm_populate() inside the Linux kernel, such as 16 seconds
to pre-fault or commit innodb_buffer_pool_size=64G.

Let us revert to the previous way of allocating the buffer pool
at startup. Note: An attempt to increase the buffer pool size by
SET GLOBAL innodb_buffer_pool_size (up to innodb_buffer_pool_size_max)
will invoke my_virtual_mem_commit(), which will use MAP_POPULATE
to zero-fill and prefault the requested additional memory area, blocking
buf_pool.mutex.

Before MDEV-29445 we allocated the InnoDB buffer pool by invoking
mmap(2) once (via my_large_malloc()). After the change, we would
invoke mmap(2) twice, first via my_virtual_mem_reserve() and then
via my_virtual_mem_commit(). Outside Microsoft Windows, we are
reverting back to my_large_malloc() like allocation.

my_virtual_mem_reserve(): Define only for Microsoft Windows.
Other platforms should invoke my_large_virtual_alloc() and
update_malloc_size() instead of my_virtual_mem_reserve() and
my_virtual_mem_commit().

my_large_virtual_alloc(): Define only outside Microsoft Windows.
Do not specify MAP_NORESERVE nor MAP_POPULATE, to preserve compatibility
with my_large_malloc(). Were MAP_POPULATE specified, the mmap()
system call would be significantly slower, for example 18 seconds
to reserve 64 GiB upfront.
2025-05-13 12:27:42 +03:00

201 lines
5.7 KiB
C

/* Copyright (c) 2025, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#include <my_global.h>
#include <my_sys.h>
#include <mysys_err.h>
#include <my_virtual_mem.h>
#ifdef _AIX
# include <sys/shm.h>
#endif
/*
Functionality for handling virtual memory
- reserve range,
- commit memory (within reserved range)
- decommit previously committed memory
- release range
Not every OS has a "reserve" functionality, i.e. it is not always
possible to reserve memory larger than swap or RAM, for example.
We try to respect the use_large_pages setting on Windows and Linux.
*/
#ifdef _WIN32
/**
  Reserve a range of virtual address space (Microsoft Windows only).

  If my_use_large_pages is set, the range is reserved and committed in a
  single step using large pages. Should that fail, the request is retried
  with regular pages. The retry also passes MEM_COMMIT and leaves
  my_use_large_pages unchanged, so the whole range is committed up front
  in that case as well.

  @param size  pointer to the number of bytes to reserve (only read here)
  @return start address of the range
  @retval NULL on failure, after reporting EE_OUTOFMEMORY
*/
char *my_virtual_mem_reserve(size_t *size)
{
  const DWORD flags= my_use_large_pages
    ? MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT
    : MEM_RESERVE;
  char *ptr= VirtualAlloc(NULL, *size, flags, PAGE_READWRITE);

  if (!ptr && (flags & MEM_LARGE_PAGES))
  {
    /* Try without large pages */
    ptr= VirtualAlloc(NULL, *size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
  }

  /*
    Report the failure no matter which attempt was the last one; a failed
    plain MEM_RESERVE must not return NULL silently.
  */
  if (!ptr)
    my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), *size);
  return ptr;
}
#endif
#if defined _WIN32 && !defined DBUG_OFF
/**
  Debug-build helper for assertions (Microsoft Windows only):
  check whether the memory at an address is in the committed state.

  @param ptr   address to query with VirtualQuery()
  @param size  not used; only the region information returned for ptr
               is inspected
  @return whether the MEM_COMMIT bit is set in the reported state
*/
static my_bool is_memory_committed(char *ptr, size_t size)
{
  MEMORY_BASIC_INFORMATION region;
  const SIZE_T written= VirtualQuery(ptr, &region, sizeof region);

  /* VirtualQuery() returns 0 when it could not fill in the structure. */
  if (written == 0)
    DBUG_ASSERT(0);
  return (region.State & MEM_COMMIT) != 0;
}
#endif
/**
  Commit memory within a previously reserved or mapped range, making it
  readable and writable. On success, the committed size is added to the
  allocation accounting via update_malloc_size().

  @param ptr   start of the range to commit; must not be NULL
  @param size  number of bytes to commit
  @return ptr
  @retval NULL on failure, after reporting EE_OUTOFMEMORY
*/
char *my_virtual_mem_commit(char *ptr, size_t size)
{
  DBUG_ASSERT(ptr);
#ifdef _WIN32
  if (my_use_large_pages)
  {
    /* Nothing to do here: the memory is expected to be committed already. */
    DBUG_ASSERT(is_memory_committed(ptr, size));
  }
  else
  {
    void *p= VirtualAlloc(ptr, size, MEM_COMMIT, PAGE_READWRITE);
    /*
      Handle the failure before asserting on the returned address, so that
      debug builds report out-of-memory and return NULL instead of aborting.
    */
    if (!p)
    {
      my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size);
      return NULL;
    }
    DBUG_ASSERT(p == ptr);
  }
#else
  /*
    With large pages, my_large_virtual_alloc() already created
    a read/write mapping, and there is nothing to do.
  */
  if (!my_use_large_pages)
  {
# ifdef _AIX
    /*
      MAP_FIXED does not work on IBM AIX in the way it works elsewhere.
      Apparently, it is not possible to mmap(2) a range that is already
      in use, at least not by default.
      mprotect(2) is the fallback. It cannot communicate out-of-memory
      conditions, but it looks like overcommitting is not possible on
      AIX anyway.
    */
    if (mprotect(ptr, size, PROT_READ | PROT_WRITE))
    {
      my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size);
      return NULL;
    }
# else
    /* Replace the existing mapping in place with a fresh read/write one. */
    const int flags=
#  ifdef MAP_POPULATE
      MAP_POPULATE |
#  endif
      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED;
    void *p= mmap(ptr, size, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (p == MAP_FAILED)
    {
      my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size);
      return NULL;
    }
    DBUG_ASSERT(p == ptr);
#  if defined MADV_FREE_REUSABLE && defined MADV_FREE_REUSE /* Apple macOS */
    madvise(ptr, size, MADV_FREE_REUSE); /* cancel MADV_FREE_REUSABLE */
#  endif
# endif
  }
#endif
  update_malloc_size(size, 0);
  return ptr;
}
/**
  Decommit memory that had been made available by my_virtual_mem_commit().
  The address range is not unmapped here; see my_virtual_mem_release().
  The size is subtracted from the allocation accounting via
  update_malloc_size().

  Failures are written to the error log (EE_BADMEMORYRELEASE) and trip
  an assertion in debug builds; nothing is returned to the caller.

  NOTE(review): outside Microsoft Windows this path is executed also when
  my_use_large_pages is set, while my_virtual_mem_commit() does nothing
  in that case. Confirm that callers never decommit and then recommit
  a large-page mapping.

  @param ptr   start of the range to decommit
  @param size  number of bytes to decommit
*/
void my_virtual_mem_decommit(char *ptr, size_t size)
{
#ifdef _WIN32
  DBUG_ASSERT(is_memory_committed(ptr, size));
# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT
#  error "VirtualFree(MEM_DECOMMIT) will not allow subsequent reads!"
# endif
  /* Memory that was allocated with large pages is left committed. */
  if (!my_use_large_pages && !VirtualFree(ptr, size, MEM_DECOMMIT))
  {
    my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size,
             GetLastError());
    DBUG_ASSERT(0);
  }
#else
  /* The access rights of the range after its contents were given up */
# ifdef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT
  /* We will explicitly mark the memory inaccessible. */
  const int access_after= PROT_NONE;
# else
  /*
    In InnoDB, buf_pool_t::page_guess() may dereference pointers to
    this, assuming that either the original contents or zeroed
    contents is available.
  */
  const int access_after= PROT_READ;
# endif

  /* Hint to the operating system that the contents are no longer needed. */
# ifdef _AIX
  disclaim(ptr, size, DISCLAIM_ZEROMEM);
# elif defined __linux__ || defined __osf__
  /* OSF/1, Linux mimicking AIX disclaim() */
  madvise(ptr, size, MADV_DONTNEED);
# elif defined MADV_FREE_REUSABLE && defined MADV_FREE_REUSE
  /* Mac OS X 10.9; undocumented in Apple macOS */
  /* macOS mimicking AIX disclaim() */
  madvise(ptr, size, MADV_FREE_REUSABLE);
# elif defined MADV_PURGE /* Illumos */
  /* Illumos mimicking AIX disclaim() */
  madvise(ptr, size, MADV_PURGE);
# elif defined MADV_FREE
  /* FreeBSD, NetBSD, OpenBSD, Dragonfly BSD, OpenSolaris, Apple macOS */
  /* allow lazy zeroing out */
  madvise(ptr, size, MADV_FREE);
# elif defined MADV_DONTNEED
#  warning "It is unclear if madvise(MADV_DONTNEED) works as intended"
  madvise(ptr, size, MADV_DONTNEED);
# else
#  warning "Do not know how to decommit memory"
# endif

  if (mprotect(ptr, size, access_after))
  {
    my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno);
    DBUG_ASSERT(0);
  }
#endif
  update_malloc_size(-(longlong) size, 0);
}
/**
  Release an entire virtual memory range.

  On Microsoft Windows, VirtualFree(MEM_RELEASE) is invoked with a size
  of 0; elsewhere, the range is passed to munmap(). A failure is written
  to the error log (EE_BADMEMORYRELEASE) and trips an assertion in debug
  builds; nothing is returned to the caller.

  @param ptr   start of the range
  @param size  size of the range in bytes (used by munmap() and in the
               error message)
*/
void my_virtual_mem_release(char *ptr, size_t size)
{
#ifdef _WIN32
  /* Without large pages, the memory must not be committed any more. */
  DBUG_ASSERT(my_use_large_pages || !is_memory_committed(ptr, size));
  if (VirtualFree(ptr, 0, MEM_RELEASE))
    return;
  my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size,
           GetLastError());
#else
  if (!munmap(ptr, size))
    return;
  my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno);
#endif
  /* Only reached after a failure has been logged above. */
  DBUG_ASSERT(0);
}