MDEV-10814: Innodb large allocations - madvise - Don't dump

Note: Linux only

Core dumps of large buffer pool pages take time and space
and pose a potential data exposure risk in scenarios where
data-at-rest encryption is deployed.

Here we use madvise(MADV_DONT_DUMP) on large memory allocations
used by the innodb buffer pool, log_sys and recv_sys. The effect
of this system call is that these memory areas will not appear in
a core dump. Data from these buffers is rarely useful in fault
diagnosis.

log_sys and recv_sys structures now use large memory allocations
for their large buffer.

Debug builds don't include the madvise syscall and as such will
include full core dumps.

A function, buf_madvise_do_dump, is added but never called. It
is there to be called from a debugger to re-enable the core
dumping of all of these pages if for some reason the entire
contents of these buffers are needed.

Idea thanks to Hartmut Holzgraefe
This commit is contained in:
Daniel Black 2017-04-25 16:49:27 +10:00 committed by Daniel Black
parent 990289a78f
commit b600f30786
8 changed files with 180 additions and 41 deletions

View file

@ -1173,6 +1173,57 @@ buf_page_is_corrupted(
}
#ifndef UNIV_INNOCHECKSUM
#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
/** Enable buffers to be dumped to core files
A convience function, not called anyhwere directly however
it is left available for gdb or any debugger to call
in the event that you want all of the memory to be dumped
to a core file.
Returns number of errors found in madvise calls. */
int
buf_madvise_do_dump()
{
int ret= 0;
buf_pool_t* buf_pool;
ulint n;
buf_chunk_t* chunk;
/* mirrors allocation in log_sys_init() */
if (log_sys->buf)
{
ret+= madvise(log_sys->first_in_use ? log_sys->buf
: log_sys->buf - log_sys->buf_size,
log_sys->buf_size,
MADV_DODUMP);
}
/* mirrors recv_sys_init() */
if (recv_sys->buf)
{
ret+= madvise(recv_sys->buf, recv_sys->len, MADV_DODUMP);
}
buf_pool_mutex_enter_all();
for (int i= 0; i < srv_buf_pool_instances; i++)
{
buf_pool = buf_pool_from_array(i);
chunk = buf_pool->chunks;
for (int n = buf_pool->n_chunks; n--; chunk++)
{
ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
}
}
buf_pool_mutex_exit_all();
return ret;
}
#endif
/** Dump a page to stderr.
@param[in] read_buf database page
@param[in] page_size page size */
@ -1502,7 +1553,7 @@ buf_chunk_init(
DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return(NULL););
chunk->mem = buf_pool->allocator.allocate_large(mem_size,
&chunk->mem_pfx);
&chunk->mem_pfx, true);
if (UNIV_UNLIKELY(chunk->mem == NULL)) {
@ -1796,7 +1847,8 @@ buf_pool_init_instance(
}
buf_pool->allocator.deallocate_large(
chunk->mem, &chunk->mem_pfx);
chunk->mem, &chunk->mem_pfx, chunk->mem_size(),
true);
}
ut_free(buf_pool->chunks);
buf_pool_mutex_exit(buf_pool);
@ -1943,7 +1995,7 @@ buf_pool_free_instance(
}
buf_pool->allocator.deallocate_large(
chunk->mem, &chunk->mem_pfx);
/* args: ptr, pfx, size of the mapping, dodump */
chunk->mem, &chunk->mem_pfx, chunk->mem_size(), true);
}
for (ulint i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) {
@ -2819,7 +2871,7 @@ withdraw_retry:
}
buf_pool->allocator.deallocate_large(
chunk->mem, &chunk->mem_pfx);
/* args: ptr, pfx, size of the mapping, dodump */
chunk->mem, &chunk->mem_pfx, chunk->mem_size(), true);
sum_freed += chunk->size;

View file

@ -614,15 +614,15 @@ struct log_t{
mtr_commit and still ensure that
insertions in the flush_list happen
in the LSN order. */
byte* buf_ptr; /*!< unaligned log buffer, which should
be of double of buf_size */
byte* buf; /*!< log buffer currently in use;
this could point to either the first
half of the aligned(buf_ptr) or the
byte* buf; /*!< Memory of double the buf_size is
allocated here. This pointer will change
however to either the first half or the
second half in turns, so that log
write/flush to disk don't block
concurrent mtrs which will write
log to this buffer */
log to this buffer. Care to switch back
to the first half before freeing/resizing
must be undertaken. */
bool first_in_use; /*!< true if buf points to the first
half of the double-sized log buffer
allocation, false if the second half */

View file

@ -217,6 +217,7 @@ struct recv_sys_t{
/*!< this is TRUE when a log rec application
batch is running */
byte* buf; /*!< buffer for parsing log records */
size_t buf_size; /*!< size of buf */
ulint len; /*!< amount of data in buf */
lsn_t parse_start_lsn;
/*!< this is the lsn from which we were able to

View file

@ -129,6 +129,10 @@ InnoDB:
#include <string.h> /* strlen(), strrchr(), strncmp() */
#include "my_global.h" /* needed for headers from mysql/psi/ */
#ifdef HAVE_MADVISE
#include <sys/mman.h> /* madvise(), MADV_DONTDUMP, MADV_DODUMP */
#endif
/* JAN: TODO: missing 5.7 header */
#ifdef HAVE_MYSQL_MEMORY_H
#include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */
@ -234,6 +238,45 @@ struct ut_new_pfx_t {
#endif
};
/** Post-process a freshly obtained large allocation: optionally advise
the OS to leave it out of core dumps, then record its size in the
caller's descriptor.
@param[in]	ptr		start of the allocation; asserted non-NULL
@param[in]	bytes		length of the allocation
@param[in]	dontdump	if true, apply madvise(MADV_DONTDUMP); only
				compiled in when DBUG_OFF, HAVE_MADVISE and
				MADV_DONTDUMP are all defined
@param[out]	pfx		descriptor that receives the size, or NULL
				when the caller keeps no descriptor
@param[in]	file		file name forwarded to allocate_trace() */
static inline void ut_allocate_trace_dontdump(void *	ptr,
					      size_t	bytes,
					      bool	dontdump,
					      ut_new_pfx_t*	pfx,
					      const char*	file)
{
	ut_a(ptr != NULL);

#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DONTDUMP)
	if (dontdump && madvise(ptr, bytes, MADV_DONTDUMP)) {
		/* Capture errno right away: building the log message may
		itself call functions that overwrite it. */
		const int	err = errno;

		ib::warn() << "Failed to set memory to DONTDUMP: "
			   << strerror(err)
			   << " ptr " << ptr
			   << " size " << bytes;
	}
#endif
	if (pfx == NULL) {
		return;
	}

#ifdef UNIV_PFS_MEMORY
	/* NOTE(review): allocate_trace() is called unqualified from inside
	ut_allocator elsewhere, so it looks like a member function; confirm
	this free function still compiles with UNIV_PFS_MEMORY defined. */
	allocate_trace(bytes, file, pfx);
#endif /* UNIV_PFS_MEMORY */
	pfx->m_size = bytes;
}
/** Advise the OS to include a memory range in core dumps again.
Does nothing for a NULL pointer. The madvise(MADV_DODUMP) call is only
compiled in when DBUG_OFF, HAVE_MADVISE and MADV_DODUMP are all defined;
a failure is logged as a warning and otherwise ignored.
@param[in]	ptr	start of the memory range, or NULL
@param[in]	m_size	length of the memory range */
static inline void ut_dodump(void* ptr, size_t m_size)
{
	if (ptr == NULL) {
		return;
	}

#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
	if (madvise(ptr, m_size, MADV_DODUMP)) {
		/* Capture errno right away: building the log message may
		itself call functions that overwrite it. */
		const int	err = errno;

		ib::warn() << "Failed to set memory to DODUMP: "
			   << strerror(err)
			   << " ptr " << ptr
			   << " size " << m_size;
	}
#endif
}
/** Allocator class for allocating memory from inside std::* containers.
@tparam T type of allocated object
@tparam oom_fatal whether to commit suicide when running out of memory */
@ -294,6 +337,7 @@ public:
@param[in] file file name of the caller
@param[in] set_to_zero if true, then the returned memory is
initialized with 0x0 bytes.
@param[in] throw_on_error if true, raise an exception if too big
@return pointer to the allocated memory */
pointer
allocate(
@ -566,6 +610,8 @@ public:
/** Allocate a large chunk of memory that can hold 'n_elements'
objects of type 'T' and trace the allocation.
@param[in] n_elements number of elements
@param[in] dontdump if true, advise the OS not to core
dump this memory.
@param[out] pfx storage for the description of the
allocated memory. The caller must provide space for this one and keep
it until the memory is no longer needed and then pass it to
@ -574,7 +620,8 @@ public:
pointer
allocate_large(
size_type n_elements,
ut_new_pfx_t* pfx)
ut_new_pfx_t* pfx,
bool dontdump = false)
{
if (n_elements == 0 || n_elements > max_size()) {
return(NULL);
@ -585,13 +632,11 @@ public:
pointer ptr = reinterpret_cast<pointer>(
os_mem_alloc_large(&n_bytes));
#ifdef UNIV_PFS_MEMORY
if (ptr != NULL) {
allocate_trace(n_bytes, NULL, pfx);
if (ptr == NULL) {
return NULL;
}
#else
pfx->m_size = n_bytes;
#endif /* UNIV_PFS_MEMORY */
ut_allocate_trace_dontdump(ptr, n_bytes, dontdump, pfx, NULL);
return(ptr);
}
@ -600,17 +645,26 @@ public:
deallocation.
@param[in,out] ptr pointer to memory to free
@param[in] pfx descriptor of the memory, as returned by
allocate_large(). */
allocate_large().
@param[in] size size of the memory block, passed on to
os_mem_free_large()
@param[in] dodump if true, advise the OS to include this
memory again if a core dump occurs. */
void
deallocate_large(
pointer ptr,
const ut_new_pfx_t* pfx)
const ut_new_pfx_t* pfx,
size_t size,
bool dodump = false)
{
if (dodump) {
ut_dodump(ptr, size);
}
#ifdef UNIV_PFS_MEMORY
deallocate_trace(pfx);
if (pfx) {
deallocate_trace(pfx);
}
#endif /* UNIV_PFS_MEMORY */
os_mem_free_large(ptr, pfx->m_size);
os_mem_free_large(ptr, size);
}
#ifdef UNIV_PFS_MEMORY
@ -842,6 +896,10 @@ ut_delete_array(
ut_allocator<byte>(key).allocate( \
n_bytes, NULL, __FILE__, false, false))
/* Large allocation that is advised to be left out of core dumps. No
descriptor is kept (pfx is NULL), so the caller has to remember the size
and hand it to ut_free_dodump(). */
#define ut_malloc_dontdump(n_bytes) static_cast<void*>( \
	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate_large( \
		n_bytes, NULL, true))
#define ut_zalloc(n_bytes, key) static_cast<void*>( \
ut_allocator<byte>(key).allocate( \
n_bytes, NULL, __FILE__, true, false))
@ -865,6 +923,10 @@ ut_delete_array(
#define ut_free(ptr) ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \
reinterpret_cast<byte*>(ptr))
/* Counterpart of ut_malloc_dontdump(): re-enable core dumping for the
block and free it. deallocate_large() returns void, so the expansion is a
plain call expression; ptr is cast the same way ut_free() does it. */
#define ut_free_dodump(ptr, size) \
	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate_large( \
		reinterpret_cast<byte*>(ptr), NULL, size, true)
#else /* UNIV_PFS_MEMORY */
/* Fallbacks when memory tracing is disabled at compile time. */
@ -887,6 +949,14 @@ ut_delete_array(
#define ut_malloc_nokey(n_bytes) ::malloc(n_bytes)
/** Allocate a large block that is advised to be left out of core dumps
(fallback used when memory tracing is disabled at compile time).
The block is obtained from os_mem_alloc_large() and handed to
ut_allocate_trace_dontdump(), which asserts that it is not NULL.
@param[in]	n_bytes	requested size
@return the block returned by os_mem_alloc_large() */
static inline void *ut_malloc_dontdump(size_t n_bytes)
{
	/* os_mem_alloc_large() takes the size by pointer; use whatever
	value it leaves there for the madvise step. */
	size_t		alloc_len = n_bytes;
	void* const	mem = os_mem_alloc_large(&alloc_len);

	ut_allocate_trace_dontdump(mem, alloc_len, true, NULL, NULL);

	return(mem);
}
#define ut_zalloc_nokey(n_bytes) ::calloc(1, n_bytes)
#define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes)
@ -895,6 +965,12 @@ ut_delete_array(
#define ut_free(ptr) ::free(ptr)
/** Free a block obtained from ut_malloc_dontdump() (fallback used when
memory tracing is disabled at compile time).
@param[in,out]	ptr	block to free
@param[in]	size	size handed to ut_dodump() and os_mem_free_large() */
static inline void ut_free_dodump(void *ptr, size_t size)
{
	/* Make the range dumpable again before it goes back to the OS. */
	ut_dodump(ptr, size);

	os_mem_free_large(ptr, size);
}
#endif /* UNIV_PFS_MEMORY */
#endif /* ut0new_h */

View file

@ -226,16 +226,18 @@ log_buffer_extend(
log_sys->buf_free -= move_start;
log_sys->buf_next_to_write -= move_start;
/* free previous after getting the right address */
if (!log_sys->first_in_use) {
log_sys->buf -= log_sys->buf_size;
}
ut_free_dodump(log_sys->buf, log_sys->buf_size * 2);
/* reallocate log buffer */
srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1;
ut_free(log_sys->buf_ptr);
log_sys->buf_size = LOG_BUFFER_SIZE;
log_sys->buf_ptr = static_cast<byte*>(
ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
log_sys->buf = static_cast<byte*>(
ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
ut_malloc_dontdump(log_sys->buf_size * 2));
log_sys->first_in_use = true;
@ -723,10 +725,8 @@ log_sys_init()
log_sys->buf_size = LOG_BUFFER_SIZE;
log_sys->buf_ptr = static_cast<byte*>(
ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE));
log_sys->buf = static_cast<byte*>(
ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
ut_malloc_dontdump(log_sys->buf_size * 2));
log_sys->first_in_use = true;
@ -1085,12 +1085,12 @@ log_buffer_switch()
OS_FILE_LOG_BLOCK_SIZE);
if (log_sys->first_in_use) {
ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
ut_ad(log_sys->buf == ut_align(log_sys->buf,
OS_FILE_LOG_BLOCK_SIZE));
log_sys->buf += log_sys->buf_size;
} else {
log_sys->buf -= log_sys->buf_size;
ut_ad(log_sys->buf == ut_align(log_sys->buf_ptr,
ut_ad(log_sys->buf == ut_align(log_sys->buf,
OS_FILE_LOG_BLOCK_SIZE));
}
@ -2254,8 +2254,10 @@ log_shutdown()
{
log_group_close_all();
ut_free(log_sys->buf_ptr);
log_sys->buf_ptr = NULL;
if (!log_sys->first_in_use) {
log_sys->buf -= log_sys->buf_size;
}
ut_free_dodump(log_sys->buf, log_sys->buf_size * 2);
log_sys->buf = NULL;
ut_free(log_sys->checkpoint_buf_ptr);
log_sys->checkpoint_buf_ptr = NULL;

View file

@ -438,7 +438,9 @@ recv_sys_close()
os_event_destroy(recv_sys->flush_end);
}
ut_free(recv_sys->buf);
if (recv_sys->buf != NULL) {
ut_free_dodump(recv_sys->buf, recv_sys->buf_size);
}
ut_ad(!recv_writer_thread_active);
mutex_free(&recv_sys->writer_mutex);
@ -553,7 +555,8 @@ recv_sys_init()
}
recv_sys->buf = static_cast<byte*>(
ut_malloc_nokey(RECV_PARSING_BUF_SIZE));
ut_malloc_dontdump(RECV_PARSING_BUF_SIZE));
recv_sys->buf_size = RECV_PARSING_BUF_SIZE;
recv_sys->addr_hash = hash_create(size / 512);
recv_sys->progress_time = ut_time();
@ -588,8 +591,9 @@ recv_sys_debug_free(void)
hash_table_free(recv_sys->addr_hash);
mem_heap_free(recv_sys->heap);
ut_free(recv_sys->buf);
ut_free_dodump(recv_sys->buf, recv_sys->buf_size);
recv_sys->buf_size = 0;
recv_sys->buf = NULL;
recv_sys->heap = NULL;
recv_sys->addr_hash = NULL;

View file

@ -71,6 +71,7 @@ enum row_op {
/** Log block for modifications during online ALTER TABLE */
struct row_log_buf_t {
byte* block; /*!< file block buffer */
size_t size; /*!< length of block in bytes */
ut_new_pfx_t block_pfx; /*!< opaque descriptor of "block". Set
by ut_allocator::allocate_large() and fed to
ut_allocator::deallocate_large(). */
@ -265,6 +266,7 @@ row_log_block_allocate(
if (log_buf.block == NULL) {
DBUG_RETURN(false);
}
log_buf.size = srv_sort_buf_size;
}
DBUG_RETURN(true);
}
@ -279,7 +281,7 @@ row_log_block_free(
DBUG_ENTER("row_log_block_free");
if (log_buf.block != NULL) {
ut_allocator<byte>(mem_key_row_log_buf).deallocate_large(
log_buf.block, &log_buf.block_pfx);
log_buf.block, &log_buf.block_pfx, log_buf.size);
log_buf.block = NULL;
}
DBUG_VOID_RETURN;

View file

@ -4619,6 +4619,7 @@ row_merge_build_indexes(
merge_file_t* merge_files;
row_merge_block_t* block;
ut_new_pfx_t block_pfx;
size_t block_size;
ut_new_pfx_t crypt_pfx;
row_merge_block_t* crypt_block = NULL;
ulint i;
@ -4654,7 +4655,8 @@ row_merge_build_indexes(
/* This will allocate "3 * srv_sort_buf_size" elements of type
row_merge_block_t. The latter is defined as byte. */
block = alloc.allocate_large(3 * srv_sort_buf_size, &block_pfx);
block_size = 3 * srv_sort_buf_size;
block = alloc.allocate_large(block_size, &block_pfx);
if (block == NULL) {
DBUG_RETURN(DB_OUT_OF_MEMORY);
@ -4664,7 +4666,7 @@ row_merge_build_indexes(
if (log_tmp_is_encrypted()) {
crypt_block = static_cast<row_merge_block_t*>(
alloc.allocate_large(3 * srv_sort_buf_size,
alloc.allocate_large(block_size,
&crypt_pfx));
if (crypt_block == NULL) {
@ -5035,10 +5037,10 @@ func_exit:
ut_free(merge_files);
alloc.deallocate_large(block, &block_pfx);
alloc.deallocate_large(block, &block_pfx, block_size);
if (crypt_block) {
alloc.deallocate_large(crypt_block, &crypt_pfx);
alloc.deallocate_large(crypt_block, &crypt_pfx, block_size);
}
DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID);