From e8351934b68d6d3ee273292eaa2ece203bb2b846 Mon Sep 17 00:00:00 2001 From: Daniel Black Date: Fri, 3 Apr 2020 06:54:08 +1100 Subject: [PATCH] Merge pull request #1221 from grooverdan/10.4-MDEV-18851-multiple-sized-large-page-support MDEV-18851: multiple sized large page support (linux) --- cmake/os/FreeBSD.cmake | 3 + cmake/os/Linux.cmake | 9 - cmake/os/Windows.cmake | 1 + cmake/os/WindowsCache.cmake | 2 - config.h.cmake | 7 +- configure.cmake | 7 +- extra/mariabackup/write_filt.cc | 4 +- include/my_bit.h | 9 + include/my_sys.h | 40 +- mysql-test/main/mysqld--help,win.rdiff | 16 - .../sys_vars/r/sysvars_server_embedded.result | 2 +- .../r/sysvars_server_notembedded.result | 2 +- mysys/CMakeLists.txt | 3 +- mysys/mf_keycache.c | 13 +- mysys/my_init.c | 6 - mysys/my_largepage.c | 520 +++++++++++++++--- mysys/my_static.c | 6 - mysys/my_wintoken.c | 42 ++ mysys/mysys_priv.h | 3 - sql/mysqld.cc | 84 +-- sql/sys_vars.cc | 9 +- storage/innobase/buf/buf0buf.cc | 11 +- storage/innobase/include/os0proc.h | 17 +- storage/innobase/include/ut0new.h | 52 +- storage/innobase/os/os0proc.cc | 146 ----- storage/innobase/row/row0log.cc | 17 +- storage/innobase/row/row0merge.cc | 5 +- storage/maria/ma_pagecache.c | 10 +- 28 files changed, 601 insertions(+), 445 deletions(-) create mode 100644 mysys/my_wintoken.c diff --git a/cmake/os/FreeBSD.cmake b/cmake/os/FreeBSD.cmake index b9c335e4ddf..a708765e42a 100644 --- a/cmake/os/FreeBSD.cmake +++ b/cmake/os/FreeBSD.cmake @@ -33,3 +33,6 @@ IF(EXECINFO) SET(LIBEXECINFO ${EXECINFO}) ENDIF() +INCLUDE(CheckSymbolExists) + +CHECK_SYMBOL_EXISTS(MAP_ALIGNED "sys/mman.h" HAVE_MMAP_ALIGNED) diff --git a/cmake/os/Linux.cmake b/cmake/os/Linux.cmake index f629661a5c9..0ce48cc20b7 100644 --- a/cmake/os/Linux.cmake +++ b/cmake/os/Linux.cmake @@ -40,12 +40,3 @@ ENDIF() # 64 bit file offset support flag SET(_FILE_OFFSET_BITS 64) - -# Linux specific HUGETLB /large page support -CHECK_SYMBOL_EXISTS(SHM_HUGETLB sys/shm.h HAVE_DECL_SHM_HUGETLB) 
-IF(HAVE_DECL_SHM_HUGETLB) - SET(HAVE_LINUX_LARGE_PAGES 1) - SET(HUGETLB_USE_PROC_MEMINFO 1) - SET(HAVE_LARGE_PAGE_OPTION 1) -ENDIF() - diff --git a/cmake/os/Windows.cmake b/cmake/os/Windows.cmake index 6845bd86a26..7181d57a6ac 100644 --- a/cmake/os/Windows.cmake +++ b/cmake/os/Windows.cmake @@ -281,6 +281,7 @@ ENDIF() SET(FN_NO_CASE_SENSE 1) SET(USE_SYMDIR 1) +SET(HAVE_LARGE_PAGE_OPTION 1) # Force static C runtime for targets in current directory # (useful to get rid of MFC dll's dependency, or in installer) diff --git a/cmake/os/WindowsCache.cmake b/cmake/os/WindowsCache.cmake index 0dd9a990335..e0d2aac163e 100644 --- a/cmake/os/WindowsCache.cmake +++ b/cmake/os/WindowsCache.cmake @@ -210,7 +210,6 @@ SET(HAVE_SYS_ERRLIST CACHE INTERNAL "") SET(HAVE_SYS_FILE_H CACHE INTERNAL "") SET(HAVE_SYS_FPU_H CACHE INTERNAL "") SET(HAVE_SYS_IOCTL_H CACHE INTERNAL "") -SET(HAVE_SYS_IPC_H CACHE INTERNAL "") SET(HAVE_SYS_MALLOC_H CACHE INTERNAL "") SET(HAVE_SYS_MMAN_H CACHE INTERNAL "") SET(HAVE_SYS_PARAM_H CACHE INTERNAL "") @@ -219,7 +218,6 @@ SET(HAVE_SYS_PTEM_H CACHE INTERNAL "") SET(HAVE_SYS_PTE_H CACHE INTERNAL "") SET(HAVE_SYS_RESOURCE_H CACHE INTERNAL "") SET(HAVE_SYS_SELECT_H CACHE INTERNAL "") -SET(HAVE_SYS_SHM_H CACHE INTERNAL "") SET(HAVE_SYS_SOCKIO_H CACHE INTERNAL "") SET(HAVE_SYS_SOCKET_H CACHE INTERNAL "") SET(HAVE_SYS_STAT_H 1 CACHE INTERNAL "") diff --git a/config.h.cmake b/config.h.cmake index 9eb067b2362..61c266b83fe 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -64,7 +64,6 @@ #cmakedefine HAVE_SYS_FILE_H 1 #cmakedefine HAVE_SYS_FPU_H 1 #cmakedefine HAVE_SYS_IOCTL_H 1 -#cmakedefine HAVE_SYS_IPC_H 1 #cmakedefine HAVE_SYS_MALLOC_H 1 #cmakedefine HAVE_SYS_MMAN_H 1 #cmakedefine HAVE_SYS_NDIR_H 1 @@ -73,7 +72,6 @@ #cmakedefine HAVE_SYS_PRCTL_H 1 #cmakedefine HAVE_SYS_RESOURCE_H 1 #cmakedefine HAVE_SYS_SELECT_H 1 -#cmakedefine HAVE_SYS_SHM_H 1 #cmakedefine HAVE_SYS_SOCKET_H 1 #cmakedefine HAVE_SYS_SOCKIO_H 1 #cmakedefine HAVE_SYS_UTSNAME_H 1 @@ -153,6 +151,7 
@@ #cmakedefine HAVE_GETHOSTBYADDR_R 1 #cmakedefine HAVE_GETHRTIME 1 #cmakedefine HAVE_GETPAGESIZE 1 +#cmakedefine HAVE_GETPAGESIZES 1 #cmakedefine HAVE_GETPASS 1 #cmakedefine HAVE_GETPASSPHRASE 1 #cmakedefine HAVE_GETPWNAM 1 @@ -184,6 +183,7 @@ #cmakedefine HAVE_MLOCKALL 1 #cmakedefine HAVE_MMAP 1 #cmakedefine HAVE_MMAP64 1 +#cmakedefine HAVE_MMAP_ALIGNED 1 #cmakedefine HAVE_PERROR 1 #cmakedefine HAVE_POLL 1 #cmakedefine HAVE_POSIX_FALLOCATE 1 @@ -387,9 +387,6 @@ #cmakedefine HAVE_GCC_C11_ATOMICS 1 #cmakedefine HAVE_SOLARIS_ATOMIC 1 -#cmakedefine HAVE_DECL_SHM_HUGETLB 1 -#cmakedefine HAVE_LINUX_LARGE_PAGES 1 -#cmakedefine HUGETLB_USE_PROC_MEMINFO 1 #cmakedefine NO_FCNTL_NONBLOCK 1 #cmakedefine NO_ALARM 1 diff --git a/configure.cmake b/configure.cmake index 43c32fda0ee..bb9c60b468d 100644 --- a/configure.cmake +++ b/configure.cmake @@ -217,13 +217,11 @@ CHECK_INCLUDE_FILES (sysent.h HAVE_SYSENT_H) CHECK_INCLUDE_FILES (sys/file.h HAVE_SYS_FILE_H) CHECK_INCLUDE_FILES (sys/fpu.h HAVE_SYS_FPU_H) CHECK_INCLUDE_FILES (sys/ioctl.h HAVE_SYS_IOCTL_H) -CHECK_INCLUDE_FILES ("sys/types.h;sys/ipc.h" HAVE_SYS_IPC_H) CHECK_INCLUDE_FILES ("sys/types.h;sys/malloc.h" HAVE_SYS_MALLOC_H) CHECK_INCLUDE_FILES (sys/mman.h HAVE_SYS_MMAN_H) CHECK_INCLUDE_FILES (sys/prctl.h HAVE_SYS_PRCTL_H) CHECK_INCLUDE_FILES (sys/resource.h HAVE_SYS_RESOURCE_H) CHECK_INCLUDE_FILES (sys/select.h HAVE_SYS_SELECT_H) -CHECK_INCLUDE_FILES ("sys/types.h;sys/shm.h" HAVE_SYS_SHM_H) CHECK_INCLUDE_FILES (sys/socket.h HAVE_SYS_SOCKET_H) CHECK_INCLUDE_FILES (sys/stat.h HAVE_SYS_STAT_H) CHECK_INCLUDE_FILES (sys/stream.h HAVE_SYS_STREAM_H) @@ -417,6 +415,10 @@ CHECK_FUNCTION_EXISTS (vsnprintf HAVE_VSNPRINTF) CHECK_FUNCTION_EXISTS (memalign HAVE_MEMALIGN) CHECK_FUNCTION_EXISTS (nl_langinfo HAVE_NL_LANGINFO) +IF(HAVE_MMAP) + SET(HAVE_LARGE_PAGE_OPTION 1) +ENDIF() + IF(HAVE_SYS_EVENT_H) CHECK_FUNCTION_EXISTS (kqueue HAVE_KQUEUE) ENDIF() @@ -459,6 +461,7 @@ CHECK_FUNCTION_EXISTS(time HAVE_TIME) 
#CHECK_SYMBOL_EXISTS(sys_errlist "stdio.h" HAVE_SYS_ERRLIST) CHECK_SYMBOL_EXISTS(madvise "sys/mman.h" HAVE_DECL_MADVISE) +CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES) CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME) CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48) CHECK_SYMBOL_EXISTS(getpagesize "unistd.h" HAVE_GETPAGESIZE) diff --git a/extra/mariabackup/write_filt.cc b/extra/mariabackup/write_filt.cc index 09470384684..8c4d2345f91 100644 --- a/extra/mariabackup/write_filt.cc +++ b/extra/mariabackup/write_filt.cc @@ -76,7 +76,7 @@ wf_incremental_init(xb_write_filt_ctxt_t *ctxt, char *dst_name, /* allocate buffer for incremental backup (4096 pages) */ cp->delta_buf_size = (cursor->page_size / 4) * cursor->page_size; - cp->delta_buf = (unsigned char *)os_mem_alloc_large(&cp->delta_buf_size); + cp->delta_buf = (unsigned char *)my_large_malloc(&cp->delta_buf_size, MYF(0)); if (!cp->delta_buf) { msg(cursor->thread_n,"Can't allocate %zu bytes", @@ -185,7 +185,7 @@ static void wf_incremental_deinit(xb_write_filt_ctxt_t *ctxt) { xb_wf_incremental_ctxt_t *cp = &(ctxt->u.wf_incremental_ctxt); - os_mem_free_large(cp->delta_buf, cp->delta_buf_size); + my_large_free(cp->delta_buf, cp->delta_buf_size); } /************************************************************************ diff --git a/include/my_bit.h b/include/my_bit.h index 863ad30fd7e..c2e5e41c4a1 100644 --- a/include/my_bit.h +++ b/include/my_bit.h @@ -70,6 +70,15 @@ static inline CONSTEXPR uint my_bit_log2_uint64(ulonglong value) my_bit_log2_uint32((uint32) (value >> 32)) + 32 : my_bit_log2_uint32((uint32) value); } +static inline CONSTEXPR uint my_bit_log2_size_t(size_t value) +{ +#ifdef __cplusplus + static_assert(sizeof(size_t) <= sizeof(ulonglong), + "size_t <= ulonglong is an assumption that needs to be fixed for this architecture. 
" + "Please create an issue on https://jira.mariadb.org"); +#endif + return my_bit_log2_uint64((ulonglong) value); +} /* diff --git a/include/my_sys.h b/include/my_sys.h index e2c10e7e553..f650b484cf4 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -176,15 +176,34 @@ extern void *my_memdup(PSI_memory_key key, const void *from,size_t length,myf My extern char *my_strdup(PSI_memory_key key, const char *from,myf MyFlags); extern char *my_strndup(PSI_memory_key key, const char *from, size_t length, myf MyFlags); -#ifdef HAVE_LINUX_LARGE_PAGES -extern uint my_get_large_page_size(void); -extern uchar * my_large_malloc(size_t size, myf my_flags); -extern void my_large_free(uchar *ptr); +#if defined(__linux__) || defined(HAVE_GETPAGESIZES) +extern size_t my_next_large_page_size(size_t sz, int *start); #else -#define my_get_large_page_size() (0) -#define my_large_malloc(A,B) my_malloc_lock((A),(B)) -#define my_large_free(A) my_free_lock((A)) -#endif /* HAVE_LINUX_LARGE_PAGES */ +#define my_next_large_page_size(A,B) (0) +#endif + +#if defined(_WIN32) || (defined(HAVE_MMAP) && !defined(__linux__) \ + && !defined(HAVE_MMAP_ALIGNED)) +extern void my_get_large_page_size(void); +#else +#define my_get_large_page_size() do {} while(0) +#endif + +#ifdef HAVE_LARGE_PAGE_OPTION +int my_init_large_pages(my_bool super_large_pages); +uchar * my_large_malloc(size_t *size, myf my_flags); +void my_large_free(void *ptr, size_t size); +#else +#define my_large_malloc(A,B) my_malloc_lock(*(A),(B)) +#define my_large_free(A,B) my_free_lock((A)) +#endif /* HAVE_LARGE_PAGE_OPTION */ + +#ifdef _WIN32 +extern BOOL my_obtain_privilege(LPCSTR lpPrivilege); +#else +#define my_obtain_privilege(A) (1) +#define SE_LOCK_MEMORY_NAME "SeLockMemoryPrivilege" +#endif void my_init_atomic_write(void); #ifdef __linux__ @@ -242,11 +261,6 @@ extern int sf_leaking_memory; /* set to 1 to disable memleak detection */ extern void (*proc_info_hook)(void *, const PSI_stage_info *, PSI_stage_info *, const char *, 
const char *, const unsigned int); -#ifdef HAVE_LINUX_LARGE_PAGES -extern my_bool my_use_large_pages; -extern uint my_large_page_size; -#endif - /* charsets */ #define MY_ALL_CHARSETS_SIZE 2048 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO *default_charset_info; diff --git a/mysql-test/main/mysqld--help,win.rdiff b/mysql-test/main/mysqld--help,win.rdiff index 77f6f76858f..91b0540c879 100644 --- a/mysql-test/main/mysqld--help,win.rdiff +++ b/mysql-test/main/mysqld--help,win.rdiff @@ -1,13 +1,5 @@ --- a/mysql-test/r/mysqld--help.result +++ b/mysql-test/r/mysqld--help.result -@@ -419,7 +419,6 @@ - The number of segments in a key cache - -L, --language=name Client error messages in given language. May be given as - a full path. Deprecated. Use --lc-messages-dir instead. -- --large-pages Enable support for large pages - --lc-messages=name Set the language used for the error messages. - -L, --lc-messages-dir=name - Directory where error messages are @@ -647,6 +646,7 @@ Use MySQL-5.6 (instead of MariaDB-5.3) format for TIME, DATETIME, TIMESTAMP columns. 
@@ -58,14 +50,6 @@ --transaction-alloc-block-size=# Allocation block size for transactions to be stored in binary log -@@ -1513,7 +1523,6 @@ - key-cache-division-limit 100 - key-cache-file-hash-size 512 - key-cache-segments 0 --large-pages FALSE - lc-messages en_US - lc-messages-dir MYSQL_SHAREDIR/ - lc-time-names en_US @@ -1587,6 +1596,7 @@ myisam-stats-method NULLS_UNEQUAL myisam-use-mmap FALSE diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result index 57f68d20066..2824f82cfff 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result @@ -1476,7 +1476,7 @@ COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME LARGE_PAGE_SIZE VARIABLE_SCOPE GLOBAL VARIABLE_TYPE INT UNSIGNED -VARIABLE_COMMENT If large page support is enabled, this shows the size of memory pages +VARIABLE_COMMENT Previously showed the size of large memory pages, unused since multiple page size support was added NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 1 diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result index bed0bf53bb4..f1b73b0e1c8 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result @@ -1576,7 +1576,7 @@ COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME LARGE_PAGE_SIZE VARIABLE_SCOPE GLOBAL VARIABLE_TYPE INT UNSIGNED -VARIABLE_COMMENT If large page support is enabled, this shows the size of memory pages +VARIABLE_COMMENT Previously showed the size of large memory pages, unused since multiple page size support was added NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 1 diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt index f4a6a6d55b3..b29256da4a0 100644 --- a/mysys/CMakeLists.txt +++ b/mysys/CMakeLists.txt @@ -50,6 +50,7 @@ 
SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c IF (WIN32) SET (MYSYS_SOURCES ${MYSYS_SOURCES} my_winthread.c + my_wintoken.c my_wincond.c my_winerr.c my_winfile.c @@ -65,7 +66,7 @@ IF(HAVE_ALARM) SET(MYSYS_SOURCES ${MYSYS_SOURCES} my_alarm.c) ENDIF() -IF(HAVE_LINUX_LARGE_PAGES) +IF(HAVE_LARGE_PAGE_OPTION) SET(MYSYS_SOURCES ${MYSYS_SOURCES} my_largepage.c) ENDIF() diff --git a/mysys/mf_keycache.c b/mysys/mf_keycache.c index 94b720b1006..ab085c8e3b7 100644 --- a/mysys/mf_keycache.c +++ b/mysys/mf_keycache.c @@ -162,6 +162,7 @@ typedef struct st_simple_key_cache_cb my_bool resize_in_flush; /* true during flush of resize operation */ my_bool can_be_used; /* usage of cache for read/write is allowed */ size_t key_cache_mem_size; /* specified size of the cache memory */ + size_t allocated_mem_size; /* size of the memory actually allocated */ uint key_cache_block_size; /* size of the page buffer of a cache block */ ulong min_warm_blocks; /* min number of warm blocks; */ ulong age_threshold; /* age threshold for hot blocks */ @@ -545,10 +546,8 @@ int init_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache, sizeof(BLOCK_LINK*)* (changed_blocks_hash_size*2))) + ((size_t) blocks * keycache->key_cache_block_size) > use_mem && blocks > 8) blocks--; - /* Allocate memory for cache page buffers */ - if ((keycache->block_mem= - my_large_malloc((size_t) blocks * keycache->key_cache_block_size, - MYF(0)))) + keycache->allocated_mem_size= blocks * keycache->key_cache_block_size; + if ((keycache->block_mem=my_large_malloc(&keycache->allocated_mem_size, MYF(0)))) { /* Allocate memory for blocks, hash_links and hash entries; @@ -570,7 +569,7 @@ int init_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache, changed_blocks_hash_size), NullS)) break; - my_large_free(keycache->block_mem); + my_large_free(keycache->block_mem, keycache->allocated_mem_size); keycache->block_mem= 0; } if (blocks < 8) @@ -631,7 +630,7 @@ err: keycache->blocks= 0; if (keycache->block_mem) { - 
my_large_free((uchar*) keycache->block_mem); + my_large_free((uchar*) keycache->block_mem, keycache->allocated_mem_size); keycache->block_mem= NULL; } if (keycache->block_root) @@ -965,7 +964,7 @@ void end_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache, my_bool cleanup) { if (keycache->block_mem) { - my_large_free((uchar*) keycache->block_mem); + my_large_free((uchar*) keycache->block_mem, keycache->allocated_mem_size); keycache->block_mem= NULL; my_free(keycache->block_root); keycache->block_root= NULL; diff --git a/mysys/my_init.c b/mysys/my_init.c index fdde04be084..4ae0cb9966c 100644 --- a/mysys/my_init.c +++ b/mysys/my_init.c @@ -430,16 +430,10 @@ static PSI_thread_info all_mysys_threads[]= }; -#ifdef HUGETLB_USE_PROC_MEMINFO -PSI_file_key key_file_proc_meminfo; -#endif /* HUGETLB_USE_PROC_MEMINFO */ PSI_file_key key_file_charset, key_file_cnf; static PSI_file_info all_mysys_files[]= { -#ifdef HUGETLB_USE_PROC_MEMINFO - { &key_file_proc_meminfo, "proc_meminfo", 0}, -#endif /* HUGETLB_USE_PROC_MEMINFO */ { &key_file_charset, "charset", 0}, { &key_file_cnf, "cnf", 0} }; diff --git a/mysys/my_largepage.c b/mysys/my_largepage.c index 33d0843cd58..c0d11ca6345 100644 --- a/mysys/my_largepage.c +++ b/mysys/my_largepage.c @@ -1,4 +1,5 @@ /* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + Copyright (c) 2019, 2020 IBM. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,53 +16,124 @@ #include "mysys_priv.h" -#ifdef HAVE_LINUX_LARGE_PAGES - -#ifdef HAVE_SYS_IPC_H -#include <sys/ipc.h> +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif +#ifdef __linux__ +#include <dirent.h> +#include <linux/mman.h> +#endif +#if defined(__linux__) || defined(HAVE_MMAP_ALIGNED) +#include "my_bit.h" #endif -#ifdef HAVE_SYS_SHM_H -#include <sys/shm.h> +#ifdef HAVE_SOLARIS_LARGE_PAGES +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif +#if defined(__sun__) && defined(__GNUC__) && defined(__cplusplus) \ + && defined(_XOPEN_SOURCE) +/* memcntl exists within sys/mman.h, but it under-defines what is needed to use it */ +extern int memcntl(caddr_t, size_t, int, caddr_t, int, int); +#endif /* __sun__ ... */ +#endif /* HAVE_SOLARIS_LARGE_PAGES */ + +#ifdef HAVE_LARGE_PAGE_OPTION +static my_bool my_use_large_pages= 0; +#else +#define my_use_large_pages 0 #endif -static uint my_get_large_page_size_int(void); -static uchar* my_large_malloc_int(size_t size, myf my_flags); -static my_bool my_large_free_int(uchar* ptr); +#if defined(__linux__) || defined(HAVE_GETPAGESIZES) +#define my_large_page_sizes_length 8 +static size_t my_large_page_sizes[my_large_page_sizes_length]; +static void my_get_large_page_sizes(size_t sizes[]); +#else +#define my_large_page_sizes_length 0 +#define my_get_large_page_sizes(A) do {} while(0) +#endif -/* Gets the size of large pages from the OS */ +static inline my_bool my_is_2pow(size_t n) { return !((n) & ((n) - 1)); } -uint my_get_large_page_size(void) +static uchar* my_large_malloc_int(size_t *size, myf my_flags); +static my_bool my_large_free_int(void *ptr, size_t size); + +#ifdef HAVE_LARGE_PAGE_OPTION + +int my_init_large_pages(my_bool super_large_pages) { - uint size; - DBUG_ENTER("my_get_large_page_size"); - - if (!(size = my_get_large_page_size_int())) - fprintf(stderr, "Warning: Failed to determine large page size\n"); + my_use_large_pages= 1; + 
my_get_large_page_sizes(my_large_page_sizes); + if (!my_obtain_privilege(SE_LOCK_MEMORY_NAME)) + { + fprintf(stderr, "mysqld: Lock Pages in memory access rights required for use with large-pages, " + "see https://mariadb.com/kb/en/library/mariadb-memory-allocation/#huge-pages"); + return 1; + } +#ifdef HAVE_SOLARIS_LARGE_PAGES +#define LARGE_PAGESIZE (4*1024*1024) /* 4MB */ +#define SUPER_LARGE_PAGESIZE (256*1024*1024) /* 256MB */ + /* + tell the kernel that we want to use 4/256MB page for heap storage + and also for the stack. We use 4 MByte as default and if the + super-large-page is set we increase it to 256 MByte. 256 MByte + is for server installations with GBytes of RAM memory where + the MySQL Server will have page caches and other memory regions + measured in a number of GBytes. + We use as big pages as possible which isn't bigger than the above + desired page sizes. + */ + int nelem= 0; + size_t max_desired_page_size; + size_t max_page_size= 0; + if (super_large_pages) + max_desired_page_size= SUPER_LARGE_PAGESIZE; + else + max_desired_page_size= LARGE_PAGESIZE; - DBUG_RETURN(size); + max_page_size= my_next_large_page_size(max_desired_page_size, &nelem); + if (max_page_size > 0) + { + struct memcntl_mha mpss; + + mpss.mha_cmd= MHA_MAPSIZE_BSSBRK; + mpss.mha_pagesize= max_page_size; + mpss.mha_flags= 0; + if (memcntl(NULL, 0, MC_HAT_ADVISE, (caddr_t)&mpss, 0, 0)) + { + perror("memcntl MC_HAT_ADVISE cmd MHA_MAPSIZE_BSSBRK error (continuing)"); + } + mpss.mha_cmd= MHA_MAPSIZE_STACK; + if (memcntl(NULL, 0, MC_HAT_ADVISE, (caddr_t)&mpss, 0, 0)) + { + perror("memcntl MC_HAT_ADVISE cmd MHA_MAPSIZE_STACK error (continuing)"); + } + } +#endif /* HAVE_SOLARIS_LARGE_PAGES */ + return 0; } /* General large pages allocator. Tries to allocate memory from large pages pool and falls back to - my_malloc_lock() in case of failure + my_malloc_lock() in case of failure. + Every implementation returns a zero filled buffer here. 
*/ -uchar* my_large_malloc(size_t size, myf my_flags) +uchar* my_large_malloc(size_t *size, myf my_flags) { uchar* ptr; DBUG_ENTER("my_large_malloc"); - if (my_use_large_pages && my_large_page_size) + if ((ptr= my_large_malloc_int(size, my_flags)) != NULL) { - if ((ptr = my_large_malloc_int(size, my_flags)) != NULL) - DBUG_RETURN(ptr); - if (my_flags & MY_WME) - fprintf(stderr, "Warning: Using conventional memory pool\n"); + MEM_MAKE_DEFINED(ptr, *size); + DBUG_RETURN(ptr); } + if (my_flags & MY_WME) + fprintf(stderr, "Warning: Using conventional memory pool\n"); - DBUG_RETURN(my_malloc_lock(size, my_flags)); + DBUG_RETURN(my_malloc_lock(*size, my_flags)); } /* @@ -70,7 +142,7 @@ uchar* my_large_malloc(size_t size, myf my_flags) to my_free_lock() in case of failure */ -void my_large_free(uchar* ptr) +void my_large_free(void *ptr, size_t size) { DBUG_ENTER("my_large_free"); @@ -79,88 +151,364 @@ void my_large_free(uchar* ptr) my_large_malloc_int(), i.e. my_malloc_lock() was used so we should free it with my_free_lock() */ - if (!my_use_large_pages || !my_large_page_size || !my_large_free_int(ptr)) + if (!my_large_free_int(ptr, size)) my_free_lock(ptr); + /* + For ASAN, we need to explicitly unpoison this memory region because the OS + may reuse that memory for some TLS or stack variable. It will remain + poisoned if it was explicitly poisoned before release. If this happens, + we'll have hard-to-debug false positives like in MDEV-21239. + For valgrind, we mark it as UNDEFINED rather than NOACCESS because of the + implicit reuse possibility. 
+ */ + else + MEM_UNDEFINED(ptr, size); DBUG_VOID_RETURN; } +#endif /* HAVE_LARGE_PAGE_OPTION */ -#ifdef HUGETLB_USE_PROC_MEMINFO -/* Linux-specific function to determine the size of large pages */ +#if defined(HAVE_GETPAGESIZES) || defined(__linux__) -uint my_get_large_page_size_int(void) +/* Descending sort */ + +static int size_t_cmp(const void *a, const void *b) { - MYSQL_FILE *f; - uint size = 0; - char buf[256]; - DBUG_ENTER("my_get_large_page_size_int"); - - if (!(f= mysql_file_fopen(key_file_proc_meminfo, "/proc/meminfo", - O_RDONLY, MYF(MY_WME)))) - goto finish; - - while (mysql_file_fgets(buf, sizeof(buf), f)) - if (sscanf(buf, "Hugepagesize: %u kB", &size)) - break; - - mysql_file_fclose(f, MYF(MY_WME)); - -finish: - DBUG_RETURN(size * 1024); + const size_t *ia= (const size_t *)a; // casting pointer types + const size_t *ib= (const size_t *)b; + if (*ib > *ia) + { + return 1; + } + else if (*ib < *ia) + { + return -1; + } + return 0; } -#endif /* HUGETLB_USE_PROC_MEMINFO */ -#if HAVE_DECL_SHM_HUGETLB -/* Linux-specific large pages allocator */ - -uchar* my_large_malloc_int(size_t size, myf my_flags) +/* + Returns the next large page size smaller or equal to the passed in size. + + The search starts at my_large_page_sizes[*start]. + + Assumes my_get_large_page_sizes(my_large_page_sizes) has been called before use. + + For first use, have *start=0. There is no need to increment *start. + + @param[in] sz size to be searched for. + @param[in,out] start ptr to int representing offset in my_large_page_sizes to start from. + *start is updated during search and can be used to search again if 0 isn't returned. + + @returns the next size found. *start will be incremented to the next potential size. + @retval a large page size that is valid on this system or 0 if no large page size possible. 
+*/ +size_t my_next_large_page_size(size_t sz, int *start) +{ + size_t cur; + DBUG_ENTER("my_next_large_page_size"); + + while (*start < my_large_page_sizes_length + && my_large_page_sizes[*start] > 0) + { + cur= *start; + (*start)++; + if (my_large_page_sizes[cur] <= sz) + { + DBUG_RETURN(my_large_page_sizes[cur]); + } + } + DBUG_RETURN(0); +} + +#endif /* defined(HAVE_GETPAGESIZES) || defined(__linux__) */ + +#ifdef __linux__ +/* Linux-specific function to determine the sizes of large pages */ + +static void my_get_large_page_sizes(size_t sizes[my_large_page_sizes_length]) +{ + DIR *dirp; + struct dirent *r; + int i= 0; + DBUG_ENTER("my_get_large_page_sizes"); + + dirp= opendir("/sys/kernel/mm/hugepages"); + if (dirp == NULL) + { + perror("Warning: failed to open /sys/kernel/mm/hugepages"); + } + else + { + while (i < my_large_page_sizes_length && + (r= readdir(dirp))) + { + if (strncmp("hugepages-", r->d_name, 10) == 0) + { + sizes[i]= strtoull(r->d_name + 10, NULL, 10) * 1024ULL; + if (!my_is_2pow(sizes[i])) + { + fprintf(stderr, "Warning: non-power of 2 large page size (%zu) found, skipping\n", sizes[i]); + sizes[i]= 0; + continue; + } + ++i; + } + } + if (closedir(dirp)) + { + perror("Warning: failed to close /sys/kernel/mm/hugepages"); + } + qsort(sizes, i, sizeof(size_t), size_t_cmp); + } + DBUG_VOID_RETURN; +} +#endif + +/* Multisized (Linux/FreeBSD) large pages allocator */ + +#if defined(__linux__) || defined(HAVE_MMAP_ALIGNED) +uchar* my_large_malloc_int(size_t *size, myf my_flags) { - int shmid; uchar* ptr; - struct shmid_ds buf; + int mapflag; + int page_i= 0; + size_t large_page_size= 0; + size_t aligned_size= *size; DBUG_ENTER("my_large_malloc_int"); - /* Align block size to my_large_page_size */ - size= MY_ALIGN(size, (size_t) my_large_page_size); - - shmid = shmget(IPC_PRIVATE, size, SHM_HUGETLB | SHM_R | SHM_W); - if (shmid < 0) + while (1) + { + mapflag= MAP_PRIVATE | MAP_ANONYMOUS; + if (my_use_large_pages) + { + large_page_size= 
my_next_large_page_size(*size, &page_i); + if (large_page_size) + { +#ifdef __linux__ + mapflag|= MAP_HUGETLB | my_bit_log2_size_t(large_page_size) << MAP_HUGE_SHIFT; +#else + mapflag|= MAP_ALIGNED_SUPER | MAP_ALIGNED(my_bit_log2_size_t(large_page_size)); +#endif + aligned_size= MY_ALIGN(*size, (size_t) large_page_size); + } + else + { + aligned_size= *size; + } + } + ptr= mmap(NULL, aligned_size, PROT_READ | PROT_WRITE, mapflag, -1, 0); + if (ptr == (void*) -1) + { + ptr= NULL; + if (my_flags & MY_WME) + { + if (large_page_size) + { + fprintf(stderr, + "Warning: Failed to allocate %zu bytes from HugeTLB memory" + "(page size %zu). errno %d\n", aligned_size, large_page_size, errno); + } + else + { + fprintf(stderr, + "Warning: Failed to allocate %zu bytes from memory." + " errno %d\n", aligned_size, errno); + } + } + /* try next smaller memory size */ + if (large_page_size && errno == ENOMEM) + continue; + + /* other errors are more serious */ + DBUG_RETURN(NULL); + } + else /* success */ + { + if (large_page_size) + { + /* + we do need to record the adjustment so that munmap gets called with + the right size. This is only the case for HUGETLB pages. 
+ */ + *size= aligned_size; + } + DBUG_RETURN(ptr); + } + if (large_page_size == 0) + { + break; /* no more options to try */ + } + } + DBUG_RETURN(ptr); +} + +#endif /* defined(__linux__) || defined(HAVE_MMAP_ALIGNED) */ + +#if defined(HAVE_GETPAGESIZES) && !defined(__linux__) +static void my_get_large_page_sizes(size_t sizes[my_large_page_sizes_length]) +{ + int nelem; + + nelem= getpagesizes(NULL, 0); + + assert(nelem <= my_large_page_sizes_length); + getpagesizes(sizes, my_large_page_sizes_length); + qsort(sizes, nelem, sizeof(size_t), size_t_cmp); + if (nelem < my_large_page_sizes_length) + { + sizes[nelem]= 0; + } +} +#endif + +#if defined(HAVE_MMAP) && !defined(_WIN32) + +/* mmap and Linux-specific large pages deallocator */ + +my_bool my_large_free_int(void *ptr, size_t size) +{ + DBUG_ENTER("my_large_free_int"); + + if (munmap(ptr, size)) + { + /* This occurs when the original allocation fell back to conventional memory so ignore the EINVAL error */ + if (errno != EINVAL) + { + fprintf(stderr, "Warning: Failed to unmap %zu bytes, errno %d\n", size, errno); + } + DBUG_RETURN(0); + } + DBUG_RETURN(1); +} +#endif /* HAVE_MMAP */ + +#if defined(HAVE_MMAP) && !defined(__linux__) && !defined(HAVE_MMAP_ALIGNED) \ + && !defined(_WIN32) + +/* Solaris for example has only MAP_ANON, FreeBSD has MAP_ANONYMOUS and +MAP_ANON but MAP_ANONYMOUS is marked "for compatibility" */ +#if defined(MAP_ANONYMOUS) +#define OS_MAP_ANON MAP_ANONYMOUS +#elif defined(MAP_ANON) +#define OS_MAP_ANON MAP_ANON +#else +#error unsupported mmap - no MAP_ANON{YMOUS} +#endif + +static size_t my_large_page_size= 0; + +/* mmap-specific function to determine the size of large pages + +This is a fudge as we only use this to ensure that mmap allocations +are of this size. 
+*/ + +void my_get_large_page_size(void) +{ + my_large_page_size= my_getpagesize(); +} + +/* mmap(non-Linux,non-FreeBSD) pages allocator */ + +uchar* my_large_malloc_int(size_t *size, myf my_flags) +{ + uchar* ptr; + int mapflag; + DBUG_ENTER("my_large_malloc_int"); + + mapflag= MAP_PRIVATE | OS_MAP_ANON; + + if (my_use_large_pages && my_large_page_size) + { + /* Align block size to my_large_page_size */ + *size= MY_ALIGN(*size, (size_t) my_large_page_size); + } + ptr= mmap(NULL, *size, PROT_READ | PROT_WRITE, mapflag, -1, 0); + if (ptr == (void*) -1) + { + ptr= NULL; + if (my_flags & MY_WME) + { + fprintf(stderr, + "Warning: Failed to allocate %zu bytes from memory." + " errno %d\n", *size, errno); + } + } + DBUG_RETURN(ptr); +} +#endif /* defined(HAVE_MMAP) && !defined(__linux__) && !defined(_WIN32) */ + +#ifdef _WIN32 +static size_t my_large_page_size= 0; + +/* Windows-specific function to determine the size of large pages */ + +void my_get_large_page_size(void) +{ + DBUG_ENTER("my_get_large_page_size_int"); + + my_large_page_size= my_use_large_pages ? GetLargePageMinimum() + : my_getpagesize(); + DBUG_VOID_RETURN; +} + +/* Windows-specific large pages allocator */ + +uchar* my_large_malloc_int(size_t *size, myf my_flags) +{ + DBUG_ENTER("my_large_malloc_int"); + void* ptr; + DWORD alloc_type= MEM_COMMIT | MEM_RESERVE; + size_t orig_size= *size; + + if (my_use_large_pages) + { + alloc_type|= MEM_LARGE_PAGES; + /* Align block size to my_large_page_size */ + *size= MY_ALIGN(*size, (size_t) my_large_page_size); + } + ptr= VirtualAlloc(NULL, *size, alloc_type, PAGE_READWRITE); + if (!ptr) { if (my_flags & MY_WME) + { fprintf(stderr, - "Warning: Failed to allocate %lu bytes from HugeTLB memory." - " errno %d\n", (ulong) size, errno); - - DBUG_RETURN(NULL); + "Warning: VirtualAlloc(%zu bytes%s) failed; Windows error %lu\n", + *size, + my_use_large_pages ? 
", MEM_LARGE_PAGES" : "", + GetLastError()); + } + *size= orig_size; + ptr= VirtualAlloc(NULL, *size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); + if (!ptr && my_flags & MY_WME) + { + fprintf(stderr, + "Warning: VirtualAlloc(%zu bytes) failed; Windows error %lu\n", + *size, GetLastError()); + } } - ptr = (uchar*) shmat(shmid, NULL, 0); - if (ptr == (uchar *) -1) - { - if (my_flags& MY_WME) - fprintf(stderr, "Warning: Failed to attach shared memory segment," - " errno %d\n", errno); - shmctl(shmid, IPC_RMID, &buf); - - DBUG_RETURN(NULL); - } - - /* - Remove the shared memory segment so that it will be automatically freed - after memory is detached or process exits - */ - shmctl(shmid, IPC_RMID, &buf); - DBUG_RETURN(ptr); } -/* Linux-specific large pages deallocator */ +/* Windows-specific large pages deallocator */ -my_bool my_large_free_int(uchar *ptr) +my_bool my_large_free_int(void *ptr, size_t size) { DBUG_ENTER("my_large_free_int"); - DBUG_RETURN(shmdt(ptr) == 0); -} -#endif /* HAVE_DECL_SHM_HUGETLB */ + /* + When RELEASE memory, the size parameter must be 0. + Do not use MEM_RELEASE with MEM_DECOMMIT. 
+ */ + if (ptr && !VirtualFree(ptr, 0, MEM_RELEASE)) + { + fprintf(stderr, + "Error: VirtualFree(%p, %zu) failed; Windows error %lu\n", ptr, size, GetLastError()); + DBUG_RETURN(0); + } + + DBUG_RETURN(1); +} +#endif /* _WIN32 */ -#endif /* HAVE_LINUX_LARGE_PAGES */ diff --git a/mysys/my_static.c b/mysys/my_static.c index a18b7adef61..cb0465929ab 100644 --- a/mysys/my_static.c +++ b/mysys/my_static.c @@ -91,12 +91,6 @@ const char *soundex_map= "01230120022455012623010202"; USED_MEM* my_once_root_block=0; /* pointer to first block */ uint my_once_extra=ONCE_ALLOC_INIT; /* Memory to alloc / block */ - /* from my_largepage.c */ -#ifdef HAVE_LINUX_LARGE_PAGES -my_bool my_use_large_pages= 0; -uint my_large_page_size= 0; -#endif - /* from my_alarm */ int volatile my_have_got_alarm=0; /* declare variable to reset */ ulong my_time_to_wait_for_lock=2; /* In seconds */ diff --git a/mysys/my_wintoken.c b/mysys/my_wintoken.c new file mode 100644 index 00000000000..78c4bc9dad5 --- /dev/null +++ b/mysys/my_wintoken.c @@ -0,0 +1,42 @@ +/* Copyright (c) 2019, IBM. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "mysys_priv.h" + +/** + Enable a named privilege in the access token of the current process. + @param lpPrivilege privilege name, passed to LookupPrivilegeValue() + @return TRUE if the privilege was enabled, FALSE otherwise +*/ +BOOL my_obtain_privilege(LPCSTR lpPrivilege) +{ + HANDLE hAccessToken; + TOKEN_PRIVILEGES token; + BOOL ret_value= FALSE; + + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hAccessToken)) + return FALSE; + + if (LookupPrivilegeValue(NULL, lpPrivilege, &token.Privileges[0].Luid)) + { + token.PrivilegeCount= 1; + token.Privileges[0].Attributes= SE_PRIVILEGE_ENABLED; + ret_value= AdjustTokenPrivileges(hAccessToken, FALSE, &token, 0, NULL, NULL) && + GetLastError() == ERROR_SUCCESS; + } + + CloseHandle(hAccessToken); /* release the token on every path past the open */ + return ret_value; +} diff --git a/mysys/mysys_priv.h b/mysys/mysys_priv.h index 240d66a3b51..5115e0452d5 100644 --- a/mysys/mysys_priv.h +++ b/mysys/mysys_priv.h @@ -101,9 +101,6 @@ extern mysql_mutex_t THR_LOCK_charset; #include #ifdef HAVE_PSI_INTERFACE -#ifdef HUGETLB_USE_PROC_MEMINFO -extern PSI_file_key key_file_proc_meminfo; -#endif /* HUGETLB_USE_PROC_MEMINFO */ extern PSI_file_key key_file_charset, key_file_cnf; #endif /* HAVE_PSI_INTERFACE */ diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 1497296e7cf..b7c93280004 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -167,15 +167,6 @@ extern "C" { // Because of SCO 3.2V4.2 #include #endif -#ifdef HAVE_SOLARIS_LARGE_PAGES -#if defined(__sun__) && defined(__GNUC__) && defined(__cplusplus) \ - && defined(_XOPEN_SOURCE) -extern int getpagesizes(size_t *, int); -extern int getpagesizes2(size_t *, int); -extern int memcntl(caddr_t, size_t, int, caddr_t, int, int); -#endif /* __sun__ ...
*/ -#endif /* HAVE_SOLARIS_LARGE_PAGES */ - #ifdef _AIX41 int initgroups(const char *,unsigned int); #endif @@ -3946,73 +3937,22 @@ static int init_common_variables() DBUG_PRINT("info",("%s Ver %s for %s on %s\n",my_progname, server_version, SYSTEM_TYPE,MACHINE_TYPE)); -#ifdef HAVE_LINUX_LARGE_PAGES +#ifdef HAVE_LARGE_PAGE_OPTION /* Initialize large page size */ if (opt_large_pages) { - SYSVAR_AUTOSIZE(opt_large_page_size, my_get_large_page_size()); - if (opt_large_page_size) + DBUG_PRINT("info", ("Large page set")); + if (my_init_large_pages(opt_super_large_pages)) { - DBUG_PRINT("info", ("Large page set, large_page_size = %d", - opt_large_page_size)); - my_use_large_pages= 1; - my_large_page_size= opt_large_page_size; + return 1; } - else - SYSVAR_AUTOSIZE(opt_large_pages, 0); } -#endif /* HAVE_LINUX_LARGE_PAGES */ -#ifdef HAVE_SOLARIS_LARGE_PAGES -#define LARGE_PAGESIZE (4*1024*1024) /* 4MB */ -#define SUPER_LARGE_PAGESIZE (256*1024*1024) /* 256MB */ - if (opt_large_pages) - { /* - tell the kernel that we want to use 4/256MB page for heap storage - and also for the stack. We use 4 MByte as default and if the - super-large-page is set we increase it to 256 MByte. 256 MByte - is for server installations with GBytes of RAM memory where - the MySQL Server will have page caches and other memory regions - measured in a number of GBytes. - We use as big pages as possible which isn't bigger than the above - desired page sizes. + my_get_large_page_size results used by large allocations even if not large pages. + This function must be called after my_init_large_pages. 
*/ - int nelem; - size_t max_desired_page_size; - if (opt_super_large_pages) - max_desired_page_size= SUPER_LARGE_PAGESIZE; - else - max_desired_page_size= LARGE_PAGESIZE; - nelem = getpagesizes(NULL, 0); - if (nelem > 0) - { - size_t *pagesize = (size_t *) malloc(sizeof(size_t) * nelem); - if (pagesize != NULL && getpagesizes(pagesize, nelem) > 0) - { - size_t max_page_size= 0; - for (int i= 0; i < nelem; i++) - { - if (pagesize[i] > max_page_size && - pagesize[i] <= max_desired_page_size) - max_page_size= pagesize[i]; - } - free(pagesize); - if (max_page_size > 0) - { - struct memcntl_mha mpss; - - mpss.mha_cmd= MHA_MAPSIZE_BSSBRK; - mpss.mha_pagesize= max_page_size; - mpss.mha_flags= 0; - memcntl(NULL, 0, MC_HAT_ADVISE, (caddr_t)&mpss, 0, 0); - mpss.mha_cmd= MHA_MAPSIZE_STACK; - memcntl(NULL, 0, MC_HAT_ADVISE, (caddr_t)&mpss, 0, 0); - } - } - } - } -#endif /* HAVE_SOLARIS_LARGE_PAGES */ - + my_get_large_page_size(); +#endif /* HAVE_LARGE_PAGE_OPTION */ #if defined(HAVE_POOL_OF_THREADS) if (IS_SYSVAR_AUTOSIZE(&threadpool_size)) @@ -5101,6 +5041,12 @@ static int init_server_components() 0, 0, 0, GET_NO_ARG, OPT_ARG, 0, 0, 0, 0, 0, 0}, {"timed-mutexes", OPT_DEPRECATED_OPTION, "", 0, 0, 0, GET_NO_ARG, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#if defined(__linux__) + /* Linux was the only large page OS that we've now removed the (always) + unused super-large-pages (because its Solaris only). 
*/ + {"super-large-pages", OPT_DEPRECATED_OPTION, "", + 0, 0, 0, GET_NO_ARG, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} }; /* @@ -6643,7 +6589,7 @@ struct my_option my_long_options[]= "mysql.gtid_slave_pos", >id_pos_auto_engines, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0 }, -#ifdef HAVE_LARGE_PAGE_OPTION +#ifdef HAVE_SOLARIS_LARGE_PAGES {"super-large-pages", 0, "Enable support for super large pages.", &opt_super_large_pages, &opt_super_large_pages, 0, GET_BOOL, OPT_ARG, 0, 0, 1, 0, 1, 0}, diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 8268ee26019..09a617cece1 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -1367,14 +1367,17 @@ static Sys_var_mybool Sys_large_files_support( static Sys_var_uint Sys_large_page_size( "large_page_size", - "If large page support is enabled, this shows the size of memory pages", + "Previously showed the size of large memory pages, unused since " + "multiple page size support was added", READ_ONLY GLOBAL_VAR(opt_large_page_size), NO_CMD_LINE, - VALID_RANGE(0, UINT_MAX), DEFAULT(0), BLOCK_SIZE(1)); + VALID_RANGE(0, UINT_MAX), DEFAULT(0), BLOCK_SIZE(1), + NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(0), + DEPRECATED("")); static Sys_var_mybool Sys_large_pages( "large_pages", "Enable support for large pages", READ_ONLY GLOBAL_VAR(opt_large_pages), - IF_WIN(NO_CMD_LINE, CMD_LINE(OPT_ARG)), DEFAULT(FALSE)); + CMD_LINE(OPT_ARG), DEFAULT(FALSE)); static Sys_var_charptr_fscs Sys_language( "lc_messages_dir", "Directory where error messages are", diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 0721ce2508b..c3cff2e561d 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -1380,6 +1380,8 @@ inline bool buf_pool_t::chunk_t::create(size_t bytes) /* Align a pointer to the first frame. 
Note that when opt_large_page_size is smaller than srv_page_size, + (with max srv_page_size at 64k don't think any hardware + makes this true), we may allocate one fewer block than requested. When it is bigger, we may allocate more blocks than requested. */ static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); @@ -1526,8 +1528,7 @@ bool buf_pool_t::create() for (auto i= chunk->size; i--; block++) buf_block_free_mutexes(block); - allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx, - chunk->mem_size()); + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); } ut_free(chunks); chunks= nullptr; @@ -1653,8 +1654,7 @@ void buf_pool_t::close() for (auto i= chunk->size; i--; block++) buf_block_free_mutexes(block); - allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx, - chunk->mem_size()); + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); } for (ulint i= BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) @@ -2279,8 +2279,7 @@ withdraw_retry: } allocator.deallocate_large_dodump( - chunk->mem, &chunk->mem_pfx, - chunk->mem_size()); + chunk->mem, &chunk->mem_pfx); sum_freed += chunk->size; ++chunk; } diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h index d8952a56cc9..2a507e013fe 100644 --- a/storage/innobase/include/os0proc.h +++ b/storage/innobase/include/os0proc.h @@ -39,7 +39,7 @@ typedef void* os_process_t; typedef unsigned long int os_process_id_t; /** The total amount of memory currently allocated from the operating -system with os_mem_alloc_large(). */ +system with allocate_large(). */ extern Atomic_counter os_total_large_mem_allocated; /** Converts the current process id to a number. @@ -47,19 +47,4 @@ extern Atomic_counter os_total_large_mem_allocated; ulint os_proc_get_number(void); -/** Allocates large pages memory. -@param[in,out] n Number of bytes to allocate -@return allocated memory */ -void* -os_mem_alloc_large( - ulint* n); - -/** Frees large pages memory. 
-@param[in] ptr pointer returned by os_mem_alloc_large() -@param[in] size size returned by os_mem_alloc_large() */ -void -os_mem_free_large( - void *ptr, - ulint size); - #endif diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h index e4ef9712e0f..c35808a56e2 100644 --- a/storage/innobase/include/ut0new.h +++ b/storage/innobase/include/ut0new.h @@ -128,14 +128,16 @@ InnoDB: #include /* malloc() */ #include /* strlen(), strrchr(), strncmp() */ +#include /* my_large_free/malloc() */ + #include "my_global.h" /* needed for headers from mysql/psi/ */ #include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */ #include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */ -#include "os0proc.h" /* os_mem_alloc_large() */ #include "os0thread.h" /* os_thread_sleep() */ +#include "os0proc.h" /* os_total_large_mem_allocated */ #include "ut0ut.h" /* ut_strcmp_functor, ut_basename_noext() */ #define OUT_OF_MEMORY_MSG \ @@ -622,7 +624,7 @@ public: ulint n_bytes = n_elements * sizeof(T); pointer ptr = reinterpret_cast( - os_mem_alloc_large(&n_bytes)); + my_large_malloc(&n_bytes, MYF(0))); if (ptr == NULL) { return NULL; @@ -637,6 +639,8 @@ public: pfx->m_size = n_bytes; } + os_total_large_mem_allocated += n_bytes; + return(ptr); } @@ -655,40 +659,26 @@ public: void deallocate_large( pointer ptr, - const ut_new_pfx_t* -#ifdef UNIV_PFS_MEMORY - pfx -#endif - , - size_t size) + const ut_new_pfx_t* pfx) { + size_t size = pfx->m_size; #ifdef UNIV_PFS_MEMORY if (pfx) { deallocate_trace(pfx); } #endif /* UNIV_PFS_MEMORY */ + os_total_large_mem_allocated -= size; - os_mem_free_large(ptr, size); + my_large_free(ptr, size); } void deallocate_large_dodump( pointer ptr, - const ut_new_pfx_t* -#ifdef UNIV_PFS_MEMORY - pfx -#endif - , - size_t size) + const ut_new_pfx_t* pfx) { - ut_dodump(ptr, size); - deallocate_large(ptr, -#ifdef UNIV_PFS_MEMORY - pfx, -#else - NULL, -#endif - size); + ut_dodump(ptr, pfx->m_size); + deallocate_large(ptr, pfx); } 
#ifdef UNIV_PFS_MEMORY @@ -941,9 +931,6 @@ ut_delete_array( #define ut_free(ptr) ut_allocator(PSI_NOT_INSTRUMENTED).deallocate( \ reinterpret_cast(ptr)) -#define ut_free_dodump(ptr, size) ut_allocator(PSI_NOT_INSTRUMENTED).deallocate_large_dodump( \ - reinterpret_cast(ptr), NULL, size) - #else /* UNIV_PFS_MEMORY */ /* Fallbacks when memory tracing is disabled at compile time. */ @@ -968,9 +955,13 @@ ut_delete_array( static inline void *ut_malloc_dontdump(size_t n_bytes, ...) { - void *ptr = os_mem_alloc_large(&n_bytes); + void *ptr = my_large_malloc(&n_bytes, MYF(0)); ut_dontdump(ptr, n_bytes, true); + + if (ptr) { + os_total_large_mem_allocated += n_bytes; + } return ptr; } @@ -982,12 +973,13 @@ static inline void *ut_malloc_dontdump(size_t n_bytes, ...) #define ut_free(ptr) ::free(ptr) +#endif /* UNIV_PFS_MEMORY */ + static inline void ut_free_dodump(void *ptr, size_t size) { ut_dodump(ptr, size); - os_mem_free_large(ptr, size); + os_total_large_mem_allocated -= size; + my_large_free(ptr, size); } -#endif /* UNIV_PFS_MEMORY */ - #endif /* ut0new_h */ diff --git a/storage/innobase/os/os0proc.cc b/storage/innobase/os/os0proc.cc index 32067297a24..8cea535ff2a 100644 --- a/storage/innobase/os/os0proc.cc +++ b/storage/innobase/os/os0proc.cc @@ -26,17 +26,6 @@ Created 9/30/1995 Heikki Tuuri *******************************************************/ #include "univ.i" -#ifdef HAVE_LINUX_LARGE_PAGES -# include "mysqld.h" -#endif - -/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and -MAP_ANON but MAP_ANON is marked as deprecated */ -#if defined(MAP_ANONYMOUS) -#define OS_MAP_ANON MAP_ANONYMOUS -#elif defined(MAP_ANON) -#define OS_MAP_ANON MAP_ANON -#endif /** The total amount of memory currently allocated from the operating system with os_mem_alloc_large(). */ @@ -54,138 +43,3 @@ os_proc_get_number(void) return(static_cast(getpid())); #endif } - -/** Allocates large pages memory. 
-@param[in,out] n Number of bytes to allocate -@return allocated memory */ -void* -os_mem_alloc_large( - ulint* n) -{ - void* ptr; - ulint size; -#ifdef HAVE_LINUX_LARGE_PAGES - int shmid; - struct shmid_ds buf; - - if (!my_use_large_pages || !opt_large_page_size) { - goto skip; - } - - /* Align block size to opt_large_page_size */ - ut_ad(ut_is_2pow(opt_large_page_size)); - size = ut_2pow_round(*n + opt_large_page_size - 1, - ulint(opt_large_page_size)); - - shmid = shmget(IPC_PRIVATE, (size_t) size, SHM_HUGETLB | SHM_R | SHM_W); - if (shmid < 0) { - ib::warn() << "Failed to allocate " << size - << " bytes. errno " << errno; - ptr = NULL; - } else { - ptr = shmat(shmid, NULL, 0); - if (ptr == (void*)-1) { - ib::warn() << "Failed to attach shared memory segment," - " errno " << errno; - ptr = NULL; - } - - /* Remove the shared memory segment so that it will be - automatically freed after memory is detached or - process exits */ - shmctl(shmid, IPC_RMID, &buf); - } - - if (ptr) { - *n = size; - os_total_large_mem_allocated += size; - UNIV_MEM_ALLOC(ptr, size); - return(ptr); - } - - ib::warn() << "Using conventional memory pool"; -skip: -#endif /* HAVE_LINUX_LARGE_PAGES */ - -#ifdef _WIN32 - SYSTEM_INFO system_info; - GetSystemInfo(&system_info); - - /* Align block size to system page size */ - ut_ad(ut_is_2pow(system_info.dwPageSize)); - size = *n = ut_2pow_round(*n + (system_info.dwPageSize - 1), - system_info.dwPageSize); - ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, - PAGE_READWRITE); - if (!ptr) { - ib::info() << "VirtualAlloc(" << size << " bytes) failed;" - " Windows error " << GetLastError(); - } else { - os_total_large_mem_allocated += size; - UNIV_MEM_ALLOC(ptr, size); - } -#else - size = getpagesize(); - /* Align block size to system page size */ - ut_ad(ut_is_2pow(size)); - size = *n = ut_2pow_round(*n + (size - 1), size); - ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | OS_MAP_ANON, -1, 0); - if (UNIV_UNLIKELY(ptr == (void*) 
-1)) { - ib::error() << "mmap(" << size << " bytes) failed;" - " errno " << errno; - ptr = NULL; - } else { - os_total_large_mem_allocated += size; - UNIV_MEM_ALLOC(ptr, size); - } -#endif - return(ptr); -} - -/** Frees large pages memory. -@param[in] ptr pointer returned by os_mem_alloc_large() -@param[in] size size returned by os_mem_alloc_large() */ -void -os_mem_free_large( - void *ptr, - ulint size) -{ - ut_a(os_total_large_mem_allocated >= size); - - // We could have manually poisoned that memory for ASAN. - // And we must unpoison it by ourself as specified in documentation - // for __asan_poison_memory_region() in sanitizer/asan_interface.h - // munmap() doesn't do it for us automatically. - UNIV_MEM_ALLOC(ptr, size); - -#ifdef HAVE_LINUX_LARGE_PAGES - if (my_use_large_pages && opt_large_page_size && !shmdt(ptr)) { - os_total_large_mem_allocated -= size; - return; - } -#endif /* HAVE_LINUX_LARGE_PAGES */ -#ifdef _WIN32 - /* When RELEASE memory, the size parameter must be 0. - Do not use MEM_RELEASE with MEM_DECOMMIT. 
*/ - if (!VirtualFree(ptr, 0, MEM_RELEASE)) { - ib::error() << "VirtualFree(" << ptr << ", " << size - << ") failed; Windows error " << GetLastError(); - } else { - os_total_large_mem_allocated -= size; - } -#elif !defined OS_MAP_ANON - ut_free(ptr); -#else -# if defined(UNIV_SOLARIS) - if (munmap(static_cast(ptr), size)) { -# else - if (munmap(ptr, size)) { -# endif /* UNIV_SOLARIS */ - ib::error() << "munmap(" << ptr << ", " << size << ") failed;" - " errno " << errno; - } else { - os_total_large_mem_allocated -= size; - } -#endif -} diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index f7e9cb18e5e..693e81e4341 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -211,11 +211,13 @@ struct row_log_t { row_log_buf_t tail; /*!< writer context; protected by mutex and index->lock S-latch, or by index->lock X-latch only */ + size_t crypt_tail_size; /*!< size of the crypt_tail buffer */ byte* crypt_tail; /*!< writer context; temporary buffer used in encryption, decryption or NULL*/ row_log_buf_t head; /*!< reader context; protected by MDL only; modifiable by row_log_apply_ops() */ + size_t crypt_head_size; /*!< size of the crypt_head buffer */ byte* crypt_head; /*!< reader context; temporary buffer used in encryption, decryption or NULL */ @@ -314,8 +316,7 @@ row_log_block_free( DBUG_ENTER("row_log_block_free"); if (log_buf.block != NULL) { ut_allocator(mem_key_row_log_buf).deallocate_large( - log_buf.block, &log_buf.block_pfx, - log_buf.size); + log_buf.block, &log_buf.block_pfx); log_buf.block = NULL; } DBUG_VOID_RETURN; @@ -3243,9 +3244,11 @@ row_log_allocate( index->online_log = log; if (log_tmp_is_encrypted()) { - ulint size = srv_sort_buf_size; - log->crypt_head = static_cast(os_mem_alloc_large(&size)); - log->crypt_tail = static_cast(os_mem_alloc_large(&size)); + log->crypt_head_size = log->crypt_tail_size = srv_sort_buf_size; + log->crypt_head = static_cast( + my_large_malloc(&log->crypt_head_size, MYF(MY_WME)));
+ log->crypt_tail = static_cast( + my_large_malloc(&log->crypt_tail_size, MYF(MY_WME))); if (!log->crypt_head || !log->crypt_tail) { row_log_free(log); @@ -3277,11 +3280,11 @@ row_log_free( row_merge_file_destroy_low(log->fd); if (log->crypt_head) { - os_mem_free_large(log->crypt_head, srv_sort_buf_size); + my_large_free(log->crypt_head, log->crypt_head_size); } if (log->crypt_tail) { - os_mem_free_large(log->crypt_tail, srv_sort_buf_size); + my_large_free(log->crypt_tail, log->crypt_tail_size); } mutex_free(&log->mutex); diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 1c3f52e1b4a..e4943195fb1 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -4745,11 +4745,10 @@ func_exit: ut_free(merge_files); - alloc.deallocate_large(block, &block_pfx, block_size); + alloc.deallocate_large(block, &block_pfx); if (crypt_block) { - alloc.deallocate_large(crypt_block, &crypt_pfx, - block_size); + alloc.deallocate_large(crypt_block, &crypt_pfx); } DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c index 8fa6646048d..0b2002267bf 100644 --- a/storage/maria/ma_pagecache.c +++ b/storage/maria/ma_pagecache.c @@ -849,9 +849,9 @@ size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem, (blocks << pagecache->shift) > use_mem && blocks > 8) blocks--; /* Allocate memory for cache page buffers */ + pagecache->mem_size= blocks * pagecache->block_size; if ((pagecache->block_mem= - my_large_malloc(blocks * pagecache->block_size, - MYF(MY_WME)))) + my_large_malloc(&pagecache->mem_size, MYF(MY_WME)))) { /* Allocate memory for blocks, hash_links and hash entries; @@ -875,7 +875,7 @@ size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem, changed_blocks_hash_size), NullS)) break; - my_large_free(pagecache->block_mem); + my_large_free(pagecache->block_mem, pagecache->mem_size); pagecache->block_mem= 0; } blocks= blocks / 4*3; @@ -926,7 
+926,7 @@ err: pagecache->blocks= 0; if (pagecache->block_mem) { - my_large_free(pagecache->block_mem); + my_large_free(pagecache->block_mem, pagecache->mem_size); pagecache->block_mem= NULL; } if (pagecache->block_root) @@ -1200,7 +1200,7 @@ void end_pagecache(PAGECACHE *pagecache, my_bool cleanup) if (pagecache->block_mem) { - my_large_free(pagecache->block_mem); + my_large_free(pagecache->block_mem, pagecache->mem_size); pagecache->block_mem= NULL; my_free(pagecache->block_root); pagecache->block_root= NULL;