From 1be4b121d514393e2305273436cde25acbf1a25d Mon Sep 17 00:00:00 2001 From: Vladislav Vaintroub Date: Sun, 25 Mar 2012 19:27:24 +0200 Subject: [PATCH] Small Windows specific performance fixes: - Use native memcmp() supplied with the C runtime instead of the hand-unrolled ptr_compare_N loop. Prior to the fix, ptr_compare_0() had 3.7% of samples in OLTP-RO in-memory. The fix brings this down to 1.8% (all memcmp samples). - InnoDB: fix UT_RELAX_CPU to be defined as YieldProcessor, as was originally intended (but the intention was lost in the #ifdef maze). This reduces the number of ut_delay() samples in the profile from 1.5% to 0.5%. --- mysys/ptr_cmp.c | 28 +++++++++++++++++----------- storage/innobase/include/ut0ut.h | 10 +++++----- storage/xtradb/include/ut0ut.h | 10 +++++----- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/mysys/ptr_cmp.c b/mysys/ptr_cmp.c index 6f9ab13c82b..a481b4d961c 100644 --- a/mysys/ptr_cmp.c +++ b/mysys/ptr_cmp.c @@ -21,17 +21,23 @@ #include "mysys_priv.h" #include - -#ifdef __sun /* - * On Solaris, memcmp() is normally faster than the unrolled ptr_compare_N + * On some platforms, memcmp() is faster than the unrolled ptr_compare_N * functions, as memcmp() is usually a platform-specific implementation - * written in assembler, provided in /usr/lib/libc/libc_hwcap*.so.1. - * This implementation is also usually faster than the built-in memcmp - * supplied by GCC, so it is recommended to build with "-fno-builtin-memcmp" - * in CFLAGS if building with GCC on Solaris. + * written in assembler, for example the one in /usr/lib/libc/libc_hwcap*.so.1 + * on Solaris, or the one inside the C runtime library on Windows. + * + * On Solaris, the native implementation is also usually faster than the + * built-in memcmp supplied by GCC, so it is recommended to build + * with "-fno-builtin-memcmp" in CFLAGS if building with GCC on Solaris.
*/ +#if defined (__sun) || defined (_WIN32) +#define USE_NATIVE_MEMCMP 1 +#endif + +#ifdef USE_NATIVE_MEMCMP + #include static int native_compare(size_t *length, unsigned char **a, unsigned char **b) @@ -39,7 +45,7 @@ static int native_compare(size_t *length, unsigned char **a, unsigned char **b) return memcmp(*a, *b, *length); } -#else /* __sun */ +#else /* USE_NATIVE_MEMCMP */ static int ptr_compare(size_t *compare_length, uchar **a, uchar **b); static int ptr_compare_0(size_t *compare_length, uchar **a, uchar **b); @@ -50,7 +56,7 @@ static int ptr_compare_3(size_t *compare_length, uchar **a, uchar **b); /* Get a pointer to a optimal byte-compare function for a given size */ -#ifdef __sun +#ifdef USE_NATIVE_MEMCMP qsort2_cmp get_ptr_compare (size_t size __attribute__((unused))) { return (qsort2_cmp) native_compare; @@ -68,7 +74,7 @@ qsort2_cmp get_ptr_compare (size_t size) } return 0; /* Impossible */ } -#endif /* __sun */ +#endif /* USE_NATIVE_MEMCMP */ /* @@ -78,7 +84,7 @@ qsort2_cmp get_ptr_compare (size_t size) #define cmp(N) if (first[N] != last[N]) return (int) first[N] - (int) last[N] -#ifndef __sun +#ifndef USE_NATIVE_MEMCMP static int ptr_compare(size_t *compare_length, uchar **a, uchar **b) { diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index cad39e9a34f..47ab6eb9b74 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -63,16 +63,16 @@ typedef time_t ib_time_t; # define UT_RELAX_CPU() __asm__ __volatile__ ("pause") #elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) # define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop") -#elif defined(HAVE_ATOMIC_BUILTINS) -# define UT_RELAX_CPU() do { \ - volatile lint volatile_var; \ - os_compare_and_swap_lint(&volatile_var, 0, 1); \ - } while (0) #elif defined(HAVE_WINDOWS_ATOMICS) /* In the Win32 API, the x86 PAUSE instruction is executed by calling the YieldProcessor macro defined in WinNT.h. 
It is a CPU architecture- independent way by using YieldProcessor. */ # define UT_RELAX_CPU() YieldProcessor() +#elif defined(HAVE_ATOMIC_BUILTINS) +# define UT_RELAX_CPU() do { \ + volatile lint volatile_var; \ + os_compare_and_swap_lint(&volatile_var, 0, 1); \ + } while (0) #else # define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */ #endif diff --git a/storage/xtradb/include/ut0ut.h b/storage/xtradb/include/ut0ut.h index cad39e9a34f..47ab6eb9b74 100644 --- a/storage/xtradb/include/ut0ut.h +++ b/storage/xtradb/include/ut0ut.h @@ -63,16 +63,16 @@ typedef time_t ib_time_t; # define UT_RELAX_CPU() __asm__ __volatile__ ("pause") #elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) # define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop") -#elif defined(HAVE_ATOMIC_BUILTINS) -# define UT_RELAX_CPU() do { \ - volatile lint volatile_var; \ - os_compare_and_swap_lint(&volatile_var, 0, 1); \ - } while (0) #elif defined(HAVE_WINDOWS_ATOMICS) /* In the Win32 API, the x86 PAUSE instruction is executed by calling the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- independent way by using YieldProcessor. */ # define UT_RELAX_CPU() YieldProcessor() +#elif defined(HAVE_ATOMIC_BUILTINS) +# define UT_RELAX_CPU() do { \ + volatile lint volatile_var; \ + os_compare_and_swap_lint(&volatile_var, 0, 1); \ + } while (0) #else # define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */ #endif