Small Windows specific performance fixes:

- Use native memcmp() supplied with C runtime instead of hand-unrolled loop ptr_compare_N loop
Prior to fix  ptr_compare_0()  has  3.7% samples in OLTP-RO in-memory. 
Fix brings this down to 1.8% (all memcmp samples)

- Innodb : fix UT_RELAX_CPU to  be defined  as YieldProcessor, as  was also originally intended 
(but intention was lost in the #ifdef maze

This reduces number of ut_delay() samples in profile from 1.5% to 0.5%
This commit is contained in:
Vladislav Vaintroub 2012-03-25 19:27:24 +02:00
parent 99aa3d465e
commit 1be4b121d5
3 changed files with 27 additions and 21 deletions
mysys
storage
innobase/include
xtradb/include

View file

@ -21,17 +21,23 @@
#include "mysys_priv.h"
#include <myisampack.h>
#ifdef __sun
/*
* On Solaris, memcmp() is normally faster than the unrolled ptr_compare_N
* On some platforms, memcmp() is faster than the unrolled ptr_compare_N
* functions, as memcmp() is usually a platform-specific implementation
* written in assembler, provided in /usr/lib/libc/libc_hwcap*.so.1.
* This implementation is also usually faster than the built-in memcmp
* supplied by GCC, so it is recommended to build with "-fno-builtin-memcmp"
* in CFLAGS if building with GCC on Solaris.
* written in assembler. for example one in /usr/lib/libc/libc_hwcap*.so.1.
* on Solaris, or on Windows inside C runtime linrary.
*
* On Solaris, native implementation is also usually faster than the
* built-in memcmp supplied by GCC, so it is recommended to build
* with "-fno-builtin-memcmp"in CFLAGS if building with GCC on Solaris.
*/
#if defined (__sun) || defined (_WIN32)
#define USE_NATIVE_MEMCMP 1
#endif
#ifdef USE_NATIVE_MEMCMP
#include <string.h>
static int native_compare(size_t *length, unsigned char **a, unsigned char **b)
@ -39,7 +45,7 @@ static int native_compare(size_t *length, unsigned char **a, unsigned char **b)
return memcmp(*a, *b, *length);
}
#else /* __sun */
#else /* USE_NATIVE_MEMCMP */
static int ptr_compare(size_t *compare_length, uchar **a, uchar **b);
static int ptr_compare_0(size_t *compare_length, uchar **a, uchar **b);
@ -50,7 +56,7 @@ static int ptr_compare_3(size_t *compare_length, uchar **a, uchar **b);
/* Get a pointer to a optimal byte-compare function for a given size */
#ifdef __sun
#ifdef USE_NATIVE_MEMCMP
qsort2_cmp get_ptr_compare (size_t size __attribute__((unused)))
{
return (qsort2_cmp) native_compare;
@ -68,7 +74,7 @@ qsort2_cmp get_ptr_compare (size_t size)
}
return 0; /* Impossible */
}
#endif /* __sun */
#endif /* USE_NATIVE_MEMCMP */
/*
@ -78,7 +84,7 @@ qsort2_cmp get_ptr_compare (size_t size)
#define cmp(N) if (first[N] != last[N]) return (int) first[N] - (int) last[N]
#ifndef __sun
#ifndef USE_NATIVE_MEMCMP
static int ptr_compare(size_t *compare_length, uchar **a, uchar **b)
{

View file

@ -63,16 +63,16 @@ typedef time_t ib_time_t;
# define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
# define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop")
#elif defined(HAVE_ATOMIC_BUILTINS)
# define UT_RELAX_CPU() do { \
volatile lint volatile_var; \
os_compare_and_swap_lint(&volatile_var, 0, 1); \
} while (0)
#elif defined(HAVE_WINDOWS_ATOMICS)
/* In the Win32 API, the x86 PAUSE instruction is executed by calling
the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
independent way by using YieldProcessor. */
# define UT_RELAX_CPU() YieldProcessor()
#elif defined(HAVE_ATOMIC_BUILTINS)
# define UT_RELAX_CPU() do { \
volatile lint volatile_var; \
os_compare_and_swap_lint(&volatile_var, 0, 1); \
} while (0)
#else
# define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
#endif

View file

@ -63,16 +63,16 @@ typedef time_t ib_time_t;
# define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
# define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop")
#elif defined(HAVE_ATOMIC_BUILTINS)
# define UT_RELAX_CPU() do { \
volatile lint volatile_var; \
os_compare_and_swap_lint(&volatile_var, 0, 1); \
} while (0)
#elif defined(HAVE_WINDOWS_ATOMICS)
/* In the Win32 API, the x86 PAUSE instruction is executed by calling
the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
independent way by using YieldProcessor. */
# define UT_RELAX_CPU() YieldProcessor()
#elif defined(HAVE_ATOMIC_BUILTINS)
# define UT_RELAX_CPU() do { \
volatile lint volatile_var; \
os_compare_and_swap_lint(&volatile_var, 0, 1); \
} while (0)
#else
# define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
#endif