MDEV-25870 followup - some Windows ARM64 improvements

- optimize atomic store64/load64 implementation.
- allow CRC32 optimization. Do not allow pmull yet, as this fails like in
  https://stackoverflow.com/questions/54048837/how-to-perform-polynomial-multiplication-using-arm64
This commit is contained in:
Vladislav Vaintroub 2023-09-24 11:20:38 +02:00
parent e9573c0596
commit 905c3d61e1
3 changed files with 28 additions and 9 deletions

View file

@ -72,7 +72,7 @@ static inline int64 my_atomic_add64(int64 volatile *a, int64 v)
64-bit Windows. Reads and writes to 64-bit values are not guaranteed to be 64-bit Windows. Reads and writes to 64-bit values are not guaranteed to be
atomic on 32-bit Windows. atomic on 32-bit Windows.
https://msdn.microsoft.com/en-us/library/windows/desktop/ms684122(v=vs.85).aspx https://learn.microsoft.com/en-us/windows/win32/sync/interlocked-variable-access
*/ */
static inline int32 my_atomic_load32(int32 volatile *a) static inline int32 my_atomic_load32(int32 volatile *a)
@ -84,7 +84,7 @@ static inline int32 my_atomic_load32(int32 volatile *a)
static inline int64 my_atomic_load64(int64 volatile *a) static inline int64 my_atomic_load64(int64 volatile *a)
{ {
#ifdef _M_X64 #if defined(_M_X64) || defined(_M_ARM64)
int64 value= *a; int64 value= *a;
MemoryBarrier(); MemoryBarrier();
return value; return value;
@ -123,7 +123,7 @@ static inline void my_atomic_store32(int32 volatile *a, int32 v)
static inline void my_atomic_store64(int64 volatile *a, int64 v) static inline void my_atomic_store64(int64 volatile *a, int64 v)
{ {
#ifdef _M_X64 #if defined(_M_X64) || defined(_M_ARM64)
MemoryBarrier(); MemoryBarrier();
*a= v; *a= v;
#else #else

View file

@ -67,6 +67,9 @@ IF(MSVC_INTEL)
IF(CLANG_CL) IF(CLANG_CL)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF() ENDIF()
ELSEIF(MSVC_ARM64)
SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c)
ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686") ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")
MY_CHECK_CXX_COMPILER_FLAG(-msse4.2) MY_CHECK_CXX_COMPILER_FLAG(-msse4.2)
MY_CHECK_CXX_COMPILER_FLAG(-mpclmul) MY_CHECK_CXX_COMPILER_FLAG(-mpclmul)

View file

@ -6,7 +6,22 @@ static int pmull_supported;
#if defined(HAVE_ARMV8_CRC) #if defined(HAVE_ARMV8_CRC)
#if defined(__APPLE__) #ifdef _WIN32
#include <windows.h>
int crc32_aarch64_available(void)
{
return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
}
const char *crc32c_aarch64_available(void)
{
if (crc32_aarch64_available() == 0)
return NULL;
/* TODO : pmull seems supported, but does not compile*/
return "Using ARMv8 crc32 instructions";
}
#elif defined(__APPLE__)
#include <sys/sysctl.h> #include <sys/sysctl.h>
int crc32_aarch64_available(void) int crc32_aarch64_available(void)
@ -103,7 +118,10 @@ asm(".arch_extension crypto");
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ #else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
/* Intrinsics header*/ /* Intrinsics header*/
#ifndef _WIN32
#include <arm_acle.h> #include <arm_acle.h>
#endif
#include <arm_neon.h> #include <arm_neon.h>
#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value)) #define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
@ -159,11 +177,11 @@ asm(".arch_extension crypto");
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
{ {
uint32_t crc0, crc1, crc2;
int64_t length= (int64_t)len; int64_t length= (int64_t)len;
crc^= 0xffffffff; crc^= 0xffffffff;
#ifdef HAVE_ARMV8_CRYPTO
/* Pmull runtime check here. /* Pmull runtime check here.
* Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030). * Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).
* *
@ -174,8 +192,8 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
*/ */
if (pmull_supported) if (pmull_supported)
{ {
uint32_t crc0, crc1, crc2;
/* The following Macro (HAVE_ARMV8_CRYPTO) is used for compiling check */ /* The following Macro (HAVE_ARMV8_CRYPTO) is used for compiling check */
#ifdef HAVE_ARMV8_CRYPTO
/* Crypto extension Support /* Crypto extension Support
* Parallel computation with 1024 Bytes (per block) * Parallel computation with 1024 Bytes (per block)
@ -277,10 +295,8 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
/* Done if Input data size is aligned with 1024 */ /* Done if Input data size is aligned with 1024 */
if (!(length+= 1024)) if (!(length+= 1024))
return ~crc; return ~crc;
#endif /* HAVE_ARMV8_CRYPTO */
} // end if pmull_supported } // end if pmull_supported
#endif /* HAVE_ARMV8_CRYPTO */
while ((length-= sizeof(uint64_t)) >= 0) while ((length-= sizeof(uint64_t)) >= 0)
{ {