From 905c3d61e18ae6222d0d195c43d335046eec65d9 Mon Sep 17 00:00:00 2001 From: Vladislav Vaintroub Date: Sun, 24 Sep 2023 11:20:38 +0200 Subject: [PATCH] MDEV-25870 followup - some Windows ARM64 improvements - optimize atomic store64/load64 implementation. - allow CRC32 optimization. Do not allow pmull yet, as this fails like in https://stackoverflow.com/questions/54048837/how-to-perform-polynomial-multiplication-using-arm64 --- include/atomic/generic-msvc.h | 6 +++--- mysys/CMakeLists.txt | 3 +++ mysys/crc32/crc32_arm64.c | 28 ++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/include/atomic/generic-msvc.h b/include/atomic/generic-msvc.h index ff2a5434071..8a29f560843 100644 --- a/include/atomic/generic-msvc.h +++ b/include/atomic/generic-msvc.h @@ -72,7 +72,7 @@ static inline int64 my_atomic_add64(int64 volatile *a, int64 v) 64-bit Windows. Reads and writes to 64-bit values are not guaranteed to be atomic on 32-bit Windows. - https://msdn.microsoft.com/en-us/library/windows/desktop/ms684122(v=vs.85).aspx + https://learn.microsoft.com/en-us/windows/win32/sync/interlocked-variable-access */ static inline int32 my_atomic_load32(int32 volatile *a) @@ -84,7 +84,7 @@ static inline int32 my_atomic_load32(int32 volatile *a) static inline int64 my_atomic_load64(int64 volatile *a) { -#ifdef _M_X64 +#if defined(_M_X64) || defined(_M_ARM64) int64 value= *a; MemoryBarrier(); return value; @@ -123,7 +123,7 @@ static inline void my_atomic_store32(int32 volatile *a, int32 v) static inline void my_atomic_store64(int64 volatile *a, int64 v) { -#ifdef _M_X64 +#if defined(_M_X64) || defined(_M_ARM64) MemoryBarrier(); *a= v; #else diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt index 758243df10f..8021a2844c8 100644 --- a/mysys/CMakeLists.txt +++ b/mysys/CMakeLists.txt @@ -67,6 +67,9 @@ IF(MSVC_INTEL) IF(CLANG_CL) SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") ENDIF() +ELSEIF(MSVC_ARM64) + SET 
(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c) + ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS) ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686") MY_CHECK_CXX_COMPILER_FLAG(-msse4.2) MY_CHECK_CXX_COMPILER_FLAG(-mpclmul) diff --git a/mysys/crc32/crc32_arm64.c b/mysys/crc32/crc32_arm64.c index 0e70c21812a..79404874d60 100644 --- a/mysys/crc32/crc32_arm64.c +++ b/mysys/crc32/crc32_arm64.c @@ -6,7 +6,22 @@ static int pmull_supported; #if defined(HAVE_ARMV8_CRC) -#if defined(__APPLE__) +#ifdef _WIN32 +#include <windows.h> +int crc32_aarch64_available(void) +{ + return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); +} + +const char *crc32c_aarch64_available(void) +{ + if (crc32_aarch64_available() == 0) + return NULL; + /* TODO : pmull seems supported, but does not compile*/ + return "Using ARMv8 crc32 instructions"; +} + +#elif defined(__APPLE__) #include <sys/sysctl.h> int crc32_aarch64_available(void) @@ -103,7 +118,10 @@ asm(".arch_extension crypto"); #else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ /* Intrinsics header*/ +#ifndef _WIN32 #include <arm_acle.h> +#endif + #include <arm_neon.h> #define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value)) @@ -159,11 +177,11 @@ asm(".arch_extension crypto"); uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) { - uint32_t crc0, crc1, crc2; int64_t length= (int64_t)len; crc^= 0xffffffff; +#ifdef HAVE_ARMV8_CRYPTO /* Pmull runtime check here. * Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030). 
* @@ -174,8 +192,8 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) */ if (pmull_supported) { + uint32_t crc0, crc1, crc2; /* The following Macro (HAVE_ARMV8_CRYPTO) is used for compiling check */ -#ifdef HAVE_ARMV8_CRYPTO /* Crypto extension Support * Parallel computation with 1024 Bytes (per block) @@ -277,10 +295,8 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) /* Done if Input data size is aligned with 1024 */ if (!(length+= 1024)) return ~crc; - -#endif /* HAVE_ARMV8_CRYPTO */ - } // end if pmull_supported +#endif /* HAVE_ARMV8_CRYPTO */ while ((length-= sizeof(uint64_t)) >= 0) {