Mirror of https://github.com/MariaDB/server.git
MDEV-34699 - mhnsw: support aarch64 SIMD instructions
SIMD implementations of bloom filters and dot product calculation.

A microbenchmark shows a 1.7x dot product performance improvement compared to
regular -O2/-O3 builds and 2.4x compared to builds with auto-vectorization
disabled. The performance improvement (microbenchmark) for bloom filters is
less exciting, in the 10-30% range depending on compiler options and load.

Misc implementation notes:
- CalcHash: no _mm256_shuffle_epi8(), use explicit XOR/shift.
- CalcHash: no 64-bit multiplication, do scalar multiplication.
- ConstructMask/Query: no _mm256_i64gather_epi64, access array elements explicitly.
- Query: no _mm256_movemask_epi8, accumulate bits manually.

Closes #3671
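To make the last two notes concrete, here is a small standalone sketch (not code from this patch; the helper names are invented) of the two NEON patterns involved, assuming an aarch64 target:

#include <arm_neon.h>
#include <cstdint>

/* Illustration only: hypothetical helpers showing two of the workarounds
   used by the NEON code in this patch. */

/* No 64x64-bit vector multiply on NEON: extract each lane, multiply as a
   scalar, put the result back (cf. CalcHash in the diff below). */
static inline uint64x2_t mul_u64_lanes(uint64x2_t v, uint64_t k)
{
  v= vsetq_lane_u64(vgetq_lane_u64(v, 0) * k, v, 0);
  v= vsetq_lane_u64(vgetq_lane_u64(v, 1) * k, v, 1);
  return v;
}

/* No _mm256_movemask_epi8 equivalent: vceqq_u64 yields all-ones or all-zeros
   per lane, so AND each lane with a distinct bit and OR the pieces together
   (cf. Query in the diff below, which does this for eight lanes). */
static inline uint8_t movemask_2x64(uint64x2_t a, uint64x2_t b)
{
  uint64x2_t cmp= vceqq_u64(a, b);
  return (uint8_t) ((vgetq_lane_u64(cmp, 0) & 0x01) |
                    (vgetq_lane_u64(cmp, 1) & 0x02));
}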
parent cacaaebf01
commit 11a6c1b30a

2 changed files with 208 additions and 3 deletions
@@ -28,9 +28,18 @@ SOFTWARE.
#include <cmath>
#include <vector>
#include <algorithm>

/*
  Use gcc function multiversioning to optimize for a specific CPU with run-time
  detection. Works only for x86, for other architectures we provide only one
  implementation for now.
*/
#define DEFAULT_IMPLEMENTATION
#if __GNUC__ > 7
#ifdef __x86_64__
#ifdef HAVE_IMMINTRIN_H
#include <immintrin.h>
#if __GNUC__ > 7 && defined __x86_64__
#undef DEFAULT_IMPLEMENTATION
#define DEFAULT_IMPLEMENTATION __attribute__ ((target ("default")))
#define AVX2_IMPLEMENTATION __attribute__ ((target ("avx2,avx,fma")))
#if __GNUC__ > 9
@@ -38,8 +47,11 @@ SOFTWARE.
#endif
#endif
#endif
#ifndef DEFAULT_IMPLEMENTATION
#define DEFAULT_IMPLEMENTATION
#ifdef __aarch64__
#include <arm_neon.h>
#undef DEFAULT_IMPLEMENTATION
#define NEON_IMPLEMENTATION
#endif
#endif

template <typename T>
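As background for the comment in this hunk, a minimal sketch of gcc function multiversioning (x86 only; not code from this patch, and the function name is invented): the compiler emits every version plus a resolver that picks one at run time, which is what the DEFAULT_IMPLEMENTATION/AVX2_IMPLEMENTATION macros expand to.

/* Illustration only, x86-specific: two versions of the same function with
   target attributes; the generated resolver selects one at run time. */
__attribute__ ((target ("default")))
int resolve_me() { return 0; }   /* generic fallback, always present */

__attribute__ ((target ("avx2")))
int resolve_me() { return 2; }   /* chosen on AVX2-capable CPUs */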
@@ -177,9 +189,157 @@ struct PatternedSimdBloomFilter
  basically, unnoticeable, well below the noise level */
#endif

#ifdef NEON_IMPLEMENTATION
  uint64x2_t CalcHash(uint64x2_t vecData)
  {
    static constexpr uint64_t prime_mx2= 0x9FB21C651E98DF25ULL;
    static constexpr uint64_t bitflip= 0xC73AB174C5ECD5A2ULL;
    uint64x2_t step1= veorq_u64(vecData, vdupq_n_u64(bitflip));
    uint64x2_t step2= veorq_u64(vshrq_n_u64(step1, 48), vshlq_n_u64(step1, 16));
    uint64x2_t step3= veorq_u64(vshrq_n_u64(step1, 24), vshlq_n_u64(step1, 40));
    uint64x2_t step4= veorq_u64(step1, veorq_u64(step2, step3));
    uint64x2_t step5;
    step5= vsetq_lane_u64(vgetq_lane_u64(step4, 0) * prime_mx2, step4, 0);
    step5= vsetq_lane_u64(vgetq_lane_u64(step4, 1) * prime_mx2, step5, 1);
    uint64x2_t step6= vshrq_n_u64(step5, 35);
    uint64x2_t step7= vaddq_u64(step6, vdupq_n_u64(8));
    uint64x2_t step8= veorq_u64(step5, step7);
    uint64x2_t step9;
    step9= vsetq_lane_u64(vgetq_lane_u64(step8, 0) * prime_mx2, step8, 0);
    step9= vsetq_lane_u64(vgetq_lane_u64(step8, 1) * prime_mx2, step9, 1);
    return veorq_u64(step9, vshrq_n_u64(step9, 28));
  }

  uint64x2_t GetBlockIdx(uint64x2_t vecHash)
  {
    uint64x2_t vecNumBlocksMask= vdupq_n_u64(num_blocks - 1);
    uint64x2_t vecBlockIdx= vshrq_n_u64(vecHash, mask_idx_bits + rotate_bits);
    return vandq_u64(vecBlockIdx, vecNumBlocksMask);
  }

  uint64x2_t ConstructMask(uint64x2_t vecHash)
  {
    uint64x2_t vecMaskIdxMask= vdupq_n_u64((1 << mask_idx_bits) - 1);
    uint64x2_t vecMaskMask= vdupq_n_u64((1ull << bits_per_mask) - 1);

    uint64x2_t vecMaskIdx= vandq_u64(vecHash, vecMaskIdxMask);
    uint64x2_t vecMaskByteIdx= vshrq_n_u64(vecMaskIdx, 3);
    /*
      Shift right in NEON is implemented as shift left by a negative value.
      Do the negation here.
    */
    int64x2_t vecMaskBitIdx=
      vsubq_s64(vdupq_n_s64(0),
                vreinterpretq_s64_u64(vandq_u64(vecMaskIdx, vdupq_n_u64(0x7))));
    uint64x2_t vecRawMasks= vdupq_n_u64(*reinterpret_cast<const uint64_t*>
                              (masks + vgetq_lane_u64(vecMaskByteIdx, 0)));
    vecRawMasks= vsetq_lane_u64(*reinterpret_cast<const uint64_t*>
                   (masks + vgetq_lane_u64(vecMaskByteIdx, 1)), vecRawMasks, 1);
    uint64x2_t vecUnrotated=
      vandq_u64(vshlq_u64(vecRawMasks, vecMaskBitIdx), vecMaskMask);

    int64x2_t vecRotation=
      vreinterpretq_s64_u64(vandq_u64(vshrq_n_u64(vecHash, mask_idx_bits),
                                      vdupq_n_u64((1 << rotate_bits) - 1)));
    uint64x2_t vecShiftUp= vshlq_u64(vecUnrotated, vecRotation);
    uint64x2_t vecShiftDown=
      vshlq_u64(vecUnrotated, vsubq_s64(vecRotation, vdupq_n_s64(64)));
    return vorrq_u64(vecShiftDown, vecShiftUp);
  }

  void Insert(const T **data)
  {
    uint64x2_t vecDataA= vld1q_u64(reinterpret_cast<uint64_t *>(data + 0));
    uint64x2_t vecDataB= vld1q_u64(reinterpret_cast<uint64_t *>(data + 2));
    uint64x2_t vecDataC= vld1q_u64(reinterpret_cast<uint64_t *>(data + 4));
    uint64x2_t vecDataD= vld1q_u64(reinterpret_cast<uint64_t *>(data + 6));

    uint64x2_t vecHashA= CalcHash(vecDataA);
    uint64x2_t vecHashB= CalcHash(vecDataB);
    uint64x2_t vecHashC= CalcHash(vecDataC);
    uint64x2_t vecHashD= CalcHash(vecDataD);

    uint64x2_t vecMaskA= ConstructMask(vecHashA);
    uint64x2_t vecMaskB= ConstructMask(vecHashB);
    uint64x2_t vecMaskC= ConstructMask(vecHashC);
    uint64x2_t vecMaskD= ConstructMask(vecHashD);

    uint64x2_t vecBlockIdxA= GetBlockIdx(vecHashA);
    uint64x2_t vecBlockIdxB= GetBlockIdx(vecHashB);
    uint64x2_t vecBlockIdxC= GetBlockIdx(vecHashC);
    uint64x2_t vecBlockIdxD= GetBlockIdx(vecHashD);

    uint64_t block0= vgetq_lane_u64(vecBlockIdxA, 0);
    uint64_t block1= vgetq_lane_u64(vecBlockIdxA, 1);
    uint64_t block2= vgetq_lane_u64(vecBlockIdxB, 0);
    uint64_t block3= vgetq_lane_u64(vecBlockIdxB, 1);
    uint64_t block4= vgetq_lane_u64(vecBlockIdxC, 0);
    uint64_t block5= vgetq_lane_u64(vecBlockIdxC, 1);
    uint64_t block6= vgetq_lane_u64(vecBlockIdxD, 0);
    uint64_t block7= vgetq_lane_u64(vecBlockIdxD, 1);

    bv[block0]|= vgetq_lane_u64(vecMaskA, 0);
    bv[block1]|= vgetq_lane_u64(vecMaskA, 1);
    bv[block2]|= vgetq_lane_u64(vecMaskB, 0);
    bv[block3]|= vgetq_lane_u64(vecMaskB, 1);
    bv[block4]|= vgetq_lane_u64(vecMaskC, 0);
    bv[block5]|= vgetq_lane_u64(vecMaskC, 1);
    bv[block6]|= vgetq_lane_u64(vecMaskD, 0);
    bv[block7]|= vgetq_lane_u64(vecMaskD, 1);
  }

  uint8_t Query(T **data)
  {
    uint64x2_t vecDataA= vld1q_u64(reinterpret_cast<uint64_t *>(data + 0));
    uint64x2_t vecDataB= vld1q_u64(reinterpret_cast<uint64_t *>(data + 2));
    uint64x2_t vecDataC= vld1q_u64(reinterpret_cast<uint64_t *>(data + 4));
    uint64x2_t vecDataD= vld1q_u64(reinterpret_cast<uint64_t *>(data + 6));

    uint64x2_t vecHashA= CalcHash(vecDataA);
    uint64x2_t vecHashB= CalcHash(vecDataB);
    uint64x2_t vecHashC= CalcHash(vecDataC);
    uint64x2_t vecHashD= CalcHash(vecDataD);

    uint64x2_t vecMaskA= ConstructMask(vecHashA);
    uint64x2_t vecMaskB= ConstructMask(vecHashB);
    uint64x2_t vecMaskC= ConstructMask(vecHashC);
    uint64x2_t vecMaskD= ConstructMask(vecHashD);

    uint64x2_t vecBlockIdxA= GetBlockIdx(vecHashA);
    uint64x2_t vecBlockIdxB= GetBlockIdx(vecHashB);
    uint64x2_t vecBlockIdxC= GetBlockIdx(vecHashC);
    uint64x2_t vecBlockIdxD= GetBlockIdx(vecHashD);

    uint64x2_t vecBloomA= vdupq_n_u64(bv[vgetq_lane_u64(vecBlockIdxA, 0)]);
    vecBloomA= vsetq_lane_u64(bv[vgetq_lane_u64(vecBlockIdxA, 1)], vecBloomA, 1);
    uint64x2_t vecBloomB= vdupq_n_u64(bv[vgetq_lane_u64(vecBlockIdxB, 0)]);
    vecBloomB= vsetq_lane_u64(bv[vgetq_lane_u64(vecBlockIdxB, 1)], vecBloomB, 1);
    uint64x2_t vecBloomC= vdupq_n_u64(bv[vgetq_lane_u64(vecBlockIdxC, 0)]);
    vecBloomC= vsetq_lane_u64(bv[vgetq_lane_u64(vecBlockIdxC, 1)], vecBloomC, 1);
    uint64x2_t vecBloomD= vdupq_n_u64(bv[vgetq_lane_u64(vecBlockIdxD, 0)]);
    vecBloomD= vsetq_lane_u64(bv[vgetq_lane_u64(vecBlockIdxD, 1)], vecBloomD, 1);

    uint64x2_t vecCmpA= vceqq_u64(vandq_u64(vecMaskA, vecBloomA), vecMaskA);
    uint64x2_t vecCmpB= vceqq_u64(vandq_u64(vecMaskB, vecBloomB), vecMaskB);
    uint64x2_t vecCmpC= vceqq_u64(vandq_u64(vecMaskC, vecBloomC), vecMaskC);
    uint64x2_t vecCmpD= vceqq_u64(vandq_u64(vecMaskD, vecBloomD), vecMaskD);

    return
      (vgetq_lane_u64(vecCmpA, 0) & 0x01) |
      (vgetq_lane_u64(vecCmpA, 1) & 0x02) |
      (vgetq_lane_u64(vecCmpB, 0) & 0x04) |
      (vgetq_lane_u64(vecCmpB, 1) & 0x08) |
      (vgetq_lane_u64(vecCmpC, 0) & 0x10) |
      (vgetq_lane_u64(vecCmpC, 1) & 0x20) |
      (vgetq_lane_u64(vecCmpD, 0) & 0x40) |
      (vgetq_lane_u64(vecCmpD, 1) & 0x80);
  }
#endif

  /********************************************************
  ********* non-SIMD fallback version ********************/

#ifdef DEFAULT_IMPLEMENTATION
  uint64_t CalcHash_1(const T* data)
  {
    static constexpr uint64_t prime_mx2= 0x9FB21C651E98DF25ULL;
@@ -240,6 +400,7 @@ struct PatternedSimdBloomFilter
    }
    return res_bits;
  }
#endif

  int n;
  float epsilon;
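As an aside on the comment inside ConstructMask() above ("shift right is a shift left by a negative value"), here is a minimal standalone sketch of that trick. It is not code from the patch; the helper name is invented, and it assumes an aarch64 target with per-lane shift counts in the range [0, 63]:

#include <arm_neon.h>
#include <cstdint>

/* Illustration only: per-lane "v >> n" built from vshlq_u64, which shifts
   left by a signed per-lane amount; a negative amount shifts right. */
static inline uint64x2_t shr_var_u64(uint64x2_t v, uint64x2_t n)
{
  int64x2_t neg_n= vnegq_s64(vreinterpretq_s64_u64(n));  /* negate counts */
  return vshlq_u64(v, neg_n);                            /* shift by -n */
}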
@@ -188,7 +188,50 @@ struct FVector
  }
#endif


  /*
    ARM NEON implementation. A microbenchmark shows 1.7x dot_product()
    performance improvement compared to regular -O2/-O3 builds and 2.4x
    compared to builds with auto-vectorization disabled.

    There seems to be no performance difference between vmull+vmull_high and
    vmull+vmlal2_high implementations.
  */

#ifdef NEON_IMPLEMENTATION
  static constexpr size_t NEON_bytes= 128 / 8;
  static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t);

  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
  {
    int64_t d= 0;
    for (size_t i= 0; i < (len + NEON_dims - 1) / NEON_dims; i++)
    {
      int16x8_t p1= vld1q_s16(v1);
      int16x8_t p2= vld1q_s16(v2);
      d+= vaddlvq_s32(vmull_s16(vget_low_s16(p1), vget_low_s16(p2))) +
          vaddlvq_s32(vmull_high_s16(p1, p2));
      v1+= NEON_dims;
      v2+= NEON_dims;
    }
    return static_cast<float>(d);
  }

  static size_t alloc_size(size_t n)
  { return alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; }

  static FVector *align_ptr(void *ptr)
  { return (FVector*) (MY_ALIGN(((intptr) ptr) + alloc_header, NEON_bytes)
                       - alloc_header); }

  void fix_tail(size_t vec_len)
  {
    bzero(dims + vec_len, (MY_ALIGN(vec_len, NEON_dims) - vec_len) * 2);
  }
#endif

  /************* no-SIMD default ******************************************/
#ifdef DEFAULT_IMPLEMENTATION
  DEFAULT_IMPLEMENTATION
  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
  {
@@ -206,6 +249,7 @@ struct FVector

  DEFAULT_IMPLEMENTATION
  void fix_tail(size_t) { }
#endif

  float distance_to(const FVector *other, size_t vec_len) const
  {
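For comparison with the dot_product() comment above, here is a rough sketch of the vmull+vmlal2 (SMULL/SMLAL2) variant it mentions. This is not code from the patch; the function name is invented, and it assumes len is a multiple of 8 and that per-iteration lane sums stay within int32 range:

#include <arm_neon.h>
#include <cstdint>
#include <cstddef>

/* Illustration only: multiply the low halves, accumulate the high-half
   products on top, then do one widening horizontal add per iteration. */
static int64_t dot_product_vmlal(const int16_t *v1, const int16_t *v2,
                                 size_t len)
{
  int64_t d= 0;
  for (size_t i= 0; i < len; i+= 8)
  {
    int16x8_t p1= vld1q_s16(v1 + i);
    int16x8_t p2= vld1q_s16(v2 + i);
    int32x4_t prod= vmull_s16(vget_low_s16(p1), vget_low_s16(p2));
    prod= vmlal_high_s16(prod, p1, p2);   /* add high-half products */
    d+= vaddlvq_s32(prod);                /* widen and sum 4 lanes */
  }
  return d;
}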