mariadb/mysys/crc32/crc32_x86.c
mysqlonarm dec3f8ca69
MDEV-22641: Provide SIMD optimized wrapper for zlib crc32() (#1558)
The existing implementation used my_checksum() (from mysys)
for calculating table checksums and binlog checksums.

That implementation was optimized for POWER only; for x86 (using
CLMUL) and ARM (using ACLE) it had no SIMD implementation and
fell back to zlib crc32 instead.

mariabackup had its own copy of the crc32 implementation,
hardware-optimized for x86 only, and lacked hardware-based
implementations for POWER and ARM.

This patch unifies all such calls behind a single interface,
my_checksum().

The unification also enables hardware-optimized calls on all three
architectures, viz. x86, ARM and POWER. The default always falls
back to zlib crc32.

Thanks to Daniel Black for reviewing, fixing and testing the
PowerPC changes. Thanks to Marko and Daniel for early code feedback.
2020-06-01 11:34:06 +03:00
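
For context, my_checksum() is the unified entry point this file plugs into. Below is a minimal sketch of a call site, assuming the declaration in include/my_sys.h (ha_checksum is MariaDB's 32-bit checksum type; names here are illustrative, not verbatim MariaDB code):

#include <my_global.h>
#include <my_sys.h>

/* A sketch: checksum a buffer through the unified interface. Dispatch
   to the PCLMUL/ACLE/POWER-optimized code or the zlib fallback happens
   behind my_checksum(). */
static ha_checksum checksum_buf(const uchar *buf, size_t len)
{
  return my_checksum(0, buf, len);  /* 0 = initial CRC, zlib convention */
}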


/******************************************************
Copyright (c) 2017 Percona LLC and/or its affiliates.

CRC32 using Intel's PCLMUL instruction.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*******************************************************/
/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
* Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*
*/
#include <my_global.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
# define U64_C(c) (c ## UL)
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint64_t u64;
#ifndef byte
typedef uint8_t byte;
#endif
# define _gcry_bswap32 __builtin_bswap32
#if __GNUC__ >= 4 && defined(__x86_64__)
#if defined(_GCRY_GCC_VERSION) && _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif
#define ALIGNED_16 __attribute__ ((aligned (16)))
struct u16_unaligned_s
{
u16 a;
} __attribute__((packed, aligned (1), may_alias));
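/* Packed, may_alias wrapper so that 2-byte loads from arbitrary byte
   offsets are well-defined (no alignment or strict-aliasing UB). */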
/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
* functions. */
struct crc32_consts_s
{
/* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
u64 k[6];
/* my_p: { floor(x^64 / P(x)), P(x) } */
u64 my_p[2];
};
/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 =
{
{ /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
U64_C(0x163cd6124), 0 /* y = 2 */
},
{ /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
U64_C(0x1f7011641), U64_C(0x1db710641)
}
};
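/* Derivation note (a sketch): P(x) is the IEEE 802.3 CRC-32 polynomial
   0x104C11DB7, bit-reflected over 33 bits, so my_p[1] = 0x1db710641 =
   (0xEDB88320 << 1) | 1. Folding a 128-bit chunk with
   k[y] = reverse_33bits(x^(32*y) mod P(x)) advances its contribution by
   32*y bits: y = {17, 15} folds across 64 bytes, y = {5, 3} across
   16 bytes, and y = 2 serves the final 96->64 bit reduction. */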
/* Common constants for CRC32 algorithms. */
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
{
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
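/* Indexing this mask at offset inlen (1..15) yields (16 - inlen) zero
   bytes followed by inlen 0xff bytes, selecting exactly the inlen new
   tail bytes of the final unaligned 16-byte load in the partial fold. */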
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
{
{ U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
{ U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
{ U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
{ U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
{ U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
{ U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
{ U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
};
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
{
{ U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
{ U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
{ U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
};
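/* The merge shuffles assemble an inlen-byte tail (5..15 bytes) from two
   overlapping loads into one register: low indices keep the leading 8
   (or 4) bytes, indices >= 8 pick bytes out of the overlapping tail
   load, and 0xff lanes are zeroed by pshufb (mask high bit set). */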
/* PCLMUL functions for reflected CRC32. */
static inline void
crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
const struct crc32_consts_s *consts)
{
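/* Pipeline sketch: XOR the current CRC into the first of four 16-byte
   blocks, fold four blocks per iteration with k1/k2, collapse the four
   accumulators into one with k3/k4, fold any remaining whole blocks one
   at a time, mask-and-fold the partial tail, then reduce
   128 -> 96 -> 64 -> 32 bits (Barrett reduction via my_p). */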
if (inlen >= 8 * 16)
{
asm volatile ("movd %[crc], %%xmm4\n\t"
"movdqu %[inbuf_0], %%xmm0\n\t"
"movdqu %[inbuf_1], %%xmm1\n\t"
"movdqu %[inbuf_2], %%xmm2\n\t"
"movdqu %[inbuf_3], %%xmm3\n\t"
"pxor %%xmm4, %%xmm0\n\t"
:
: [inbuf_0] "m" (inbuf[0 * 16]),
[inbuf_1] "m" (inbuf[1 * 16]),
[inbuf_2] "m" (inbuf[2 * 16]),
[inbuf_3] "m" (inbuf[3 * 16]),
[crc] "m" (*pcrc)
);
inbuf += 4 * 16;
inlen -= 4 * 16;
asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
:
: [k1k2] "m" (consts->k[1 - 1])
);
/* Fold by 4. */
while (inlen >= 4 * 16)
{
asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
"movdqa %%xmm0, %%xmm6\n\t"
"pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
"pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
"pxor %%xmm5, %%xmm0\n\t"
"pxor %%xmm6, %%xmm0\n\t"
"movdqu %[inbuf_1], %%xmm5\n\t"
"movdqa %%xmm1, %%xmm6\n\t"
"pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
"pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
"pxor %%xmm5, %%xmm1\n\t"
"pxor %%xmm6, %%xmm1\n\t"
"movdqu %[inbuf_2], %%xmm5\n\t"
"movdqa %%xmm2, %%xmm6\n\t"
"pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
"pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
"pxor %%xmm5, %%xmm2\n\t"
"pxor %%xmm6, %%xmm2\n\t"
"movdqu %[inbuf_3], %%xmm5\n\t"
"movdqa %%xmm3, %%xmm6\n\t"
"pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
"pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
"pxor %%xmm5, %%xmm3\n\t"
"pxor %%xmm6, %%xmm3\n\t"
:
: [inbuf_0] "m" (inbuf[0 * 16]),
[inbuf_1] "m" (inbuf[1 * 16]),
[inbuf_2] "m" (inbuf[2 * 16]),
[inbuf_3] "m" (inbuf[3 * 16])
);
inbuf += 4 * 16;
inlen -= 4 * 16;
}
asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
"movdqa %[my_p], %%xmm5\n\t"
:
: [k3k4] "m" (consts->k[3 - 1]),
[my_p] "m" (consts->my_p[0])
);
/* Fold 4 to 1. */
asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
"pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
"pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
"pxor %%xmm1, %%xmm0\n\t"
"pxor %%xmm4, %%xmm0\n\t"
"movdqa %%xmm0, %%xmm4\n\t"
"pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
"pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
"pxor %%xmm2, %%xmm0\n\t"
"pxor %%xmm4, %%xmm0\n\t"
"movdqa %%xmm0, %%xmm4\n\t"
"pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
"pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
"pxor %%xmm3, %%xmm0\n\t"
"pxor %%xmm4, %%xmm0\n\t"
:
:
);
}
else
{
asm volatile ("movd %[crc], %%xmm1\n\t"
"movdqu %[inbuf], %%xmm0\n\t"
"movdqa %[k3k4], %%xmm6\n\t"
"pxor %%xmm1, %%xmm0\n\t"
"movdqa %[my_p], %%xmm5\n\t"
:
: [inbuf] "m" (*inbuf),
[crc] "m" (*pcrc),
[k3k4] "m" (consts->k[3 - 1]),
[my_p] "m" (consts->my_p[0])
);
inbuf += 16;
inlen -= 16;
}
/* Fold by 1. */
if (inlen >= 16)
{
while (inlen >= 16)
{
/* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
"movdqa %%xmm0, %%xmm1\n\t"
"pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
"pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
"pxor %%xmm2, %%xmm0\n\t"
"pxor %%xmm1, %%xmm0\n\t"
:
: [inbuf] "m" (*inbuf)
);
inbuf += 16;
inlen -= 16;
}
}
/* Partial fold. */
if (inlen)
{
/* Load last input and add padding zeros. */
asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
"movdqu %[shl_shuf], %%xmm4\n\t"
"movdqu %[mask], %%xmm2\n\t"
"movdqa %%xmm0, %%xmm1\n\t"
"pshufb %%xmm4, %%xmm0\n\t"
"movdqu %[inbuf], %%xmm4\n\t"
"pshufb %%xmm3, %%xmm1\n\t"
"pand %%xmm4, %%xmm2\n\t"
"por %%xmm1, %%xmm2\n\t"
"movdqa %%xmm0, %%xmm1\n\t"
"pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
"pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
"pxor %%xmm2, %%xmm0\n\t"
"pxor %%xmm1, %%xmm0\n\t"
:
: [inbuf] "m" (*(inbuf - 16 + inlen)),
[mask] "m" (crc32_partial_fold_input_mask[inlen]),
[shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
[shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
);
inbuf += inlen;
inlen -= inlen;
}
/* Final fold. */
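/* Barrett reduction sketch: once the data is folded down to 64 bits
   R(x), the quotient q = floor(R(x) / P(x)) is approximated with one
   carry-less multiply by my_p[0] = floor(x^64 / P(x)); XORing
   q * P(x) (via my_p[1]) back in cancels the quotient part and leaves
   R(x) mod P(x), i.e. the 32-bit CRC. */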
asm volatile (/* reduce 128-bits to 96-bits */
"movdqa %%xmm0, %%xmm1\n\t"
"pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
"psrldq $8, %%xmm1\n\t"
"pxor %%xmm1, %%xmm0\n\t"
/* reduce 96-bits to 64-bits */
"pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
"pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
"pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
"pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */
/* barrett reduction */
"pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
"pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
"pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
"pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
"pxor %%xmm1, %%xmm0\n\t"
/* store CRC */
"pextrd $2, %%xmm0, %[out]\n\t"
: [out] "=m" (*pcrc)
: [k5] "m" (consts->k[5 - 1])
);
}
static inline void
crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
const struct crc32_consts_s *consts)
{
if (inlen < 4)
{
u32 crc = *pcrc;
u32 data;
asm volatile ("movdqa %[my_p], %%xmm5\n\t"
:
: [my_p] "m" (consts->my_p[0])
);
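/* For 1..3 bytes, XOR the input against the low-order CRC bytes and
   left-shift it to the top of a 32-bit word; the untouched CRC bytes
   (crc >>= 8 * inlen) are XORed back in after the reduction. */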
if (inlen == 1)
{
data = inbuf[0];
data ^= crc;
data <<= 24;
crc >>= 8;
}
else if (inlen == 2)
{
data = ((const struct u16_unaligned_s *)inbuf)->a;
data ^= crc;
data <<= 16;
crc >>= 16;
}
else
{
data = ((const struct u16_unaligned_s *)inbuf)->a;
data |= ((u32) inbuf[2]) << 16;
data ^= crc;
data <<= 8;
crc >>= 24;
}
/* Barrett reduction */
asm volatile ("movd %[in], %%xmm0\n\t"
"movd %[crc], %%xmm1\n\t"
"pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
"psllq $32, %%xmm1\n\t"
"pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
"pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
"pxor %%xmm1, %%xmm0\n\t"
"pextrd $1, %%xmm0, %[out]\n\t"
: [out] "=m" (*pcrc)
: [in] "rm" (data),
[crc] "rm" (crc)
);
}
else if (inlen == 4)
{
/* Barrett reduction */
asm volatile ("movd %[crc], %%xmm1\n\t"
"movd %[in], %%xmm0\n\t"
"movdqa %[my_p], %%xmm5\n\t"
"pxor %%xmm1, %%xmm0\n\t"
"pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
"pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
"pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
"pextrd $1, %%xmm0, %[out]\n\t"
: [out] "=m" (*pcrc)
: [in] "m" (*inbuf),
[crc] "m" (*pcrc),
[my_p] "m" (consts->my_p[0])
);
}
else
{
asm volatile ("movdqu %[shuf], %%xmm4\n\t"
"movd %[crc], %%xmm1\n\t"
"movdqa %[my_p], %%xmm5\n\t"
"movdqa %[k3k4], %%xmm6\n\t"
:
: [shuf] "m" (crc32_refl_shuf_shift[inlen]),
[crc] "m" (*pcrc),
[my_p] "m" (consts->my_p[0]),
[k3k4] "m" (consts->k[3 - 1])
);
if (inlen >= 8)
{
asm volatile ("movq %[inbuf], %%xmm0\n\t"
:
: [inbuf] "m" (*inbuf)
);
if (inlen > 8)
{
asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
"movq %[inbuf_tail], %%xmm2\n\t"
"punpcklqdq %%xmm2, %%xmm0\n\t"
"pshufb %[merge_shuf], %%xmm0\n\t"
:
: [inbuf_tail] "m" (inbuf[inlen - 8]),
[merge_shuf] "m"
(*crc32_merge9to15_shuf[inlen - 9])
);
}
}
else
{
asm volatile ("movd %[inbuf], %%xmm0\n\t"
"pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
"pshufb %[merge_shuf], %%xmm0\n\t"
:
: [inbuf] "m" (*inbuf),
[inbuf_tail] "m" (inbuf[inlen - 4]),
[merge_shuf] "m"
(*crc32_merge5to7_shuf[inlen - 5])
);
}
/* Final fold. */
asm volatile ("pxor %%xmm1, %%xmm0\n\t"
"pshufb %%xmm4, %%xmm0\n\t"
/* reduce 128-bits to 96-bits */
"movdqa %%xmm0, %%xmm1\n\t"
"pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
"psrldq $8, %%xmm1\n\t"
"pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */
/* reduce 96-bits to 64-bits */
"pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
"pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
"pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
"pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */
/* barrett reduction */
"pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
"pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
"pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
"pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
"pxor %%xmm1, %%xmm0\n\t"
/* store CRC */
"pextrd $2, %%xmm0, %[out]\n\t"
: [out] "=m" (*pcrc)
: [k5] "m" (consts->k[5 - 1])
);
}
}
void
crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
{
const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
char win64tmp[2 * 16];
/* XMM6-XMM7 need to be restored after use. */
asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
"movdqu %%xmm7, 1*16(%0)\n\t"
:
: "r" (win64tmp)
: "memory");
#endif
if (!inlen)
return;
if (inlen >= 16)
crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
else
crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);
#if defined(__x86_64__) && defined(__WIN64__)
/* Restore used registers. */
asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
"movdqu 1*16(%0), %%xmm7\n\t"
:
: "r" (win64tmp)
: "memory");
#endif
}
#ifdef __GNUC__
int crc32_pclmul_enabled(void)
{
int eax, ecx;
/* We assume that the CPUID instruction and its parameter 1 are available.
We do not support any precursors of the Intel 80486. */
asm("cpuid" : "=a"(eax), "=c"(ecx) : "0"(1) : "ebx", "edx");
return !(~ecx & (1 << 19 | 1 << 1));
}
#elif 0 /* defined _MSC_VER */ /* FIXME: implement the pclmul interface */
#include <intrin.h>
int crc32_pclmul_enabled(void)
{
/* We assume that the CPUID instruction and its parameter 1 are available.
We do not support any precursors of the Intel 80486. */
int regs[4];
__cpuid(regs, 1);
return !(~regs[2] & (1 << 19 | 1 << 1));
}
#else
int crc32_pclmul_enabled(void)
{
return 0;
}
#endif
unsigned int crc32_pclmul(unsigned int crc32, const void *buf, size_t len)
{
crc32= ~crc32;
crc32_intel_pclmul(&crc32, buf, len);
return ~crc32;
}
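/* Usage sketch (an assumed caller, not part of this file): select the
   implementation once at startup and fall back to zlib otherwise:

     typedef unsigned int (*crc32_fn)(unsigned int, const void *, size_t);
     crc32_fn crc32_impl= crc32_pclmul_enabled()
                          ? crc32_pclmul
                          : crc32_zlib_wrapper;  (hypothetical zlib fallback)

   Note that crc32_pclmul() follows zlib's crc32() convention: the
   pre/post inversion above converts to and from the internal (inverted)
   form that crc32_intel_pclmul() works with. */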
#endif