mirror of https://github.com/MariaDB/server.git, synced 2025-01-30 10:31:54 +01:00
dec3f8ca69
The existing implementation used my_checksum() (from mysys) for calculating table and binlog checksums. It was hardware-optimized only for POWER; on x86 (PCLMUL) and ARM (ACLE) it lacked SIMD implementations and fell back to zlib crc32. mariabackup carried its own copy of the crc32 code, hardware-optimized only for x86 and lacking hardware-accelerated implementations for POWER and ARM. This patch unifies all such calls behind a single interface, my_checksum(), which enables hardware-optimized code on every architecture: x86, ARM, and POWER. The default always falls back to zlib crc32. Thanks to Daniel Black for reviewing, fixing and testing the PowerPC changes. Thanks to Marko and Daniel for early code feedback.
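For context, a minimal sketch of the unified call site this commit describes, assuming a mysys-style declaration ha_checksum my_checksum(ha_checksum crc, const void *buf, size_t len); the exact typedef and header layout may differ between branches:

    #include <my_global.h>
    #include <my_sys.h>

    /* Hypothetical call site: table, binlog and mariabackup checksums
       all funnel through my_checksum(), which dispatches at runtime to
       PCLMUL (x86), ACLE (ARM) or POWER vector code, or falls back to
       zlib crc32. */
    static uint32 page_checksum(const uchar *page, size_t len)
    {
      return (uint32) my_checksum(0, page, len);
    }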
545 lines
15 KiB
C
/******************************************************
Copyright (c) 2017 Percona LLC and/or its affiliates.

CRC32 using Intel's PCLMUL instruction.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*******************************************************/

/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
 *
 */

#include <my_global.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

# define U64_C(c) (c ## UL)

typedef uint32_t u32;
typedef uint16_t u16;
typedef uint64_t u64;
#ifndef byte
typedef uint8_t byte;
#endif

# define _gcry_bswap32 __builtin_bswap32

#if __GNUC__ >= 4 && defined(__x86_64__)

#if defined(_GCRY_GCC_VERSION) && _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif

#define ALIGNED_16 __attribute__ ((aligned (16)))

struct u16_unaligned_s
{
  u16 a;
} __attribute__((packed, aligned (1), may_alias));

/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
 * functions. */
struct crc32_consts_s
{
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  u64 k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  u64 my_p[2];
};

/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 =
{
  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
    U64_C(0x163cd6124), 0                   /* y = 2 */
  },
  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
    U64_C(0x1f7011641), U64_C(0x1db710641)
  }
};
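/* These k[] values drive the standard carry-less folding scheme for
   reflected CRC32 (cf. Intel's white paper on fast CRC computation
   using PCLMULQDQ): multiplying a 128-bit chunk by a precomputed
   x^(32*y) mod P(x) constant and XORing the product into data further
   along the buffer shortens the message without changing its CRC.
   my_p[] supplies the constants for the final Barrett reduction down
   to a 32-bit remainder. */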

/* Common constants for CRC32 algorithms. */
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
  {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
  {
    { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
    { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
    { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
    { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
    { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
    { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
    { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
  };
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
  {
    { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
    { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
    { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
  };

/* PCLMUL functions for reflected CRC32. */
static inline void
crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
                      const struct crc32_consts_s *consts)
{
  if (inlen >= 8 * 16)
    {
      asm volatile ("movd %[crc], %%xmm4\n\t"
                    "movdqu %[inbuf_0], %%xmm0\n\t"
                    "movdqu %[inbuf_1], %%xmm1\n\t"
                    "movdqu %[inbuf_2], %%xmm2\n\t"
                    "movdqu %[inbuf_3], %%xmm3\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    : [inbuf_0] "m" (inbuf[0 * 16]),
                      [inbuf_1] "m" (inbuf[1 * 16]),
                      [inbuf_2] "m" (inbuf[2 * 16]),
                      [inbuf_3] "m" (inbuf[3 * 16]),
                      [crc] "m" (*pcrc)
                    );

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
                    :
                    : [k1k2] "m" (consts->k[1 - 1])
                    );

      /* Fold by 4. */
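      /* Four independent 128-bit lanes live in XMM0..XMM3.  Each pass
         multiplies a lane by the k1/k2 constants (selected by the
         pclmulqdq immediate: 0x00 = low 64-bit halves, 0x11 = high
         halves), XORs in the input block 64 bytes ahead, and so
         consumes 64 bytes per iteration. */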
      while (inlen >= 4 * 16)
        {
          asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
                        "movdqa %%xmm0, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm0\n\t"
                        "pxor %%xmm6, %%xmm0\n\t"

                        "movdqu %[inbuf_1], %%xmm5\n\t"
                        "movdqa %%xmm1, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "pxor %%xmm6, %%xmm1\n\t"

                        "movdqu %[inbuf_2], %%xmm5\n\t"
                        "movdqa %%xmm2, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "pxor %%xmm6, %%xmm2\n\t"

                        "movdqu %[inbuf_3], %%xmm5\n\t"
                        "movdqa %%xmm3, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "pxor %%xmm6, %%xmm3\n\t"
                        :
                        : [inbuf_0] "m" (inbuf[0 * 16]),
                          [inbuf_1] "m" (inbuf[1 * 16]),
                          [inbuf_2] "m" (inbuf[2 * 16]),
                          [inbuf_3] "m" (inbuf[3 * 16])
                        );

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }

      asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      /* Fold 4 to 1. */

      asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm3, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    :
                    );
    }
  else
    {
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [inbuf] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      inbuf += 16;
      inlen -= 16;
    }

  /* Fold by 1. */
  if (inlen >= 16)
    {
      while (inlen >= 16)
        {
          /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
          asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
                        "movdqa %%xmm0, %%xmm1\n\t"
                        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                        "pxor %%xmm2, %%xmm0\n\t"
                        "pxor %%xmm1, %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );

          inbuf += 16;
          inlen -= 16;
        }
    }

  /* Partial fold. */
  if (inlen)
    {
      /* Load last input and add padding zeros. */
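      /* The 1..15 tail bytes are handled without a byte loop: the last
         full 16-byte window is re-read at (inbuf - 16 + inlen), the
         stale bytes are masked off, and pshufb splices the shifted
         running remainder with the fresh input before one more fold. */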
      asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
                    "movdqu %[shl_shuf], %%xmm4\n\t"
                    "movdqu %[mask], %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf], %%xmm4\n\t"
                    "pshufb %%xmm3, %%xmm1\n\t"
                    "pand %%xmm4, %%xmm2\n\t"
                    "por %%xmm1, %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    :
                    : [inbuf] "m" (*(inbuf - 16 + inlen)),
                      [mask] "m" (crc32_partial_fold_input_mask[inlen]),
                      [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
                    );

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */
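  /* Shrink the 128-bit remainder to 96 and then 64 bits, and finish
     with a Barrett reduction: multiply by floor(x^64 / P(x)) to
     estimate the quotient, multiply that back by P(x), and XOR to
     leave the 32-bit remainder, which is the CRC. */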
  asm volatile (/* reduce 128-bits to 96-bits */
                "movdqa %%xmm0, %%xmm1\n\t"
                "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                "psrldq $8, %%xmm1\n\t"
                "pxor %%xmm1, %%xmm0\n\t"

                /* reduce 96-bits to 64-bits */
                "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
                "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
                "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */

                /* barrett reduction */
                "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
                "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
                "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pxor %%xmm1, %%xmm0\n\t"

                /* store CRC */
                "pextrd $2, %%xmm0, %[out]\n\t"
                : [out] "=m" (*pcrc)
                : [k5] "m" (consts->k[5 - 1])
                );
}

static inline void
crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                              const struct crc32_consts_s *consts)
{
  if (inlen < 4)
    {
      u32 crc = *pcrc;
      u32 data;

      asm volatile ("movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [my_p] "m" (consts->my_p[0])
                    );

      if (inlen == 1)
        {
          data = inbuf[0];
          data ^= crc;
          data <<= 24;
          crc >>= 8;
        }
      else if (inlen == 2)
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data ^= crc;
          data <<= 16;
          crc >>= 16;
        }
      else
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data |= ((u32) inbuf[2]) << 16;
          data ^= crc;
          data <<= 8;
          crc >>= 24;
        }

      /* Barrett reduction */
      asm volatile ("movd %[in], %%xmm0\n\t"
                    "movd %[crc], %%xmm1\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "psllq $32, %%xmm1\n\t"
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "rm" (data),
                      [crc] "rm" (crc)
                    );
    }
  else if (inlen == 4)
    {
      /* Barrett reduction */
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movd %[in], %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0])
                    );
    }
  else
    {
      asm volatile ("movdqu %[shuf], %%xmm4\n\t"
                    "movd %[crc], %%xmm1\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    :
                    : [shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0]),
                      [k3k4] "m" (consts->k[3 - 1])
                    );

      if (inlen >= 8)
        {
          asm volatile ("movq %[inbuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );
          if (inlen > 8)
            {
              asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
                            "movq %[inbuf_tail], %%xmm2\n\t"
                            "punpcklqdq %%xmm2, %%xmm0\n\t"
                            "pshufb %[merge_shuf], %%xmm0\n\t"
                            :
                            : [inbuf_tail] "m" (inbuf[inlen - 8]),
                              [merge_shuf] "m"
                              (*crc32_merge9to15_shuf[inlen - 9])
                            );
            }
        }
      else
        {
          asm volatile ("movd %[inbuf], %%xmm0\n\t"
                        "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
                        "pshufb %[merge_shuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf),
                          [inbuf_tail] "m" (inbuf[inlen - 4]),
                          [merge_shuf] "m"
                          (*crc32_merge5to7_shuf[inlen - 5])
                        );
        }

      /* Final fold. */
      asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"

                    /* reduce 128-bits to 96-bits */
                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                    "psrldq $8, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */

                    /* reduce 96-bits to 64-bits */
                    "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
                    "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
                    "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */

                    /* barrett reduction */
                    "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
                    "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
                    "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    /* store CRC */
                    "pextrd $2, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [k5] "m" (consts->k[5 - 1])
                    );
    }
}

void
crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory");
#endif

  if (!inlen)
    return;

  if (inlen >= 16)
    crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
               "movdqu 1*16(%0), %%xmm7\n\t"
               :
               : "r" (win64tmp)
               : "memory");
#endif
}

#ifdef __GNUC__
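/* CPUID leaf 1 reports PCLMULQDQ support in ECX bit 1 and SSE4.1
   (needed for pextrd/pinsrd above) in ECX bit 19; both bits must be
   set for crc32_intel_pclmul() to be usable. */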
int crc32_pclmul_enabled(void)
{
  int eax, ecx;
  /* We assume that the CPUID instruction and its parameter 1 are available.
     We do not support any precursors of the Intel 80486. */
  asm("cpuid" : "=a"(eax), "=c"(ecx) : "0"(1) : "ebx", "edx");
  return !(~ecx & (1 << 19 | 1 << 1));
}
#elif 0 /* defined _MSC_VER */ /* FIXME: implement the pclmul interface */
#include <intrin.h>
int crc32_pclmul_enabled(void)
{
  /* We assume that the CPUID instruction and its parameter 1 are available.
     We do not support any precursors of the Intel 80486. */
  int regs[4];
  __cpuid(regs, 1);
  return !(~regs[2] & (1 << 19 | 1 << 1));
}
#else
int crc32_pclmul_enabled(void)
{
  return 0;
}
#endif

unsigned int crc32_pclmul(unsigned int crc32, const void *buf, size_t len)
{
  crc32= ~crc32;
  crc32_intel_pclmul(&crc32, buf, len);
  return ~crc32;
}
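/* Like zlib's crc32(), the wrapper above inverts the CRC on entry and
   exit, so callers pass and receive the conventional post-inverted
   value.  A sketch of the intended call pattern (caller-chosen names):

     unsigned int crc = 0;
     if (crc32_pclmul_enabled())
       crc = crc32_pclmul(crc, buf, len);
*/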
#endif