Mirror of https://github.com/MariaDB/server.git
Synced 2025-01-29 02:05:57 +01:00
f224525204
The -Wconversion in GCC seems to be stricter than in clang. GCC at least since version 4.4.7 issues truncation warnings for assignments to bitfields, while clang 10 appears to only issue warnings when the sizes in bytes rounded to the nearest integer powers of 2 are different.

Before GCC 10.0.0, -Wconversion required more casts and would not allow some operations, such as x<<=1 or x+=1 on a data type that is narrower than int.

GCC 5 (but not GCC 4, GCC 6, or any later version) complains about x|=y even when x and y are compatible types that are narrower than int. Hence, we must rewrite some x|=y as x=static_cast<byte>(x|y) or similar, or we must disable -Wconversion.

In GCC 6 and later, the warning for assigning a wider value to a bitfield that is narrower than 8, 16, or 32 bits can be suppressed by applying a bitwise & with the exact bitmask of the bitfield. For GCC 4 or 5, we must disable -Wconversion in such cases.

The bitwise negation operator promotes short integers to a wider type, and hence we must add explicit truncation casts around such expressions. Microsoft Visual C++ does not allow a static_cast to truncate a constant, such as static_cast<byte>(~1) truncating int. Hence, we will use the constructor-style cast byte(~1) for such cases.

This has been tested at least with GCC 4.8.5, 5.4.0, 7.4.0, 9.2.1, 10.0.0, clang 9.0.1, 10.0.0, and MSVC 14.22.27905 (Microsoft Visual Studio 2019) on 64-bit and 32-bit targets (IA-32, AMD64, POWER 8, POWER 9, ARMv8).
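For illustration, a minimal C++ sketch of the rewrites described above; the struct, field, and function names are hypothetical, not taken from the patch:

    typedef unsigned char byte;

    struct bf { byte b : 3; };

    void example(bf &s, byte x, byte y)
    {
      /* GCC 6+: a bitwise & with the exact bitmask of the 3-bit
         bitfield suppresses the truncation warning. */
      s.b = (s.b + 1U) & 7;
      /* GCC 5 flags x |= y even for narrow compatible types; widen
         and truncate back explicitly. */
      x = static_cast<byte>(x | y);
      /* ~ promotes its operand to int; truncate the result back. */
      y = static_cast<byte>(~x);
      /* Constructor-style cast for constants, because MSVC rejects
         static_cast<byte>(~1). */
      s.b = byte(~1) & 7;
    }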
511 lines
14 KiB
C
/******************************************************
Copyright (c) 2017 Percona LLC and/or its affiliates.

CRC32 using Intel's PCLMUL instruction.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*******************************************************/
/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

# define U64_C(c) (c ## UL)

typedef uint32_t u32;
typedef uint16_t u16;
typedef uint64_t u64;
#ifndef byte
typedef uint8_t byte;
#endif

# define _gcry_bswap32 __builtin_bswap32

#if __GNUC__ >= 4 && defined(__x86_64__) && defined(HAVE_CLMUL_INSTRUCTION)

#if defined(_GCRY_GCC_VERSION) && _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif

#define ALIGNED_16 __attribute__ ((aligned (16)))
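
/* Read a 16-bit value from a possibly unaligned address without
   undefined behaviour; may_alias additionally exempts these accesses
   from strict-aliasing assumptions. */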
struct u16_unaligned_s
{
  u16 a;
} __attribute__((packed, aligned (1), may_alias));
/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
 * functions. */
struct crc32_consts_s
{
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  u64 k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  u64 my_p[2];
};

/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 =
{
  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
    U64_C(0x163cd6124), 0                   /* y = 2 */
  },
  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
    U64_C(0x1f7011641), U64_C(0x1db710641)
  }
};

/* Common constants for CRC32 algorithms. */
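/* In the shuffle tables below, an index byte with its high bit set
   (0xff) makes PSHUFB write a zero byte, so one shuffle both moves a
   partial block into position and zero-pads it. */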
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
  {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
  {
    { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
    { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
    { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
    { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
    { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
    { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
    { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
  };
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
  {
    { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
    { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
    { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
  };
/* PCLMUL functions for reflected CRC32. */
static inline void
crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
                      const struct crc32_consts_s *consts)
{
  if (inlen >= 8 * 16)
    {
      asm volatile ("movd %[crc], %%xmm4\n\t"
                    "movdqu %[inbuf_0], %%xmm0\n\t"
                    "movdqu %[inbuf_1], %%xmm1\n\t"
                    "movdqu %[inbuf_2], %%xmm2\n\t"
                    "movdqu %[inbuf_3], %%xmm3\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    : [inbuf_0] "m" (inbuf[0 * 16]),
                      [inbuf_1] "m" (inbuf[1 * 16]),
                      [inbuf_2] "m" (inbuf[2 * 16]),
                      [inbuf_3] "m" (inbuf[3 * 16]),
                      [crc] "m" (*pcrc)
                    );

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
                    :
                    : [k1k2] "m" (consts->k[1 - 1])
                    );

      /* Fold by 4. */
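      /* Each pass of the loop carries the four 128-bit accumulators
         forward by 512 bits: PCLMULQDQ multiplies them by the k1:k2
         constants x^(32*17) and x^(32*15) (mod P(x)) held in %xmm4,
         and the next 64 bytes of input are XORed in. */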
      while (inlen >= 4 * 16)
        {
          asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
                        "movdqa %%xmm0, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm0\n\t"
                        "pxor %%xmm6, %%xmm0\n\t"

                        "movdqu %[inbuf_1], %%xmm5\n\t"
                        "movdqa %%xmm1, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "pxor %%xmm6, %%xmm1\n\t"

                        "movdqu %[inbuf_2], %%xmm5\n\t"
                        "movdqa %%xmm2, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "pxor %%xmm6, %%xmm2\n\t"

                        "movdqu %[inbuf_3], %%xmm5\n\t"
                        "movdqa %%xmm3, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "pxor %%xmm6, %%xmm3\n\t"
                        :
                        : [inbuf_0] "m" (inbuf[0 * 16]),
                          [inbuf_1] "m" (inbuf[1 * 16]),
                          [inbuf_2] "m" (inbuf[2 * 16]),
                          [inbuf_3] "m" (inbuf[3 * 16])
                        );

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }
      asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      /* Fold 4 to 1. */
      asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm3, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    :
                    );
    }
  else
    {
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [inbuf] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      inbuf += 16;
      inlen -= 16;
    }
  /* Fold by 1. */
  if (inlen >= 16)
    {
      while (inlen >= 16)
        {
          /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
          asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
                        "movdqa %%xmm0, %%xmm1\n\t"
                        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                        "pxor %%xmm2, %%xmm0\n\t"
                        "pxor %%xmm1, %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );

          inbuf += 16;
          inlen -= 16;
        }
    }
  /* Partial fold. */
  if (inlen)
    {
      /* Load last input and add padding zeros. */
      asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
                    "movdqu %[shl_shuf], %%xmm4\n\t"
                    "movdqu %[mask], %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf], %%xmm4\n\t"
                    "pshufb %%xmm3, %%xmm1\n\t"
                    "pand %%xmm4, %%xmm2\n\t"
                    "por %%xmm1, %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    :
                    : [inbuf] "m" (*(inbuf - 16 + inlen)),
                      [mask] "m" (crc32_partial_fold_input_mask[inlen]),
                      [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
                    );

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */
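  /* The 128-bit remainder is reduced to 96 and then 64 bits using the
     x^(32*3) and x^(32*2) (mod P(x)) constants, after which Barrett
     reduction with my_p = { floor(x^64 / P(x)), P(x) } yields the
     32-bit CRC. */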
  asm volatile (/* reduce 128-bits to 96-bits */
                "movdqa %%xmm0, %%xmm1\n\t"
                "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                "psrldq $8, %%xmm1\n\t"
                "pxor %%xmm1, %%xmm0\n\t"

                /* reduce 96-bits to 64-bits */
                "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
                "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
                "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */

                /* barrett reduction */
                "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
                "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
                "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pxor %%xmm1, %%xmm0\n\t"

                /* store CRC */
                "pextrd $2, %%xmm0, %[out]\n\t"
                : [out] "=m" (*pcrc)
                : [k5] "m" (consts->k[5 - 1])
                );
}

static inline void
crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                              const struct crc32_consts_s *consts)
{
  if (inlen < 4)
    {
      u32 crc = *pcrc;
      u32 data;

      asm volatile ("movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [my_p] "m" (consts->my_p[0])
                    );
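
      /* XOR the low inlen bytes of the CRC into the input bytes and
         shift the result to the top of 'data'; the remaining CRC
         bytes stay in 'crc'. */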
      if (inlen == 1)
        {
          data = inbuf[0];
          data ^= crc;
          data <<= 24;
          crc >>= 8;
        }
      else if (inlen == 2)
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data ^= crc;
          data <<= 16;
          crc >>= 16;
        }
      else
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data |= ((u32) inbuf[2]) << 16;
          data ^= crc;
          data <<= 8;
          crc >>= 24;
        }

      /* Barrett reduction */
      asm volatile ("movd %[in], %%xmm0\n\t"
                    "movd %[crc], %%xmm1\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "psllq $32, %%xmm1\n\t"
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "rm" (data),
                      [crc] "rm" (crc)
                    );
    }
  else if (inlen == 4)
    {
      /* Barrett reduction */
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movd %[in], %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0])
                    );
    }
else
|
|
{
|
|
asm volatile ("movdqu %[shuf], %%xmm4\n\t"
|
|
"movd %[crc], %%xmm1\n\t"
|
|
"movdqa %[my_p], %%xmm5\n\t"
|
|
"movdqa %[k3k4], %%xmm6\n\t"
|
|
:
|
|
: [shuf] "m" (crc32_refl_shuf_shift[inlen]),
|
|
[crc] "m" (*pcrc),
|
|
[my_p] "m" (consts->my_p[0]),
|
|
[k3k4] "m" (consts->k[3 - 1])
|
|
);
|
|
|
|
if (inlen >= 8)
|
|
{
|
|
asm volatile ("movq %[inbuf], %%xmm0\n\t"
|
|
:
|
|
: [inbuf] "m" (*inbuf)
|
|
);
|
|
if (inlen > 8)
|
|
{
|
|
asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
|
|
"movq %[inbuf_tail], %%xmm2\n\t"
|
|
"punpcklqdq %%xmm2, %%xmm0\n\t"
|
|
"pshufb %[merge_shuf], %%xmm0\n\t"
|
|
:
|
|
: [inbuf_tail] "m" (inbuf[inlen - 8]),
|
|
[merge_shuf] "m"
|
|
(*crc32_merge9to15_shuf[inlen - 9])
|
|
);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
asm volatile ("movd %[inbuf], %%xmm0\n\t"
|
|
"pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
|
|
"pshufb %[merge_shuf], %%xmm0\n\t"
|
|
:
|
|
: [inbuf] "m" (*inbuf),
|
|
[inbuf_tail] "m" (inbuf[inlen - 4]),
|
|
[merge_shuf] "m"
|
|
(*crc32_merge5to7_shuf[inlen - 5])
|
|
);
|
|
}
|
|
|
|
/* Final fold. */
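      /* XOR the CRC into the 5..15 input bytes, shift them to the top
         of %xmm0 with the shuffle mask, then reduce exactly as in the
         bulk path. */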
      asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"

                    /* reduce 128-bits to 96-bits */
                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                    "psrldq $8, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */

                    /* reduce 96-bits to 64-bits */
                    "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
                    "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
                    "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */

                    /* barrett reduction */
                    "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
                    "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
                    "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    /* store CRC */
                    "pextrd $2, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [k5] "m" (consts->k[5 - 1])
                    );
    }
}
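
/* Process inlen bytes from inbuf, updating the CRC-32 state at *pcrc
   in place; applying the initial value and the final bit inversion of
   the CRC is left to the caller. */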
void
crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory");
#endif

  if (!inlen)
    return;

  if (inlen >= 16)
    crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
               "movdqu 1*16(%0), %%xmm7\n\t"
               :
               : "r" (win64tmp)
               : "memory");
#endif
}

#endif