mirror of
				https://github.com/MariaDB/server.git
				synced 2025-11-04 04:46:15 +01:00 
			
		
		
		
	In our unit test, let us rely on our own reference implementation using the reflected CRC-32 ISO 3309 and CRC-32C polynomials. Let us also test with various lengths. Let us refactor the CRC-32 and CRC-32C implementations so that no special compilation flags will be needed and that some function call indirection will be avoided. pmull_supported: Remove. We will have pointers to two separate functions crc32c_aarch64_pmull() and crc32c_aarch64().
		
			
				
	
	
		
			344 lines
		
	
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			344 lines
		
	
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* Copyright (c) 2020, 2021, MariaDB
 | 
						|
 | 
						|
   This program is free software; you can redistribute it and/or modify
 | 
						|
   it under the terms of the GNU General Public License as published by
 | 
						|
   the Free Software Foundation; version 2 of the License.
 | 
						|
 | 
						|
   This program is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
   GNU General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU General Public License
 | 
						|
   along with this program; if not, write to the Free Software
 | 
						|
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
 | 
						|
 | 
						|
/*
 | 
						|
  Implementation of CRC32 (Ethernet) uing Intel PCLMULQDQ
 | 
						|
  Ported from Intels work, see https://github.com/intel/soft-crc
 | 
						|
*/
 | 
						|
 | 
						|
/*******************************************************************************
 | 
						|
 Copyright (c) 2009-2018, Intel Corporation
 | 
						|
 | 
						|
 Redistribution and use in source and binary forms, with or without
 | 
						|
 modification, are permitted provided that the following conditions are met:
 | 
						|
 | 
						|
     * Redistributions of source code must retain the above copyright notice,
 | 
						|
       this list of conditions and the following disclaimer.
 | 
						|
     * Redistributions in binary form must reproduce the above copyright
 | 
						|
       notice, this list of conditions and the following disclaimer in the
 | 
						|
       documentation and/or other materials provided with the distribution.
 | 
						|
     * Neither the name of Intel Corporation nor the names of its contributors
 | 
						|
       may be used to endorse or promote products derived from this software
 | 
						|
       without specific prior written permission.
 | 
						|
 | 
						|
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
						|
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
						|
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 | 
						|
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 | 
						|
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
						|
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
						|
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
						|
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
						|
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | 
						|
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
						|
*******************************************************************************/
 | 
						|
 | 
						|
 | 
						|
#include <my_global.h>
 | 
						|
#include <my_compiler.h>
 | 
						|
 | 
						|
#include <stdio.h>
 | 
						|
#include <stdlib.h>
 | 
						|
#include <string.h>
 | 
						|
#include <stdint.h>
 | 
						|
#include <stddef.h>
 | 
						|
 | 
						|
#ifdef __GNUC__
 | 
						|
# include <emmintrin.h>
 | 
						|
# include <smmintrin.h>
 | 
						|
# include <tmmintrin.h>
 | 
						|
# include <wmmintrin.h>
 | 
						|
# define USE_PCLMUL __attribute__((target("sse4.2,pclmul")))
 | 
						|
#elif defined(_MSC_VER)
 | 
						|
# include <intrin.h>
 | 
						|
# define USE_PCLMUL /* nothing */
 | 
						|
#else
 | 
						|
# error "unknown compiler"
 | 
						|
#endif
 | 
						|
 | 
						|
/**
 | 
						|
 * @brief Shifts left 128 bit register by specified number of bytes
 | 
						|
 *
 | 
						|
 * @param reg 128 bit value
 | 
						|
 * @param num number of bytes to shift left \a reg by (0-16)
 | 
						|
 *
 | 
						|
 * @return \a reg << (\a num * 8)
 | 
						|
 */
 | 
						|
USE_PCLMUL
 | 
						|
static inline __m128i xmm_shift_left(__m128i reg, const unsigned int num)
 | 
						|
{
 | 
						|
  static const MY_ALIGNED(16) uint8_t crc_xmm_shift_tab[48]= {
 | 
						|
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 | 
						|
      0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 | 
						|
      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff,
 | 
						|
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 | 
						|
 | 
						|
  const __m128i *p= (const __m128i *) (crc_xmm_shift_tab + 16 - num);
 | 
						|
 | 
						|
  return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
 | 
						|
}
 | 
						|
 | 
						|
/**
 * Precomputed constants for a PCLMULQDQ-based reflected CRC-32.
 * rk1/rk2 drive the 16-byte folding rounds, rk5/rk6 the 128->64 bit
 * reduction, and rk7/rk8 the final Barrett reduction to 32 bits.
 * The pairs are kept adjacent so each can be loaded as one 128-bit value.
 */
struct crcr_pclmulqdq_ctx
{
  uint64_t rk1; /**< folding-round multiplier (low 64 bits of the pair) */
  uint64_t rk2; /**< folding-round multiplier (high 64 bits of the pair) */
  uint64_t rk5; /**< 128->64 bit reduction constant */
  uint64_t rk6; /**< 128->64 bit reduction constant */
  uint64_t rk7; /**< Barrett reduction constant (quotient) */
  uint64_t rk8; /**< Barrett reduction constant (polynomial) */
};
 | 
						|
 | 
						|
/**
 | 
						|
 * @brief Performs one folding round
 | 
						|
 *
 | 
						|
 * Logically function operates as follows:
 | 
						|
 *     DATA = READ_NEXT_16BYTES();
 | 
						|
 *     F1 = LSB8(FOLD)
 | 
						|
 *     F2 = MSB8(FOLD)
 | 
						|
 *     T1 = CLMUL(F1, RK1)
 | 
						|
 *     T2 = CLMUL(F2, RK2)
 | 
						|
 *     FOLD = XOR(T1, T2, DATA)
 | 
						|
 *
 | 
						|
 * @param data_block 16 byte data block
 | 
						|
 * @param precomp precomputed rk1 constanst
 | 
						|
 * @param fold running 16 byte folded data
 | 
						|
 *
 | 
						|
 * @return New 16 byte folded data
 | 
						|
 */
 | 
						|
USE_PCLMUL
 | 
						|
static inline __m128i crcr32_folding_round(const __m128i data_block,
 | 
						|
                                    const __m128i precomp, const __m128i fold)
 | 
						|
{
 | 
						|
  __m128i tmp0= _mm_clmulepi64_si128(fold, precomp, 0x01);
 | 
						|
  __m128i tmp1= _mm_clmulepi64_si128(fold, precomp, 0x10);
 | 
						|
 | 
						|
  return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
 * @brief Performs reduction from 128 bits to 64 bits
 | 
						|
 *
 | 
						|
 * @param data128 128 bits data to be reduced
 | 
						|
 * @param precomp rk5 and rk6 precomputed constants
 | 
						|
 *
 | 
						|
 * @return data reduced to 64 bits
 | 
						|
 */
 | 
						|
USE_PCLMUL
 | 
						|
static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i precomp)
 | 
						|
{
 | 
						|
  __m128i tmp0, tmp1, tmp2;
 | 
						|
 | 
						|
  /* 64b fold */
 | 
						|
  tmp0= _mm_clmulepi64_si128(data128, precomp, 0x00);
 | 
						|
  tmp1= _mm_srli_si128(data128, 8);
 | 
						|
  tmp0= _mm_xor_si128(tmp0, tmp1);
 | 
						|
 | 
						|
  /* 32b fold */
 | 
						|
  tmp2= _mm_slli_si128(tmp0, 4);
 | 
						|
  tmp1= _mm_clmulepi64_si128(tmp2, precomp, 0x10);
 | 
						|
 | 
						|
  return _mm_xor_si128(tmp1, tmp0);
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
 * @brief Performs Barret's reduction from 64 bits to 32 bits
 | 
						|
 *
 | 
						|
 * @param data64 64 bits data to be reduced
 | 
						|
 * @param precomp rk7 precomputed constant
 | 
						|
 *
 | 
						|
 * @return data reduced to 32 bits
 | 
						|
 */
 | 
						|
USE_PCLMUL
 | 
						|
static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i precomp)
 | 
						|
{
 | 
						|
  static const MY_ALIGNED(16) uint32_t mask1[4]= {
 | 
						|
      0xffffffff, 0xffffffff, 0x00000000, 0x00000000};
 | 
						|
  static const MY_ALIGNED(16) uint32_t mask2[4]= {
 | 
						|
      0x00000000, 0xffffffff, 0xffffffff, 0xffffffff};
 | 
						|
  __m128i tmp0, tmp1, tmp2;
 | 
						|
 | 
						|
  tmp0= _mm_and_si128(data64, _mm_load_si128((__m128i *) mask2));
 | 
						|
 | 
						|
  tmp1= _mm_clmulepi64_si128(tmp0, precomp, 0x00);
 | 
						|
  tmp1= _mm_xor_si128(tmp1, tmp0);
 | 
						|
  tmp1= _mm_and_si128(tmp1, _mm_load_si128((__m128i *) mask1));
 | 
						|
 | 
						|
  tmp2= _mm_clmulepi64_si128(tmp1, precomp, 0x10);
 | 
						|
  tmp2= _mm_xor_si128(tmp2, tmp1);
 | 
						|
  tmp2= _mm_xor_si128(tmp2, tmp0);
 | 
						|
 | 
						|
  return _mm_extract_epi32(tmp2, 2);
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
 * @brief Calculates reflected 32-bit CRC for given \a data block
 | 
						|
 *        by applying folding and reduction methods.
 | 
						|
 *
 | 
						|
 * Algorithm operates on 32 bit CRCs.
 | 
						|
 * Polynomials and initial values may need to be promoted to
 | 
						|
 * 32 bits where required.
 | 
						|
 *
 | 
						|
 * @param crc initial CRC value (32 bit value)
 | 
						|
 * @param data pointer to data block
 | 
						|
 * @param data_len length of \a data block in bytes
 | 
						|
 * @param params pointer to PCLMULQDQ CRC calculation context
 | 
						|
 *
 | 
						|
 * @return CRC for given \a data block (32 bits wide).
 | 
						|
 */
 | 
						|
USE_PCLMUL
 | 
						|
static inline uint32_t crcr32_calc_pclmulqdq(const uint8_t *data, uint32_t data_len,
 | 
						|
                                      uint32_t crc,
 | 
						|
                                      const struct crcr_pclmulqdq_ctx *params)
 | 
						|
{
 | 
						|
  __m128i temp, fold, k;
 | 
						|
  uint32_t n;
 | 
						|
 | 
						|
  DBUG_ASSERT(data != NULL || data_len == 0);
 | 
						|
  DBUG_ASSERT(params);
 | 
						|
 | 
						|
  if (unlikely(data_len == 0))
 | 
						|
    return crc;
 | 
						|
 | 
						|
  /**
 | 
						|
   * Get CRC init value
 | 
						|
   */
 | 
						|
  temp= _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
 | 
						|
 | 
						|
  /**
 | 
						|
   * -------------------------------------------------
 | 
						|
   * Folding all data into single 16 byte data block
 | 
						|
   * Assumes: \a fold holds first 16 bytes of data
 | 
						|
   */
 | 
						|
 | 
						|
  if (unlikely(data_len < 32))
 | 
						|
  {
 | 
						|
    if (unlikely(data_len == 16))
 | 
						|
    {
 | 
						|
      /* 16 bytes */
 | 
						|
      fold= _mm_loadu_si128((__m128i *) data);
 | 
						|
      fold= _mm_xor_si128(fold, temp);
 | 
						|
      goto reduction_128_64;
 | 
						|
    }
 | 
						|
    if (unlikely(data_len < 16))
 | 
						|
    {
 | 
						|
      /* 0 to 15 bytes */
 | 
						|
      MY_ALIGNED(16) uint8_t buffer[16];
 | 
						|
 | 
						|
      memset(buffer, 0, sizeof(buffer));
 | 
						|
      memcpy(buffer, data, data_len);
 | 
						|
 | 
						|
      fold= _mm_load_si128((__m128i *) buffer);
 | 
						|
      fold= _mm_xor_si128(fold, temp);
 | 
						|
      if ((data_len < 4))
 | 
						|
      {
 | 
						|
        fold= xmm_shift_left(fold, 8 - data_len);
 | 
						|
        goto barret_reduction;
 | 
						|
      }
 | 
						|
      fold= xmm_shift_left(fold, 16 - data_len);
 | 
						|
      goto reduction_128_64;
 | 
						|
    }
 | 
						|
    /* 17 to 31 bytes */
 | 
						|
    fold= _mm_loadu_si128((__m128i *) data);
 | 
						|
    fold= _mm_xor_si128(fold, temp);
 | 
						|
    n= 16;
 | 
						|
    k= _mm_load_si128((__m128i *) (¶ms->rk1));
 | 
						|
    goto partial_bytes;
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * At least 32 bytes in the buffer
 | 
						|
   */
 | 
						|
 | 
						|
  /**
 | 
						|
   * Apply CRC initial value
 | 
						|
   */
 | 
						|
  fold= _mm_loadu_si128((const __m128i *) data);
 | 
						|
  fold= _mm_xor_si128(fold, temp);
 | 
						|
 | 
						|
  /**
 | 
						|
   * Main folding loop
 | 
						|
   * - the last 16 bytes is processed separately
 | 
						|
   */
 | 
						|
  k= _mm_load_si128((__m128i *) (¶ms->rk1));
 | 
						|
  for (n= 16; (n + 16) <= data_len; n+= 16)
 | 
						|
  {
 | 
						|
    temp= _mm_loadu_si128((__m128i *) &data[n]);
 | 
						|
    fold= crcr32_folding_round(temp, k, fold);
 | 
						|
  }
 | 
						|
 | 
						|
partial_bytes:
 | 
						|
  if (likely(n < data_len))
 | 
						|
  {
 | 
						|
    static const MY_ALIGNED(16) uint32_t mask3[4]= {0x80808080, 0x80808080,
 | 
						|
                                                   0x80808080, 0x80808080};
 | 
						|
    static const MY_ALIGNED(16) uint8_t shf_table[32]= {
 | 
						|
        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
 | 
						|
        0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
 | 
						|
        0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
 | 
						|
    __m128i last16, a, b;
 | 
						|
 | 
						|
    last16= _mm_loadu_si128((const __m128i *) &data[data_len - 16]);
 | 
						|
 | 
						|
    temp= _mm_loadu_si128((const __m128i *) &shf_table[data_len & 15]);
 | 
						|
    a= _mm_shuffle_epi8(fold, temp);
 | 
						|
 | 
						|
    temp= _mm_xor_si128(temp, _mm_load_si128((const __m128i *) mask3));
 | 
						|
    b= _mm_shuffle_epi8(fold, temp);
 | 
						|
    b= _mm_blendv_epi8(b, last16, temp);
 | 
						|
 | 
						|
    /* k = rk1 & rk2 */
 | 
						|
    temp= _mm_clmulepi64_si128(a, k, 0x01);
 | 
						|
    fold= _mm_clmulepi64_si128(a, k, 0x10);
 | 
						|
 | 
						|
    fold= _mm_xor_si128(fold, temp);
 | 
						|
    fold= _mm_xor_si128(fold, b);
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * -------------------------------------------------
 | 
						|
   * Reduction 128 -> 32
 | 
						|
   * Assumes: \a fold holds 128bit folded data
 | 
						|
   */
 | 
						|
reduction_128_64:
 | 
						|
  k= _mm_load_si128((__m128i *) (¶ms->rk5));
 | 
						|
  fold= crcr32_reduce_128_to_64(fold, k);
 | 
						|
 | 
						|
barret_reduction:
 | 
						|
  k= _mm_load_si128((__m128i *) (¶ms->rk7));
 | 
						|
  n= crcr32_reduce_64_to_32(fold, k);
 | 
						|
  return n;
 | 
						|
}
 | 
						|
 | 
						|
/** Folding/reduction constants for the Ethernet (ISO 3309) CRC-32 */
static const MY_ALIGNED(16) struct crcr_pclmulqdq_ctx ether_crc32_clmul= {
    0xccaa009e,  /**< rk1 */
    0x1751997d0, /**< rk2 */
    0xccaa009e,  /**< rk5 */
    0x163cd6124, /**< rk6 */
    0x1f7011640, /**< rk7 */
    0x1db710641  /**< rk8 */
};
 | 
						|
 | 
						|
/**
 | 
						|
 * @brief Calculates Ethernet CRC32 using PCLMULQDQ method.
 | 
						|
 *
 | 
						|
 * @param data pointer to data block to calculate CRC for
 | 
						|
 * @param data_len size of data block
 | 
						|
 *
 | 
						|
 * @return New CRC value
 | 
						|
 */
 | 
						|
unsigned int crc32_pclmul(unsigned int crc32, const void *buf, size_t len)
 | 
						|
{
 | 
						|
  return ~crcr32_calc_pclmulqdq(buf, (uint32_t)len, ~crc32, ðer_crc32_clmul);
 | 
						|
}
 |