src/cksum_avx2.c

   1 /* cksum -- calculate and print POSIX checksums and sizes of files
   2    Copyright (C) 2024 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 #include <config.h>
  18
  19 #include <stdio.h>
  20 #include <sys/types.h>
  21 #include <stdint.h>
  22 #include <x86intrin.h>
  23 #include "system.h"
  24
  25 /* Number of bytes to read at once.  */
  26 #define BUFLEN (1 << 16)
  27
  28 extern uint_fast32_t const crctab[8][256];
  29
  30 extern bool
  31 cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
  32
  33 bool
  34 cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
  35 {
  36   __m256i buf[BUFLEN / sizeof (__m256i)];
  37   uint_fast32_t crc = 0;
  38   uintmax_t length = 0;
  39   size_t bytes_read;
  40   __m256i single_mult_constant;
  41   __m256i four_mult_constant;
  42   __m256i shuffle_constant;
  43
  44   if (!fp || !crc_out || !length_out)
  45     return false;
  46
  47   /* These constants and general algorithms are taken from the Intel whitepaper
  48      "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  49   */
  50   single_mult_constant = _mm256_set_epi64x (0x569700E5, 0x75BE46B7,
  51                                             0x569700E5, 0x75BE46B7);
  52   four_mult_constant = _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB,
  53                                             0x10BD4D7C, 0x567FDDEB);
  54
  55   /* Constant to byteswap a full AVX2 register */
  56   shuffle_constant = _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
  57                                       9, 10, 11, 12, 13, 14, 15,
  58                                       0, 1, 2, 3, 4, 5, 6, 7, 8,
  59                                       9, 10, 11, 12, 13, 14, 15);
  60   while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
  61     {
  62       __m256i data;
  63       __m256i data2;
  64       __m256i data3;
  65       __m256i data4;
  66       __m256i data5;
  67       __m256i data6;
  68       __m256i data7;
  69       __m256i data8;
  70       __m256i fold_data;
  71       __m256i xor_crc;
  72
  73       __m256i *datap;
  74
  75       if (length + bytes_read < length)
  76         {
  77           errno = EOVERFLOW;
  78           return false;
  79         }
  80       length += bytes_read;
  81
  82       datap = (__m256i *)buf;
  83
  84       /* Fold in parallel 16x 16-byte blocks into 8x 16-byte blocks */
  85       if (bytes_read >= 16 * 8 * 2)
  86         {
  87           data = _mm256_loadu_si256 (datap);
  88           data = _mm256_shuffle_epi8 (data, shuffle_constant);
  89           /* XOR in initial CRC value (for us 0 so no effect), or CRC value
  90              calculated for previous BUFLEN buffer from fread */
  91           xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0);
  92           crc = 0;
  93           data = _mm256_xor_si256 (data, xor_crc);
  94           data3 = _mm256_loadu_si256 (datap + 1);
  95           data3 = _mm256_shuffle_epi8 (data3, shuffle_constant);
  96           data5 = _mm256_loadu_si256 (datap + 2);
  97           data5 = _mm256_shuffle_epi8 (data5, shuffle_constant);
  98           data7 = _mm256_loadu_si256 (datap + 3);
  99           data7 = _mm256_shuffle_epi8 (data7, shuffle_constant);
 100
 101           while (bytes_read >= 16 * 8 * 2)
 102             {
 103               datap += 4;
 104
 105               /* Do multiplication here for 8x consecutive 16 byte blocks */
 106               data2 = _mm256_clmulepi64_epi128 (data, four_mult_constant,
 107                                                 0x00);
 108               data = _mm256_clmulepi64_epi128 (data, four_mult_constant,
 109                                                0x11);
 110               data4 = _mm256_clmulepi64_epi128 (data3, four_mult_constant,
 111                                                 0x00);
 112               data3 = _mm256_clmulepi64_epi128 (data3, four_mult_constant,
 113                                                 0x11);
 114               data6 = _mm256_clmulepi64_epi128 (data5, four_mult_constant,
 115                                                 0x00);
 116               data5 = _mm256_clmulepi64_epi128 (data5, four_mult_constant,
 117                                                 0x11);
 118               data8 = _mm256_clmulepi64_epi128 (data7, four_mult_constant,
 119                                                 0x00);
 120               data7 = _mm256_clmulepi64_epi128 (data7, four_mult_constant,
 121                                                 0x11);
 122
 123               /* Now multiplication results for the 8x blocks is xor:ed with
 124                  next 8x 16 byte blocks from the buffer. This effectively
 125                  "consumes" the first 8x blocks from the buffer.
 126                  Keep xor result in variables for multiplication in next
 127                  round of loop. */
 128               data = _mm256_xor_si256 (data, data2);
 129               data2 = _mm256_loadu_si256 (datap);
 130               data2 = _mm256_shuffle_epi8 (data2, shuffle_constant);
 131               data = _mm256_xor_si256 (data, data2);
 132
 133               data3 = _mm256_xor_si256 (data3, data4);
 134               data4 = _mm256_loadu_si256 (datap + 1);
 135               data4 = _mm256_shuffle_epi8 (data4, shuffle_constant);
 136               data3 = _mm256_xor_si256 (data3, data4);
 137
 138               data5 = _mm256_xor_si256 (data5, data6);
 139               data6 = _mm256_loadu_si256 (datap + 2);
 140               data6 = _mm256_shuffle_epi8 (data6, shuffle_constant);
 141               data5 = _mm256_xor_si256 (data5, data6);
 142
 143               data7 = _mm256_xor_si256 (data7, data8);
 144               data8 = _mm256_loadu_si256 (datap + 3);
 145               data8 = _mm256_shuffle_epi8 (data8, shuffle_constant);
 146               data7 = _mm256_xor_si256 (data7, data8);
 147
 148               bytes_read -= (16 * 4 * 2);
 149             }
 150           /* At end of loop we write out results from variables back into
 151              the buffer, for use in single fold loop */
 152           data = _mm256_shuffle_epi8 (data, shuffle_constant);
 153           _mm256_storeu_si256 (datap, data);
 154           data3 = _mm256_shuffle_epi8 (data3, shuffle_constant);
 155           _mm256_storeu_si256 (datap + 1, data3);
 156           data5 = _mm256_shuffle_epi8 (data5, shuffle_constant);
 157           _mm256_storeu_si256 (datap + 2, data5);
 158           data7 = _mm256_shuffle_epi8 (data7, shuffle_constant);
 159           _mm256_storeu_si256 (datap + 3, data7);
 160         }
 161
 162       /* Fold two 32-byte blocks into one 32-byte block */
 163       if (bytes_read >= 64)
 164         {
 165           data = _mm256_loadu_si256 (datap);
 166           data = _mm256_shuffle_epi8 (data, shuffle_constant);
 167           xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0);
 168           crc = 0;
 169           data = _mm256_xor_si256 (data, xor_crc);
 170           while (bytes_read >= 64)
 171             {
 172               datap++;
 173
 174               data2 = _mm256_clmulepi64_epi128 (data, single_mult_constant,
 175                                                 0x00);
 176               data = _mm256_clmulepi64_epi128 (data, single_mult_constant,
 177                                                0x11);
 178               fold_data = _mm256_loadu_si256 (datap);
 179               fold_data = _mm256_shuffle_epi8 (fold_data, shuffle_constant);
 180               data = _mm256_xor_si256 (data, data2);
 181               data = _mm256_xor_si256 (data, fold_data);
 182               bytes_read -= 32;
 183             }
 184           data = _mm256_shuffle_epi8 (data, shuffle_constant);
 185           _mm256_storeu_si256 (datap, data);
 186         }
 187
 188       /* And finish up last 0-63 bytes in a byte by byte fashion */
 189       unsigned char *cp = (unsigned char *)datap;
 190       while (bytes_read--)
 191         crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
 192       if (feof (fp))
 193         break;
 194     }
 195
 196   *crc_out = crc;
 197   *length_out = length;
 198
 199   return !ferror (fp);
 200 }