maint: correct © dates for hardware optimized crc routines
[coreutils.git] / src / cksum_vmull.c
/* cksum -- calculate and print POSIX checksums and sizes of files
   Copyright (C) 2024 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
#include <config.h>

#include <stdio.h>
#include <sys/types.h>
#include <stdint.h>
#include <arm_neon.h>
#include "system.h"
/* Number of bytes to read at once.  */
#define BUFLEN (1 << 16)

extern uint_fast32_t const crctab[8][256];

extern bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
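
/* Reverse the order of all 16 bytes in IN: byte-reverse each 64-bit lane,
   then swap the two lanes.  vld1q_u64 loads little-endian data on AArch64,
   while the folding below treats each 16-byte block as one big-endian
   128-bit value, so every block goes through this helper after a load and
   before a store.  */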
static uint64x2_t
bswap_neon (uint64x2_t in)
{
  uint64x2_t a =
    vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in)));
  a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a));
  return a;
}

/* Calculate CRC32 using the VMULL CPU instruction found in ARMv8 CPUs.  */

bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
{
  uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)];
  uint_fast32_t crc = 0;
  uintmax_t length = 0;
  size_t bytes_read;
  poly64x2_t single_mult_constant;
  poly64x2_t four_mult_constant;

  if (!fp || !crc_out || !length_out)
    return false;
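
  /* cksum's CRC uses the non-reflected polynomial 0x04C11DB7.  The two
     vectors below hold the fold multipliers for that polynomial: the low
     lane of each constant multiplies the low 64 bits of a block and the
     high lane the high 64 bits; single_mult_constant folds a block forward
     across one 16-byte block, four_mult_constant across four (64 bytes).  */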
  /* These constants and general algorithms are taken from the Intel whitepaper
     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
   */
  single_mult_constant =
    vcombine_p64 (vcreate_p64 (0xE8A45605), vcreate_p64 (0xC5B9CD4C));
  four_mult_constant =
    vcombine_p64 (vcreate_p64 (0xE6228B11), vcreate_p64 (0x8833794C));

  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
    {
      uint64x2_t *datap;
      uint64x2_t data;
      uint64x2_t data2;
      uint64x2_t data3;
      uint64x2_t data4;
      uint64x2_t data5;
      uint64x2_t data6;
      uint64x2_t data7;
      uint64x2_t data8;
      uint64x2_t fold_data;
      uint64x2_t xor_crc;

      if (length + bytes_read < length)
        {
          errno = EOVERFLOW;
          return false;
        }
      length += bytes_read;

      datap = (uint64x2_t *) buf;

      /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
      if (bytes_read >= 16 * 8)
        {
          data = vld1q_u64 ((uint64_t *) (datap));
          data = bswap_neon (data);
          /* XOR in initial CRC value (for us 0 so no effect), or CRC value
             calculated for previous BUFLEN buffer from fread */
          xor_crc = vcombine_u64 (vcreate_u64 (0),
                                  vcreate_u64 ((uint64_t) crc << 32));
          crc = 0;
          data = veorq_u64 (data, xor_crc);
          data3 = vld1q_u64 ((uint64_t *) (datap + 1));
          data3 = bswap_neon (data3);
          data5 = vld1q_u64 ((uint64_t *) (datap + 2));
          data5 = bswap_neon (data5);
          data7 = vld1q_u64 ((uint64_t *) (datap + 3));
          data7 = bswap_neon (data7);
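
          /* Keep four 16-byte blocks in flight.  Each pass of the loop below
             multiplies them by four_mult_constant and XORs in the next
             64 bytes of the buffer, so each pass consumes 64 bytes.  */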
          while (bytes_read >= 16 * 8)
            {
              datap += 4;

              /* Do multiplication here for four consecutive 16 byte blocks */
              data2 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data),
                                         four_mult_constant));
              data4 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data3), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data3 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data3),
                                         four_mult_constant));
              data6 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data5), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data5 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data5),
                                         four_mult_constant));
              data8 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data7), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data7 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data7),
                                         four_mult_constant));

              /* The multiplication results for the four blocks are now XORed
                 with the next four 16-byte blocks from the buffer.  This
                 effectively "consumes" the first four blocks from the buffer.
                 Keep the XOR results in variables for the multiplication in
                 the next round of the loop.  */
              data = veorq_u64 (data, data2);
              data2 = vld1q_u64 ((uint64_t *) (datap));
              data2 = bswap_neon (data2);
              data = veorq_u64 (data, data2);

              data3 = veorq_u64 (data3, data4);
              data4 = vld1q_u64 ((uint64_t *) (datap + 1));
              data4 = bswap_neon (data4);
              data3 = veorq_u64 (data3, data4);

              data5 = veorq_u64 (data5, data6);
              data6 = vld1q_u64 ((uint64_t *) (datap + 2));
              data6 = bswap_neon (data6);
              data5 = veorq_u64 (data5, data6);

              data7 = veorq_u64 (data7, data8);
              data8 = vld1q_u64 ((uint64_t *) (datap + 3));
              data8 = bswap_neon (data8);
              data7 = veorq_u64 (data7, data8);

              bytes_read -= (16 * 4);
            }

          /* At end of loop we write out results from variables back into
             the buffer, for use in the single fold loop */
          data = bswap_neon (data);
          vst1q_u64 ((uint64_t *) (datap), data);
          data3 = bswap_neon (data3);
          vst1q_u64 ((uint64_t *) (datap + 1), data3);
          data5 = bswap_neon (data5);
          vst1q_u64 ((uint64_t *) (datap + 2), data5);
          data7 = bswap_neon (data7);
          vst1q_u64 ((uint64_t *) (datap + 3), data7);
        }
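
      /* At this point fewer than 128 bytes of this buffer remain unconsumed:
         any tail bytes plus, if the loop above ran, the four folded blocks
         just stored back at datap.  */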
      /* Fold two 16-byte blocks into one 16-byte block */
      if (bytes_read >= 32)
        {
          data = vld1q_u64 ((uint64_t *) (datap));
          data = bswap_neon (data);
          xor_crc = vcombine_u64 (vcreate_u64 (0),
                                  vcreate_u64 ((uint64_t) crc << 32));
          crc = 0;
          data = veorq_u64 (data, xor_crc);
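          /* Fold 16 bytes at a time: multiply the in-flight block by
             single_mult_constant and XOR in the next 16-byte block.  */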
          while (bytes_read >= 32)
            {
              datap++;

              data2 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data), 0),
                                         vgetq_lane_p64 (single_mult_constant,
                                                         0)));
              data =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data),
                                         single_mult_constant));
              fold_data = vld1q_u64 ((uint64_t *) (datap));
              fold_data = bswap_neon (fold_data);
              data = veorq_u64 (data, data2);
              data = veorq_u64 (data, fold_data);
              bytes_read -= 16;
            }
          data = bswap_neon (data);
          vst1q_u64 ((uint64_t *) (datap), data);
        }

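      /* The bytes left over (including, when the fold above ran, the 16-byte
         block just stored back) go through the byte-wise table CRC, which
         also reduces the folded value down to the running 32-bit CRC.  */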
      /* And finish up last 0-31 bytes in a byte by byte fashion */
      unsigned char *cp = (unsigned char *) datap;
      while (bytes_read--)
        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
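      /* Stop once end of file is reached; any other cause of a short read
         is reported through ferror below.  */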
      if (feof (fp))
        break;
    }

  *crc_out = crc;
  *length_out = length;

  return !ferror (fp);
}
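
/* Rough usage sketch (illustrative only, not part of this file's interface):
   the real caller in cksum.c selects this routine at run time when the CPU
   provides the 64-bit polynomial multiply, then post-processes the returned
   CRC per POSIX.  Something along these lines, where "filename" is a
   hypothetical placeholder:

     uint_fast32_t crc;
     uintmax_t len;
     FILE *fp = fopen (filename, "rb");
     if (fp && cksum_vmull (fp, &crc, &len))
       {
         // POSIX cksum: fold the length into the CRC, then complement.
         for (uintmax_t n = len; n; n >>= 8)
           crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ n) & 0xFF];
         crc = ~crc & 0xFFFFFFFF;
       }
*/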