maint: correct © dates for hardware optimized crc routines
[coreutils.git] / src / cksum_avx2.c
blob0c891586603924a895698c32b7952137b311fae3
1 /* cksum -- calculate and print POSIX checksums and sizes of files
2 Copyright (C) 2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include <stdio.h>
20 #include <sys/types.h>
21 #include <stdint.h>
22 #include <x86intrin.h>
23 #include "system.h"
/* Number of bytes to read at once.  Must stay a multiple of
   sizeof (__m256i) because the read buffer is declared as an array of
   __m256i with BUFLEN / sizeof (__m256i) elements.  */
#define BUFLEN (1 << 16)

/* CRC lookup tables, defined in another translation unit.  The code in
   this file only indexes crctab[0], for the byte-at-a-time tail.  */
extern uint_fast32_t const crctab[8][256];

/* Prototype for the AVX2 / carry-less-multiply CRC routine defined
   below.  */
extern bool
cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
33 bool
34 cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
36 __m256i buf[BUFLEN / sizeof (__m256i)];
37 uint_fast32_t crc = 0;
38 uintmax_t length = 0;
39 size_t bytes_read;
40 __m256i single_mult_constant;
41 __m256i four_mult_constant;
42 __m256i shuffle_constant;
44 if (!fp || !crc_out || !length_out)
45 return false;
47 /* These constants and general algorithms are taken from the Intel whitepaper
48 "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
50 single_mult_constant = _mm256_set_epi64x (0x569700E5, 0x75BE46B7,
51 0x569700E5, 0x75BE46B7);
52 four_mult_constant = _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB,
53 0x10BD4D7C, 0x567FDDEB);
55 /* Constant to byteswap a full AVX2 register */
56 shuffle_constant = _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
57 9, 10, 11, 12, 13, 14, 15,
58 0, 1, 2, 3, 4, 5, 6, 7, 8,
59 9, 10, 11, 12, 13, 14, 15);
60 while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
62 __m256i data;
63 __m256i data2;
64 __m256i data3;
65 __m256i data4;
66 __m256i data5;
67 __m256i data6;
68 __m256i data7;
69 __m256i data8;
70 __m256i fold_data;
71 __m256i xor_crc;
73 __m256i *datap;
75 if (length + bytes_read < length)
77 errno = EOVERFLOW;
78 return false;
80 length += bytes_read;
82 datap = (__m256i *)buf;
84 /* Fold in parallel 16x 16-byte blocks into 8x 16-byte blocks */
85 if (bytes_read >= 16 * 8 * 2)
87 data = _mm256_loadu_si256 (datap);
88 data = _mm256_shuffle_epi8 (data, shuffle_constant);
89 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
90 calculated for previous BUFLEN buffer from fread */
91 xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0);
92 crc = 0;
93 data = _mm256_xor_si256 (data, xor_crc);
94 data3 = _mm256_loadu_si256 (datap + 1);
95 data3 = _mm256_shuffle_epi8 (data3, shuffle_constant);
96 data5 = _mm256_loadu_si256 (datap + 2);
97 data5 = _mm256_shuffle_epi8 (data5, shuffle_constant);
98 data7 = _mm256_loadu_si256 (datap + 3);
99 data7 = _mm256_shuffle_epi8 (data7, shuffle_constant);
101 while (bytes_read >= 16 * 8 * 2)
103 datap += 4;
105 /* Do multiplication here for 8x consecutive 16 byte blocks */
106 data2 = _mm256_clmulepi64_epi128 (data, four_mult_constant,
107 0x00);
108 data = _mm256_clmulepi64_epi128 (data, four_mult_constant,
109 0x11);
110 data4 = _mm256_clmulepi64_epi128 (data3, four_mult_constant,
111 0x00);
112 data3 = _mm256_clmulepi64_epi128 (data3, four_mult_constant,
113 0x11);
114 data6 = _mm256_clmulepi64_epi128 (data5, four_mult_constant,
115 0x00);
116 data5 = _mm256_clmulepi64_epi128 (data5, four_mult_constant,
117 0x11);
118 data8 = _mm256_clmulepi64_epi128 (data7, four_mult_constant,
119 0x00);
120 data7 = _mm256_clmulepi64_epi128 (data7, four_mult_constant,
121 0x11);
123 /* Now multiplication results for the 8x blocks is xor:ed with
124 next 8x 16 byte blocks from the buffer. This effectively
125 "consumes" the first 8x blocks from the buffer.
126 Keep xor result in variables for multiplication in next
127 round of loop. */
128 data = _mm256_xor_si256 (data, data2);
129 data2 = _mm256_loadu_si256 (datap);
130 data2 = _mm256_shuffle_epi8 (data2, shuffle_constant);
131 data = _mm256_xor_si256 (data, data2);
133 data3 = _mm256_xor_si256 (data3, data4);
134 data4 = _mm256_loadu_si256 (datap + 1);
135 data4 = _mm256_shuffle_epi8 (data4, shuffle_constant);
136 data3 = _mm256_xor_si256 (data3, data4);
138 data5 = _mm256_xor_si256 (data5, data6);
139 data6 = _mm256_loadu_si256 (datap + 2);
140 data6 = _mm256_shuffle_epi8 (data6, shuffle_constant);
141 data5 = _mm256_xor_si256 (data5, data6);
143 data7 = _mm256_xor_si256 (data7, data8);
144 data8 = _mm256_loadu_si256 (datap + 3);
145 data8 = _mm256_shuffle_epi8 (data8, shuffle_constant);
146 data7 = _mm256_xor_si256 (data7, data8);
148 bytes_read -= (16 * 4 * 2);
150 /* At end of loop we write out results from variables back into
151 the buffer, for use in single fold loop */
152 data = _mm256_shuffle_epi8 (data, shuffle_constant);
153 _mm256_storeu_si256 (datap, data);
154 data3 = _mm256_shuffle_epi8 (data3, shuffle_constant);
155 _mm256_storeu_si256 (datap + 1, data3);
156 data5 = _mm256_shuffle_epi8 (data5, shuffle_constant);
157 _mm256_storeu_si256 (datap + 2, data5);
158 data7 = _mm256_shuffle_epi8 (data7, shuffle_constant);
159 _mm256_storeu_si256 (datap + 3, data7);
162 /* Fold two 32-byte blocks into one 32-byte block */
163 if (bytes_read >= 64)
165 data = _mm256_loadu_si256 (datap);
166 data = _mm256_shuffle_epi8 (data, shuffle_constant);
167 xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0);
168 crc = 0;
169 data = _mm256_xor_si256 (data, xor_crc);
170 while (bytes_read >= 64)
172 datap++;
174 data2 = _mm256_clmulepi64_epi128 (data, single_mult_constant,
175 0x00);
176 data = _mm256_clmulepi64_epi128 (data, single_mult_constant,
177 0x11);
178 fold_data = _mm256_loadu_si256 (datap);
179 fold_data = _mm256_shuffle_epi8 (fold_data, shuffle_constant);
180 data = _mm256_xor_si256 (data, data2);
181 data = _mm256_xor_si256 (data, fold_data);
182 bytes_read -= 32;
184 data = _mm256_shuffle_epi8 (data, shuffle_constant);
185 _mm256_storeu_si256 (datap, data);
188 /* And finish up last 0-63 bytes in a byte by byte fashion */
189 unsigned char *cp = (unsigned char *)datap;
190 while (bytes_read--)
191 crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
192 if (feof (fp))
193 break;
196 *crc_out = crc;
197 *length_out = length;
199 return !ferror (fp);