1 /* cksum -- calculate and print POSIX checksums and sizes of files
2 Copyright (C) 2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
20 #include <sys/types.h>
22 #include <x86intrin.h>
/* Number of bytes requested from the input stream by each read.  */
#define BUFLEN (1 << 16)

/* Lookup tables used for the byte-wise CRC update: 8 tables of 256
   entries each.  Only declared here; the definition is not part of
   this section of the file.  */
extern uint_fast32_t const crctab[8][256];
31 cksum_avx512 (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
);
34 cksum_avx512 (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
)
36 __m512i buf
[BUFLEN
/ sizeof (__m512i
)];
37 uint_fast32_t crc
= 0;
40 __m512i single_mult_constant
;
41 __m512i four_mult_constant
;
42 __m512i shuffle_constant
;
44 if (!fp
|| !crc_out
|| !length_out
)
/* These constants and general algorithms are taken from the Intel whitepaper
   "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction".  */
50 single_mult_constant
= _mm512_set_epi64 (0x8833794C, 0xE6228B11,
51 0x8833794C, 0xE6228B11,
52 0x8833794C, 0xE6228B11,
53 0x8833794C, 0xE6228B11);
54 four_mult_constant
= _mm512_set_epi64 (0xCBCF3BCB, 0x88FE2237,
55 0xCBCF3BCB, 0x88FE2237,
56 0xCBCF3BCB, 0x88FE2237,
57 0xCBCF3BCB, 0x88FE2237);
/* Shuffle control that reverses the byte order within each 128-bit lane of an AVX512 register */
60 shuffle_constant
= _mm512_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
61 9, 10, 11, 12, 13, 14, 15,
62 0, 1, 2, 3, 4, 5, 6, 7, 8,
63 9, 10, 11, 12, 13, 14, 15,
64 0, 1, 2, 3, 4, 5, 6, 7, 8,
65 9, 10, 11, 12, 13, 14, 15,
66 0, 1, 2, 3, 4, 5, 6, 7, 8,
67 9, 10, 11, 12, 13, 14, 15);
68 while ((bytes_read
= fread (buf
, 1, BUFLEN
, fp
)) > 0)
83 if (length
+ bytes_read
< length
)
90 datap
= (__m512i
*)buf
;
92 /* Fold in parallel 32x 16-byte blocks into 16x 16-byte blocks */
93 if (bytes_read
>= 16 * 8 * 4)
95 data
= _mm512_loadu_si512 (datap
);
96 data
= _mm512_shuffle_epi8 (data
, shuffle_constant
);
97 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
98 calculated for previous BUFLEN buffer from fread */
99 xor_crc
= _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, crc
, 0, 0, 0);
102 data
= _mm512_xor_si512 (data
, xor_crc
);
103 data3
= _mm512_loadu_si512 (datap
+ 1);
104 data3
= _mm512_shuffle_epi8 (data3
, shuffle_constant
);
105 data5
= _mm512_loadu_si512 (datap
+ 2);
106 data5
= _mm512_shuffle_epi8 (data5
, shuffle_constant
);
107 data7
= _mm512_loadu_si512 (datap
+ 3);
108 data7
= _mm512_shuffle_epi8 (data7
, shuffle_constant
);
110 while (bytes_read
>= 16 * 8 * 4)
114 /* Do multiplication here for 16x consecutive 16 byte blocks */
115 data2
= _mm512_clmulepi64_epi128 (data
, four_mult_constant
,
117 data
= _mm512_clmulepi64_epi128 (data
, four_mult_constant
,
119 data4
= _mm512_clmulepi64_epi128 (data3
, four_mult_constant
,
121 data3
= _mm512_clmulepi64_epi128 (data3
, four_mult_constant
,
123 data6
= _mm512_clmulepi64_epi128 (data5
, four_mult_constant
,
125 data5
= _mm512_clmulepi64_epi128 (data5
, four_mult_constant
,
127 data8
= _mm512_clmulepi64_epi128 (data7
, four_mult_constant
,
129 data7
= _mm512_clmulepi64_epi128 (data7
, four_mult_constant
,
/* Now the multiplication results for the 16x blocks are XORed with
   the next 16x 16-byte blocks from the buffer.  This effectively
   "consumes" the first 16x blocks from the buffer.
   Keep the XOR results in variables for the multiplication in the next
   iteration of the loop.  */
137 data
= _mm512_xor_si512 (data
, data2
);
138 data2
= _mm512_loadu_si512 (datap
);
139 data2
= _mm512_shuffle_epi8 (data2
, shuffle_constant
);
140 data
= _mm512_xor_si512 (data
, data2
);
142 data3
= _mm512_xor_si512 (data3
, data4
);
143 data4
= _mm512_loadu_si512 (datap
+ 1);
144 data4
= _mm512_shuffle_epi8 (data4
, shuffle_constant
);
145 data3
= _mm512_xor_si512 (data3
, data4
);
147 data5
= _mm512_xor_si512 (data5
, data6
);
148 data6
= _mm512_loadu_si512 (datap
+ 2);
149 data6
= _mm512_shuffle_epi8 (data6
, shuffle_constant
);
150 data5
= _mm512_xor_si512 (data5
, data6
);
152 data7
= _mm512_xor_si512 (data7
, data8
);
153 data8
= _mm512_loadu_si512 (datap
+ 3);
154 data8
= _mm512_shuffle_epi8 (data8
, shuffle_constant
);
155 data7
= _mm512_xor_si512 (data7
, data8
);
157 bytes_read
-= (16 * 4 * 4);
159 /* At end of loop we write out results from variables back into
160 the buffer, for use in single fold loop */
161 data
= _mm512_shuffle_epi8 (data
, shuffle_constant
);
162 _mm512_storeu_si512 (datap
, data
);
163 data3
= _mm512_shuffle_epi8 (data3
, shuffle_constant
);
164 _mm512_storeu_si512 (datap
+ 1, data3
);
165 data5
= _mm512_shuffle_epi8 (data5
, shuffle_constant
);
166 _mm512_storeu_si512 (datap
+ 2, data5
);
167 data7
= _mm512_shuffle_epi8 (data7
, shuffle_constant
);
168 _mm512_storeu_si512 (datap
+ 3, data7
);
171 /* Fold two 64-byte blocks into one 64-byte block */
172 if (bytes_read
>= 128)
174 data
= _mm512_loadu_si512 (datap
);
175 data
= _mm512_shuffle_epi8 (data
, shuffle_constant
);
176 xor_crc
= _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, crc
, 0, 0, 0);
179 data
= _mm512_xor_si512 (data
, xor_crc
);
180 while (bytes_read
>= 128)
184 data2
= _mm512_clmulepi64_epi128 (data
, single_mult_constant
,
186 data
= _mm512_clmulepi64_epi128 (data
, single_mult_constant
,
188 fold_data
= _mm512_loadu_si512 (datap
);
189 fold_data
= _mm512_shuffle_epi8 (fold_data
, shuffle_constant
);
190 data
= _mm512_xor_si512 (data
, data2
);
191 data
= _mm512_xor_si512 (data
, fold_data
);
194 data
= _mm512_shuffle_epi8 (data
, shuffle_constant
);
195 _mm512_storeu_si512 (datap
, data
);
198 /* And finish up last 0-127 bytes in a byte by byte fashion */
199 unsigned char *cp
= (unsigned char *)datap
;
201 crc
= (crc
<< 8) ^ crctab
[0][((crc
>> 24) ^ *cp
++) & 0xFF];
207 *length_out
= length
;