1 /* cksum -- calculate and print POSIX checksums and sizes of files
2 Copyright (C) 2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
20 #include <sys/types.h>
22 #include <x86intrin.h>
25 /* Number of bytes requested from each fread call (1 << 16 = 65536). The same constant sizes the local __m256i read buffer in cksum_avx2. */
26 #define BUFLEN (1 << 16)
/* CRC lookup tables, declared here and defined outside this view.
   The code visible here only indexes crctab[0], one byte at a time,
   when finishing the bytes left over after the vectorized folding. */
28 extern uint_fast32_t const crctab
[8][256];
/* Forward declaration of cksum_avx2, which is defined directly below.
   NOTE(review): no return type appears on this declaration in the text
   as shown -- confirm against the pristine file. */
31 cksum_avx2 (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
);
34 cksum_avx2 (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
)
36 __m256i buf
[BUFLEN
/ sizeof (__m256i
)];
37 uint_fast32_t crc
= 0;
40 __m256i single_mult_constant
;
41 __m256i four_mult_constant
;
42 __m256i shuffle_constant
;
44 if (!fp
|| !crc_out
|| !length_out
)
47 /* These constants and general algorithms are taken from the Intel whitepaper
48 "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction". */
50 single_mult_constant
= _mm256_set_epi64x (0x569700E5, 0x75BE46B7,
51 0x569700E5, 0x75BE46B7);
52 four_mult_constant
= _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB,
53 0x10BD4D7C, 0x567FDDEB);
55 /* Constant to byteswap a full AVX2 register */
56 shuffle_constant
= _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
57 9, 10, 11, 12, 13, 14, 15,
58 0, 1, 2, 3, 4, 5, 6, 7, 8,
59 9, 10, 11, 12, 13, 14, 15);
60 while ((bytes_read
= fread (buf
, 1, BUFLEN
, fp
)) > 0)
75 if (length
+ bytes_read
< length
)
82 datap
= (__m256i
*)buf
;
84 /* Fold in parallel 16x 16-byte blocks into 8x 16-byte blocks */
85 if (bytes_read
>= 16 * 8 * 2)
87 data
= _mm256_loadu_si256 (datap
);
88 data
= _mm256_shuffle_epi8 (data
, shuffle_constant
);
89 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
90 calculated for previous BUFLEN buffer from fread */
91 xor_crc
= _mm256_set_epi32 (0, 0, 0, 0, crc
, 0, 0, 0);
93 data
= _mm256_xor_si256 (data
, xor_crc
);
94 data3
= _mm256_loadu_si256 (datap
+ 1);
95 data3
= _mm256_shuffle_epi8 (data3
, shuffle_constant
);
96 data5
= _mm256_loadu_si256 (datap
+ 2);
97 data5
= _mm256_shuffle_epi8 (data5
, shuffle_constant
);
98 data7
= _mm256_loadu_si256 (datap
+ 3);
99 data7
= _mm256_shuffle_epi8 (data7
, shuffle_constant
);
101 while (bytes_read
>= 16 * 8 * 2)
105 /* Do multiplication here for 8x consecutive 16 byte blocks */
106 data2
= _mm256_clmulepi64_epi128 (data
, four_mult_constant
,
108 data
= _mm256_clmulepi64_epi128 (data
, four_mult_constant
,
110 data4
= _mm256_clmulepi64_epi128 (data3
, four_mult_constant
,
112 data3
= _mm256_clmulepi64_epi128 (data3
, four_mult_constant
,
114 data6
= _mm256_clmulepi64_epi128 (data5
, four_mult_constant
,
116 data5
= _mm256_clmulepi64_epi128 (data5
, four_mult_constant
,
118 data8
= _mm256_clmulepi64_epi128 (data7
, four_mult_constant
,
120 data7
= _mm256_clmulepi64_epi128 (data7
, four_mult_constant
,
123 /* The multiplication results for the 8x blocks are now XORed with the
124 next 8x 16-byte blocks from the buffer. This effectively
125 "consumes" the first 8x blocks from the buffer.
126 Keep the XOR results in variables for the multiplication in the next loop iteration. */
128 data
= _mm256_xor_si256 (data
, data2
);
129 data2
= _mm256_loadu_si256 (datap
);
130 data2
= _mm256_shuffle_epi8 (data2
, shuffle_constant
);
131 data
= _mm256_xor_si256 (data
, data2
);
133 data3
= _mm256_xor_si256 (data3
, data4
);
134 data4
= _mm256_loadu_si256 (datap
+ 1);
135 data4
= _mm256_shuffle_epi8 (data4
, shuffle_constant
);
136 data3
= _mm256_xor_si256 (data3
, data4
);
138 data5
= _mm256_xor_si256 (data5
, data6
);
139 data6
= _mm256_loadu_si256 (datap
+ 2);
140 data6
= _mm256_shuffle_epi8 (data6
, shuffle_constant
);
141 data5
= _mm256_xor_si256 (data5
, data6
);
143 data7
= _mm256_xor_si256 (data7
, data8
);
144 data8
= _mm256_loadu_si256 (datap
+ 3);
145 data8
= _mm256_shuffle_epi8 (data8
, shuffle_constant
);
146 data7
= _mm256_xor_si256 (data7
, data8
);
148 bytes_read
-= (16 * 4 * 2);
150 /* At end of loop we write out results from variables back into
151 the buffer, for use in single fold loop */
152 data
= _mm256_shuffle_epi8 (data
, shuffle_constant
);
153 _mm256_storeu_si256 (datap
, data
);
154 data3
= _mm256_shuffle_epi8 (data3
, shuffle_constant
);
155 _mm256_storeu_si256 (datap
+ 1, data3
);
156 data5
= _mm256_shuffle_epi8 (data5
, shuffle_constant
);
157 _mm256_storeu_si256 (datap
+ 2, data5
);
158 data7
= _mm256_shuffle_epi8 (data7
, shuffle_constant
);
159 _mm256_storeu_si256 (datap
+ 3, data7
);
162 /* Fold two 32-byte blocks into one 32-byte block */
163 if (bytes_read
>= 64)
165 data
= _mm256_loadu_si256 (datap
);
166 data
= _mm256_shuffle_epi8 (data
, shuffle_constant
);
167 xor_crc
= _mm256_set_epi32 (0, 0, 0, 0, crc
, 0, 0, 0);
169 data
= _mm256_xor_si256 (data
, xor_crc
);
170 while (bytes_read
>= 64)
174 data2
= _mm256_clmulepi64_epi128 (data
, single_mult_constant
,
176 data
= _mm256_clmulepi64_epi128 (data
, single_mult_constant
,
178 fold_data
= _mm256_loadu_si256 (datap
);
179 fold_data
= _mm256_shuffle_epi8 (fold_data
, shuffle_constant
);
180 data
= _mm256_xor_si256 (data
, data2
);
181 data
= _mm256_xor_si256 (data
, fold_data
);
184 data
= _mm256_shuffle_epi8 (data
, shuffle_constant
);
185 _mm256_storeu_si256 (datap
, data
);
188 /* And finish up last 0-63 bytes in a byte by byte fashion */
189 unsigned char *cp
= (unsigned char *)datap
;
191 crc
= (crc
<< 8) ^ crctab
[0][((crc
>> 24) ^ *cp
++) & 0xFF];
197 *length_out
= length
;