1 /* cksum -- calculate and print POSIX checksums and sizes of files
2 Copyright (C) 1992-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
20 #include <sys/types.h>
22 #include <x86intrin.h>
25 /* Number of bytes to read at once. */
26 #define BUFLEN (1 << 16)
28 extern uint_fast32_t const crctab
[8][256];
31 cksum_pclmul (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
);
33 /* Calculate CRC32 using PCLMULQDQ CPU instruction found in x86/x64 CPUs */
36 cksum_pclmul (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
)
38 __m128i buf
[BUFLEN
/ sizeof (__m128i
)];
39 uint_fast32_t crc
= 0;
42 __m128i single_mult_constant
;
43 __m128i four_mult_constant
;
44 __m128i shuffle_constant
;
46 if (!fp
|| !crc_out
|| !length_out
)
49 /* These constants and general algorithms are taken from the Intel whitepaper
50 "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
52 single_mult_constant
= _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605);
53 four_mult_constant
= _mm_set_epi64x (0x8833794C, 0xE6228B11);
55 /* Constant to byteswap a full SSE register */
56 shuffle_constant
= _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
57 9, 10, 11, 12, 13, 14, 15);
59 while ((bytes_read
= fread (buf
, 1, BUFLEN
, fp
)) > 0)
73 if (length
+ bytes_read
< length
)
80 datap
= (__m128i
*)buf
;
82 /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
83 if (bytes_read
>= 16 * 8)
85 data
= _mm_loadu_si128 (datap
);
86 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
87 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
88 calculated for previous BUFLEN buffer from fread */
89 xor_crc
= _mm_set_epi32 (crc
, 0, 0, 0);
91 data
= _mm_xor_si128 (data
, xor_crc
);
92 data3
= _mm_loadu_si128 (datap
+ 1);
93 data3
= _mm_shuffle_epi8 (data3
, shuffle_constant
);
94 data5
= _mm_loadu_si128 (datap
+ 2);
95 data5
= _mm_shuffle_epi8 (data5
, shuffle_constant
);
96 data7
= _mm_loadu_si128 (datap
+ 3);
97 data7
= _mm_shuffle_epi8 (data7
, shuffle_constant
);
100 while (bytes_read
>= 16 * 8)
104 /* Do multiplication here for four consecutive 16 byte blocks */
105 data2
= _mm_clmulepi64_si128 (data
, four_mult_constant
, 0x00);
106 data
= _mm_clmulepi64_si128 (data
, four_mult_constant
, 0x11);
107 data4
= _mm_clmulepi64_si128 (data3
, four_mult_constant
, 0x00);
108 data3
= _mm_clmulepi64_si128 (data3
, four_mult_constant
, 0x11);
109 data6
= _mm_clmulepi64_si128 (data5
, four_mult_constant
, 0x00);
110 data5
= _mm_clmulepi64_si128 (data5
, four_mult_constant
, 0x11);
111 data8
= _mm_clmulepi64_si128 (data7
, four_mult_constant
, 0x00);
112 data7
= _mm_clmulepi64_si128 (data7
, four_mult_constant
, 0x11);
114 /* Now multiplication results for the four blocks is xor:ed with
115 next four 16 byte blocks from the buffer. This effectively
116 "consumes" the first four blocks from the buffer.
117 Keep xor result in variables for multiplication in next
119 data
= _mm_xor_si128 (data
, data2
);
120 data2
= _mm_loadu_si128 (datap
);
121 data2
= _mm_shuffle_epi8 (data2
, shuffle_constant
);
122 data
= _mm_xor_si128 (data
, data2
);
124 data3
= _mm_xor_si128 (data3
, data4
);
125 data4
= _mm_loadu_si128 (datap
+ 1);
126 data4
= _mm_shuffle_epi8 (data4
, shuffle_constant
);
127 data3
= _mm_xor_si128 (data3
, data4
);
129 data5
= _mm_xor_si128 (data5
, data6
);
130 data6
= _mm_loadu_si128 (datap
+ 2);
131 data6
= _mm_shuffle_epi8 (data6
, shuffle_constant
);
132 data5
= _mm_xor_si128 (data5
, data6
);
134 data7
= _mm_xor_si128 (data7
, data8
);
135 data8
= _mm_loadu_si128 (datap
+ 3);
136 data8
= _mm_shuffle_epi8 (data8
, shuffle_constant
);
137 data7
= _mm_xor_si128 (data7
, data8
);
139 bytes_read
-= (16 * 4);
141 /* At end of loop we write out results from variables back into
142 the buffer, for use in single fold loop */
143 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
144 _mm_storeu_si128 (datap
, data
);
145 data3
= _mm_shuffle_epi8 (data3
, shuffle_constant
);
146 _mm_storeu_si128 (datap
+ 1, data3
);
147 data5
= _mm_shuffle_epi8 (data5
, shuffle_constant
);
148 _mm_storeu_si128 (datap
+ 2, data5
);
149 data7
= _mm_shuffle_epi8 (data7
, shuffle_constant
);
150 _mm_storeu_si128 (datap
+ 3, data7
);
153 /* Fold two 16-byte blocks into one 16-byte block */
154 if (bytes_read
>= 32)
156 data
= _mm_loadu_si128 (datap
);
157 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
158 xor_crc
= _mm_set_epi32 (crc
, 0, 0, 0);
160 data
= _mm_xor_si128 (data
, xor_crc
);
161 while (bytes_read
>= 32)
165 data2
= _mm_clmulepi64_si128 (data
, single_mult_constant
, 0x00);
166 data
= _mm_clmulepi64_si128 (data
, single_mult_constant
, 0x11);
167 fold_data
= _mm_loadu_si128 (datap
);
168 fold_data
= _mm_shuffle_epi8 (fold_data
, shuffle_constant
);
169 data
= _mm_xor_si128 (data
, data2
);
170 data
= _mm_xor_si128 (data
, fold_data
);
173 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
174 _mm_storeu_si128 (datap
, data
);
177 /* And finish up last 0-31 bytes in a byte by byte fashion */
178 unsigned char *cp
= (unsigned char *)datap
;
180 crc
= (crc
<< 8) ^ crctab
[0][((crc
>> 24) ^ *cp
++) & 0xFF];
186 *length_out
= length
;