sort: drop "note" from a --debug message
[coreutils.git] / src / cksum_avx512.c
blobb2cef2824fb003b41d6fb014818e3b6fbcea6b67
/* cksum -- calculate and print POSIX checksums and sizes of files
   Copyright (C) 2024-2025 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 #include <config.h>
19 #include <stdio.h>
20 #include <sys/types.h>
21 #include <stdint.h>
22 #include <x86intrin.h>
23 #include "system.h"
/* Number of bytes to read at once.  The read buffer in cksum_avx512 is
   declared as BUFLEN / sizeof (__m512i) vector elements, so this must
   stay a multiple of 64.  */
#define BUFLEN (1 << 16)

/* CRC lookup tables; only declared here, the definition is not part of
   this file.  cksum_avx512 indexes crctab[0] for its byte-at-a-time
   tail.  */
extern uint_fast32_t const crctab[8][256];

/* Prototype for the externally visible entry point defined below.  */
extern bool
cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
33 bool
34 cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
36 __m512i buf[BUFLEN / sizeof (__m512i)];
37 uint_fast32_t crc = 0;
38 uintmax_t length = 0;
39 size_t bytes_read;
40 __m512i single_mult_constant;
41 __m512i four_mult_constant;
42 __m512i shuffle_constant;
44 if (!fp || !crc_out || !length_out)
45 return false;
47 /* These constants and general algorithms are taken from the Intel whitepaper
48 "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
50 single_mult_constant = _mm512_set_epi64 (0x8833794C, 0xE6228B11,
51 0x8833794C, 0xE6228B11,
52 0x8833794C, 0xE6228B11,
53 0x8833794C, 0xE6228B11);
54 four_mult_constant = _mm512_set_epi64 (0xCBCF3BCB, 0x88FE2237,
55 0xCBCF3BCB, 0x88FE2237,
56 0xCBCF3BCB, 0x88FE2237,
57 0xCBCF3BCB, 0x88FE2237);
59 /* Constant to byteswap a full AVX512 register */
60 shuffle_constant = _mm512_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
61 9, 10, 11, 12, 13, 14, 15,
62 0, 1, 2, 3, 4, 5, 6, 7, 8,
63 9, 10, 11, 12, 13, 14, 15,
64 0, 1, 2, 3, 4, 5, 6, 7, 8,
65 9, 10, 11, 12, 13, 14, 15,
66 0, 1, 2, 3, 4, 5, 6, 7, 8,
67 9, 10, 11, 12, 13, 14, 15);
68 while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
70 __m512i data;
71 __m512i data2;
72 __m512i data3;
73 __m512i data4;
74 __m512i data5;
75 __m512i data6;
76 __m512i data7;
77 __m512i data8;
78 __m512i fold_data;
79 __m512i xor_crc;
81 __m512i *datap;
83 if (length + bytes_read < length)
85 errno = EOVERFLOW;
86 return false;
88 length += bytes_read;
90 datap = (__m512i *)buf;
92 /* Fold in parallel 32x 16-byte blocks into 16x 16-byte blocks */
93 if (bytes_read >= 16 * 8 * 4)
95 data = _mm512_loadu_si512 (datap);
96 data = _mm512_shuffle_epi8 (data, shuffle_constant);
97 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
98 calculated for previous BUFLEN buffer from fread */
99 xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, crc, 0, 0, 0);
101 crc = 0;
102 data = _mm512_xor_si512 (data, xor_crc);
103 data3 = _mm512_loadu_si512 (datap + 1);
104 data3 = _mm512_shuffle_epi8 (data3, shuffle_constant);
105 data5 = _mm512_loadu_si512 (datap + 2);
106 data5 = _mm512_shuffle_epi8 (data5, shuffle_constant);
107 data7 = _mm512_loadu_si512 (datap + 3);
108 data7 = _mm512_shuffle_epi8 (data7, shuffle_constant);
110 while (bytes_read >= 16 * 8 * 4)
112 datap += 4;
114 /* Do multiplication here for 16x consecutive 16 byte blocks */
115 data2 = _mm512_clmulepi64_epi128 (data, four_mult_constant,
116 0x00);
117 data = _mm512_clmulepi64_epi128 (data, four_mult_constant,
118 0x11);
119 data4 = _mm512_clmulepi64_epi128 (data3, four_mult_constant,
120 0x00);
121 data3 = _mm512_clmulepi64_epi128 (data3, four_mult_constant,
122 0x11);
123 data6 = _mm512_clmulepi64_epi128 (data5, four_mult_constant,
124 0x00);
125 data5 = _mm512_clmulepi64_epi128 (data5, four_mult_constant,
126 0x11);
127 data8 = _mm512_clmulepi64_epi128 (data7, four_mult_constant,
128 0x00);
129 data7 = _mm512_clmulepi64_epi128 (data7, four_mult_constant,
130 0x11);
132 /* Now multiplication results for the 16x blocks is xor:ed with
133 next 16x 16 byte blocks from the buffer. This effectively
134 "consumes" the first 16x blocks from the buffer.
135 Keep xor result in variables for multiplication in next
136 round of loop. */
137 data = _mm512_xor_si512 (data, data2);
138 data2 = _mm512_loadu_si512 (datap);
139 data2 = _mm512_shuffle_epi8 (data2, shuffle_constant);
140 data = _mm512_xor_si512 (data, data2);
142 data3 = _mm512_xor_si512 (data3, data4);
143 data4 = _mm512_loadu_si512 (datap + 1);
144 data4 = _mm512_shuffle_epi8 (data4, shuffle_constant);
145 data3 = _mm512_xor_si512 (data3, data4);
147 data5 = _mm512_xor_si512 (data5, data6);
148 data6 = _mm512_loadu_si512 (datap + 2);
149 data6 = _mm512_shuffle_epi8 (data6, shuffle_constant);
150 data5 = _mm512_xor_si512 (data5, data6);
152 data7 = _mm512_xor_si512 (data7, data8);
153 data8 = _mm512_loadu_si512 (datap + 3);
154 data8 = _mm512_shuffle_epi8 (data8, shuffle_constant);
155 data7 = _mm512_xor_si512 (data7, data8);
157 bytes_read -= (16 * 4 * 4);
159 /* At end of loop we write out results from variables back into
160 the buffer, for use in single fold loop */
161 data = _mm512_shuffle_epi8 (data, shuffle_constant);
162 _mm512_storeu_si512 (datap, data);
163 data3 = _mm512_shuffle_epi8 (data3, shuffle_constant);
164 _mm512_storeu_si512 (datap + 1, data3);
165 data5 = _mm512_shuffle_epi8 (data5, shuffle_constant);
166 _mm512_storeu_si512 (datap + 2, data5);
167 data7 = _mm512_shuffle_epi8 (data7, shuffle_constant);
168 _mm512_storeu_si512 (datap + 3, data7);
171 /* Fold two 64-byte blocks into one 64-byte block */
172 if (bytes_read >= 128)
174 data = _mm512_loadu_si512 (datap);
175 data = _mm512_shuffle_epi8 (data, shuffle_constant);
176 xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, crc, 0, 0, 0);
178 crc = 0;
179 data = _mm512_xor_si512 (data, xor_crc);
180 while (bytes_read >= 128)
182 datap++;
184 data2 = _mm512_clmulepi64_epi128 (data, single_mult_constant,
185 0x00);
186 data = _mm512_clmulepi64_epi128 (data, single_mult_constant,
187 0x11);
188 fold_data = _mm512_loadu_si512 (datap);
189 fold_data = _mm512_shuffle_epi8 (fold_data, shuffle_constant);
190 data = _mm512_xor_si512 (data, data2);
191 data = _mm512_xor_si512 (data, fold_data);
192 bytes_read -= 64;
194 data = _mm512_shuffle_epi8 (data, shuffle_constant);
195 _mm512_storeu_si512 (datap, data);
198 /* And finish up last 0-127 bytes in a byte by byte fashion */
199 unsigned char *cp = (unsigned char *)datap;
200 while (bytes_read--)
201 crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
202 if (feof (fp))
203 break;
206 *crc_out = crc;
207 *length_out = length;
209 return !ferror (fp);