ptx: prefer xpalloc to x2nrealloc
[coreutils.git] / src / cksum_pclmul.c
blobf88f6cc01693d7c25abb57d2bc8420f5d8495163
1 /* cksum -- calculate and print POSIX checksums and sizes of files
2 Copyright (C) 1992-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include <stdio.h>
20 #include <sys/types.h>
21 #include <stdint.h>
22 #include <x86intrin.h>
23 #include "system.h"
25 /* Number of bytes to read at once. */
26 #define BUFLEN (1 << 16)
28 extern uint_fast32_t const crctab[8][256];
30 extern bool
31 cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
33 /* Calculate CRC32 using PCLMULQDQ CPU instruction found in x86/x64 CPUs */
35 bool
36 cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
38 __m128i buf[BUFLEN / sizeof (__m128i)];
39 uint_fast32_t crc = 0;
40 uintmax_t length = 0;
41 size_t bytes_read;
42 __m128i single_mult_constant;
43 __m128i four_mult_constant;
44 __m128i shuffle_constant;
46 if (!fp || !crc_out || !length_out)
47 return false;
49 /* These constants and general algorithms are taken from the Intel whitepaper
50 "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
52 single_mult_constant = _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605);
53 four_mult_constant = _mm_set_epi64x (0x8833794C, 0xE6228B11);
55 /* Constant to byteswap a full SSE register */
56 shuffle_constant = _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
57 9, 10, 11, 12, 13, 14, 15);
59 while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
61 __m128i *datap;
62 __m128i data;
63 __m128i data2;
64 __m128i data3;
65 __m128i data4;
66 __m128i data5;
67 __m128i data6;
68 __m128i data7;
69 __m128i data8;
70 __m128i fold_data;
71 __m128i xor_crc;
73 if (length + bytes_read < length)
75 errno = EOVERFLOW;
76 return false;
78 length += bytes_read;
80 datap = (__m128i *)buf;
82 /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
83 if (bytes_read >= 16 * 8)
85 data = _mm_loadu_si128 (datap);
86 data = _mm_shuffle_epi8 (data, shuffle_constant);
87 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
88 calculated for previous BUFLEN buffer from fread */
89 xor_crc = _mm_set_epi32 (crc, 0, 0, 0);
90 crc = 0;
91 data = _mm_xor_si128 (data, xor_crc);
92 data3 = _mm_loadu_si128 (datap + 1);
93 data3 = _mm_shuffle_epi8 (data3, shuffle_constant);
94 data5 = _mm_loadu_si128 (datap + 2);
95 data5 = _mm_shuffle_epi8 (data5, shuffle_constant);
96 data7 = _mm_loadu_si128 (datap + 3);
97 data7 = _mm_shuffle_epi8 (data7, shuffle_constant);
100 while (bytes_read >= 16 * 8)
102 datap += 4;
104 /* Do multiplication here for four consecutive 16 byte blocks */
105 data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
106 data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
107 data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
108 data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
109 data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
110 data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
111 data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
112 data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
114 /* Now multiplication results for the four blocks is xor:ed with
115 next four 16 byte blocks from the buffer. This effectively
116 "consumes" the first four blocks from the buffer.
117 Keep xor result in variables for multiplication in next
118 round of loop. */
119 data = _mm_xor_si128 (data, data2);
120 data2 = _mm_loadu_si128 (datap);
121 data2 = _mm_shuffle_epi8 (data2, shuffle_constant);
122 data = _mm_xor_si128 (data, data2);
124 data3 = _mm_xor_si128 (data3, data4);
125 data4 = _mm_loadu_si128 (datap + 1);
126 data4 = _mm_shuffle_epi8 (data4, shuffle_constant);
127 data3 = _mm_xor_si128 (data3, data4);
129 data5 = _mm_xor_si128 (data5, data6);
130 data6 = _mm_loadu_si128 (datap + 2);
131 data6 = _mm_shuffle_epi8 (data6, shuffle_constant);
132 data5 = _mm_xor_si128 (data5, data6);
134 data7 = _mm_xor_si128 (data7, data8);
135 data8 = _mm_loadu_si128 (datap + 3);
136 data8 = _mm_shuffle_epi8 (data8, shuffle_constant);
137 data7 = _mm_xor_si128 (data7, data8);
139 bytes_read -= (16 * 4);
141 /* At end of loop we write out results from variables back into
142 the buffer, for use in single fold loop */
143 data = _mm_shuffle_epi8 (data, shuffle_constant);
144 _mm_storeu_si128 (datap, data);
145 data3 = _mm_shuffle_epi8 (data3, shuffle_constant);
146 _mm_storeu_si128 (datap + 1, data3);
147 data5 = _mm_shuffle_epi8 (data5, shuffle_constant);
148 _mm_storeu_si128 (datap + 2, data5);
149 data7 = _mm_shuffle_epi8 (data7, shuffle_constant);
150 _mm_storeu_si128 (datap + 3, data7);
153 /* Fold two 16-byte blocks into one 16-byte block */
154 if (bytes_read >= 32)
156 data = _mm_loadu_si128 (datap);
157 data = _mm_shuffle_epi8 (data, shuffle_constant);
158 xor_crc = _mm_set_epi32 (crc, 0, 0, 0);
159 crc = 0;
160 data = _mm_xor_si128 (data, xor_crc);
161 while (bytes_read >= 32)
163 datap++;
165 data2 = _mm_clmulepi64_si128 (data, single_mult_constant, 0x00);
166 data = _mm_clmulepi64_si128 (data, single_mult_constant, 0x11);
167 fold_data = _mm_loadu_si128 (datap);
168 fold_data = _mm_shuffle_epi8 (fold_data, shuffle_constant);
169 data = _mm_xor_si128 (data, data2);
170 data = _mm_xor_si128 (data, fold_data);
171 bytes_read -= 16;
173 data = _mm_shuffle_epi8 (data, shuffle_constant);
174 _mm_storeu_si128 (datap, data);
177 /* And finish up last 0-31 bytes in a byte by byte fashion */
178 unsigned char *cp = (unsigned char *)datap;
179 while (bytes_read--)
180 crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
181 if (feof (fp))
182 break;
185 *crc_out = crc;
186 *length_out = length;
188 return !ferror (fp);