maint: correct © dates for hardware optimized crc routines
[coreutils.git] / src / cksum_vmull.c
/* cksum -- calculate and print POSIX checksums and sizes of files
   Copyright (C) 2024 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
#include <config.h>

#include <stdio.h>
#include <sys/types.h>
#include <stdint.h>
#include <arm_neon.h>
#include "system.h"
/* Number of bytes to read at once.  */
#define BUFLEN (1 << 16)

extern uint_fast32_t const crctab[8][256];

extern bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
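
/* Reverse the order of all 16 bytes in IN: byte-reverse each 64-bit lane,
   then swap the two lanes.  vld1q_u64 loads little-endian data on AArch64,
   while the folding below treats each 16-byte block as one big-endian
   128-bit value, so every block goes through this helper after a load and
   before a store.  */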
static uint64x2_t
bswap_neon (uint64x2_t in)
{
  uint64x2_t a =
    vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in)));
  a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a));
  return a;
}

/* Calculate CRC32 using the VMULL CPU instruction found in ARMv8 CPUs.  */

bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
{
  uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)];
  uint_fast32_t crc = 0;
  uintmax_t length = 0;
  size_t bytes_read;
  poly64x2_t single_mult_constant;
  poly64x2_t four_mult_constant;

  if (!fp || !crc_out || !length_out)
    return false;
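
  /* cksum's CRC uses the non-reflected polynomial 0x04C11DB7.  The two
     vectors below hold the fold multipliers for that polynomial: the low
     lane of each constant multiplies the low 64 bits of a block and the
     high lane the high 64 bits; single_mult_constant folds a block forward
     across one 16-byte block, four_mult_constant across four (64 bytes).  */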
  /* These constants and general algorithms are taken from the Intel whitepaper
     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
   */
  single_mult_constant =
    vcombine_p64 (vcreate_p64 (0xE8A45605), vcreate_p64 (0xC5B9CD4C));
  four_mult_constant =
    vcombine_p64 (vcreate_p64 (0xE6228B11), vcreate_p64 (0x8833794C));

  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
    {
      uint64x2_t *datap;
      uint64x2_t data;
      uint64x2_t data2;
      uint64x2_t data3;
      uint64x2_t data4;
      uint64x2_t data5;
      uint64x2_t data6;
      uint64x2_t data7;
      uint64x2_t data8;
      uint64x2_t fold_data;
      uint64x2_t xor_crc;

      if (length + bytes_read < length)
        {
          errno = EOVERFLOW;
          return false;
        }
      length += bytes_read;

      datap = (uint64x2_t *) buf;

      /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
      if (bytes_read >= 16 * 8)
        {
          data = vld1q_u64 ((uint64_t *) (datap));
          data = bswap_neon (data);
          /* XOR in initial CRC value (for us 0 so no effect), or CRC value
             calculated for previous BUFLEN buffer from fread */
          xor_crc = vcombine_u64 (vcreate_u64 (0),
                                  vcreate_u64 ((uint64_t) crc << 32));
          crc = 0;
          data = veorq_u64 (data, xor_crc);
          data3 = vld1q_u64 ((uint64_t *) (datap + 1));
          data3 = bswap_neon (data3);
          data5 = vld1q_u64 ((uint64_t *) (datap + 2));
          data5 = bswap_neon (data5);
          data7 = vld1q_u64 ((uint64_t *) (datap + 3));
          data7 = bswap_neon (data7);
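
          /* Keep four 16-byte blocks in flight.  Each pass of the loop below
             multiplies them by four_mult_constant and XORs in the next
             64 bytes of the buffer, so each pass consumes 64 bytes.  */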
          while (bytes_read >= 16 * 8)
            {
              datap += 4;

              /* Do multiplication here for four consecutive 16 byte blocks */
              data2 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data),
                                         four_mult_constant));
              data4 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data3), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data3 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data3),
                                         four_mult_constant));
              data6 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data5), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data5 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data5),
                                         four_mult_constant));
              data8 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data7), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data7 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data7),
                                         four_mult_constant));

              /* The multiplication results for the four blocks are now XORed
                 with the next four 16-byte blocks from the buffer.  This
                 effectively "consumes" the first four blocks from the buffer.
                 Keep the XOR results in variables for the multiplication in
                 the next round of the loop.  */
              data = veorq_u64 (data, data2);
              data2 = vld1q_u64 ((uint64_t *) (datap));
              data2 = bswap_neon (data2);
              data = veorq_u64 (data, data2);

              data3 = veorq_u64 (data3, data4);
              data4 = vld1q_u64 ((uint64_t *) (datap + 1));
              data4 = bswap_neon (data4);
              data3 = veorq_u64 (data3, data4);

              data5 = veorq_u64 (data5, data6);
              data6 = vld1q_u64 ((uint64_t *) (datap + 2));
              data6 = bswap_neon (data6);
              data5 = veorq_u64 (data5, data6);

              data7 = veorq_u64 (data7, data8);
              data8 = vld1q_u64 ((uint64_t *) (datap + 3));
              data8 = bswap_neon (data8);
              data7 = veorq_u64 (data7, data8);

              bytes_read -= (16 * 4);
            }

          /* At end of loop we write out results from variables back into
             the buffer, for use in the single fold loop */
          data = bswap_neon (data);
          vst1q_u64 ((uint64_t *) (datap), data);
          data3 = bswap_neon (data3);
          vst1q_u64 ((uint64_t *) (datap + 1), data3);
          data5 = bswap_neon (data5);
          vst1q_u64 ((uint64_t *) (datap + 2), data5);
          data7 = bswap_neon (data7);
          vst1q_u64 ((uint64_t *) (datap + 3), data7);
        }
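
      /* At this point fewer than 128 bytes of this buffer remain unconsumed:
         any tail bytes plus, if the loop above ran, the four folded blocks
         just stored back at datap.  */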
      /* Fold two 16-byte blocks into one 16-byte block */
      if (bytes_read >= 32)
        {
          data = vld1q_u64 ((uint64_t *) (datap));
          data = bswap_neon (data);
          xor_crc = vcombine_u64 (vcreate_u64 (0),
                                  vcreate_u64 ((uint64_t) crc << 32));
          crc = 0;
          data = veorq_u64 (data, xor_crc);
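          /* Fold 16 bytes at a time: multiply the in-flight block by
             single_mult_constant and XOR in the next 16-byte block.  */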
          while (bytes_read >= 32)
            {
              datap++;

              data2 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data), 0),
                                         vgetq_lane_p64 (single_mult_constant,
                                                         0)));
              data =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data),
                                         single_mult_constant));
              fold_data = vld1q_u64 ((uint64_t *) (datap));
              fold_data = bswap_neon (fold_data);
              data = veorq_u64 (data, data2);
              data = veorq_u64 (data, fold_data);
              bytes_read -= 16;
            }
          data = bswap_neon (data);
          vst1q_u64 ((uint64_t *) (datap), data);
        }

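      /* The bytes left over (including, when the fold above ran, the 16-byte
         block just stored back) go through the byte-wise table CRC, which
         also reduces the folded value down to the running 32-bit CRC.  */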
      /* And finish up last 0-31 bytes in a byte by byte fashion */
      unsigned char *cp = (unsigned char *) datap;
      while (bytes_read--)
        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
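      /* Stop once end of file is reached; any other cause of a short read
         is reported through ferror below.  */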
      if (feof (fp))
        break;
    }

  *crc_out = crc;
  *length_out = length;

  return !ferror (fp);
}
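
/* Rough usage sketch (illustrative only, not part of this file's interface):
   the real caller in cksum.c selects this routine at run time when the CPU
   provides the 64-bit polynomial multiply, then post-processes the returned
   CRC per POSIX.  Something along these lines, where "filename" is a
   hypothetical placeholder:

     uint_fast32_t crc;
     uintmax_t len;
     FILE *fp = fopen (filename, "rb");
     if (fp && cksum_vmull (fp, &crc, &len))
       {
         // POSIX cksum: fold the length into the CRC, then complement.
         for (uintmax_t n = len; n; n >>= 8)
           crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ n) & 0xFF];
         crc = ~crc & 0xFFFFFFFF;
       }
*/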