/* cksum -- calculate and print POSIX checksums and sizes of files
   Copyright (C) 2024 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
#include <config.h>

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <errno.h>
#include <sys/types.h>

#include <arm_neon.h>

/* Number of bytes to read at once.  */
#define BUFLEN (1 << 16)

extern uint_fast32_t const crctab[8][256];

extern bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
/* Byte-reverse a 128-bit vector: reverse the bytes within each 64-bit
   lane, then swap the two lanes.  */
static uint64x2_t
bswap_neon (uint64x2_t in)
{
  uint64x2_t a =
    vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in)));
  a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a));
  return a;
}
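
/* Why the byte swap: the POSIX cksum CRC-32 (polynomial 0x04C11DB7) is
   defined most-significant-bit first, so each 16-byte block is reversed
   into big-endian order before it enters the carry-less multiplications
   below, and reversed again before being stored back to memory.  */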
/* Calculate CRC32 using the VMULL CPU instruction found in ARMv8 CPUs */

bool
cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
{
  uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)];
  uint_fast32_t crc = 0;
  uintmax_t length = 0;
  size_t bytes_read;
  poly64x2_t single_mult_constant;
  poly64x2_t four_mult_constant;

  if (!fp || !crc_out || !length_out)
    return false;
  /* These constants and general algorithms are taken from the Intel whitepaper
     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction".
   */
  single_mult_constant =
    vcombine_p64 (vcreate_p64 (0xE8A45605), vcreate_p64 (0xC5B9CD4C));
  four_mult_constant =
    vcombine_p64 (vcreate_p64 (0xE6228B11), vcreate_p64 (0x8833794C));
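
  /* Explanatory note on the folding constants (a summary of the whitepaper's
     idea; the exact exponents behind the values above are not restated here):
     for the CRC polynomial P(x), (M(x) * x^D) mod P(x) equals
     (M(x) * (x^D mod P(x))) mod P(x), so carry-less multiplying a block by a
     precomputed constant x^D mod P(x) is congruent to shifting that block D
     bits further along the message.  four_mult_constant folds a block forward
     across four 16-byte blocks per loop iteration, single_mult_constant across
     one; the two 64-bit lanes of each constant serve the low and high halves
     of a 16-byte block.  */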
  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
    {
      uint64x2_t *datap;
      uint64x2_t data;
      uint64x2_t data2;
      uint64x2_t data3;
      uint64x2_t data4;
      uint64x2_t data5;
      uint64x2_t data6;
      uint64x2_t data7;
      uint64x2_t data8;
      uint64x2_t fold_data;
      uint64x2_t xor_crc;

      if (length + bytes_read < length)
        {
          errno = EOVERFLOW;
          return false;
        }
      length += bytes_read;

      datap = (uint64x2_t *) buf;
      /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
      if (bytes_read >= 16 * 8)
        {
          data = vld1q_u64 ((uint64_t *) (datap));
          data = bswap_neon (data);
          /* XOR in the initial CRC value (for us 0, so no effect), or the CRC
             value calculated for the previous BUFLEN buffer from fread */
          xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32));
          crc = 0;
          data = veorq_u64 (data, xor_crc);
          data3 = vld1q_u64 ((uint64_t *) (datap + 1));
          data3 = bswap_neon (data3);
          data5 = vld1q_u64 ((uint64_t *) (datap + 2));
          data5 = bswap_neon (data5);
          data7 = vld1q_u64 ((uint64_t *) (datap + 3));
          data7 = bswap_neon (data7);
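
          /* Four independent accumulators (data, data3, data5, data7) are
             kept so the four carry-less multiplies per iteration do not
             depend on each other.  Only the first block has the running CRC
             XORed into its top 32 bits (aligned with the first four message
             bytes), which is how the CRC state is carried across fread
             calls.  */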
          while (bytes_read >= 16 * 8)
            {
              datap += 4;

              /* Do multiplication here for four consecutive 16 byte blocks */
              data2 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data),
                                         four_mult_constant));
              data4 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data3), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data3 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data3),
                                         four_mult_constant));
              data6 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data5), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data5 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data5),
                                         four_mult_constant));
              data8 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data7), 0),
                                         vgetq_lane_p64 (four_mult_constant,
                                                         0)));
              data7 =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data7),
                                         four_mult_constant));
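
              /* Each accumulator is split into 64-bit halves: vmull_p64
                 multiplies the low half by lane 0 of four_mult_constant and
                 vmull_high_p64 multiplies the high half by lane 1.  The two
                 128-bit products are then XORed together, and with the next
                 block from the buffer, which is the "fold" step.  */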
              /* Now the multiplication results for the four blocks are XORed
                 with the next four 16-byte blocks from the buffer.  This
                 effectively "consumes" the first four blocks from the buffer.
                 Keep the XOR results in variables for the multiplication in
                 the next round of the loop.  */
              data = veorq_u64 (data, data2);
              data2 = vld1q_u64 ((uint64_t *) (datap));
              data2 = bswap_neon (data2);
              data = veorq_u64 (data, data2);

              data3 = veorq_u64 (data3, data4);
              data4 = vld1q_u64 ((uint64_t *) (datap + 1));
              data4 = bswap_neon (data4);
              data3 = veorq_u64 (data3, data4);

              data5 = veorq_u64 (data5, data6);
              data6 = vld1q_u64 ((uint64_t *) (datap + 2));
              data6 = bswap_neon (data6);
              data5 = veorq_u64 (data5, data6);

              data7 = veorq_u64 (data7, data8);
              data8 = vld1q_u64 ((uint64_t *) (datap + 3));
              data8 = bswap_neon (data8);
              data7 = veorq_u64 (data7, data8);

              bytes_read -= (16 * 4);
            }
          /* At the end of the loop we write the results from the variables
             back into the buffer, for use in the single-fold loop */
          data = bswap_neon (data);
          vst1q_u64 ((uint64_t *) (datap), data);
          data3 = bswap_neon (data3);
          vst1q_u64 ((uint64_t *) (datap + 1), data3);
          data5 = bswap_neon (data5);
          vst1q_u64 ((uint64_t *) (datap + 2), data5);
          data7 = bswap_neon (data7);
          vst1q_u64 ((uint64_t *) (datap + 3), data7);
        }
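
      /* The four parallel accumulators are not combined with each other
         above; they are stored back into the buffer at datap..datap+3, and
         the single-fold loop below chains across them (and any remaining
         whole 16-byte blocks) one block at a time.  */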
      /* Fold two 16-byte blocks into one 16-byte block */
      if (bytes_read >= 32)
        {
          data = vld1q_u64 ((uint64_t *) (datap));
          data = bswap_neon (data);
          xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32));
          crc = 0;
          data = veorq_u64 (data, xor_crc);
          while (bytes_read >= 32)
            {
              datap++;
              data2 =
                vreinterpretq_u64_p128 (vmull_p64
                                        (vgetq_lane_p64
                                         (vreinterpretq_p64_u64 (data), 0),
                                         vgetq_lane_p64 (single_mult_constant,
                                                         0)));
              data =
                vreinterpretq_u64_p128 (vmull_high_p64
                                        (vreinterpretq_p64_u64 (data),
                                         single_mult_constant));
              fold_data = vld1q_u64 ((uint64_t *) (datap));
              fold_data = bswap_neon (fold_data);
              data = veorq_u64 (data, data2);
              data = veorq_u64 (data, fold_data);

              bytes_read -= 16;
            }

          data = bswap_neon (data);
          vst1q_u64 ((uint64_t *) (datap), data);
        }
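
      /* At this point at most one just-stored folded 16-byte block plus an
         incomplete tail remains.  Running it through the byte-wise table
         lookup below doubles as the final reduction of the folded value to a
         32-bit CRC.  */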
      /* And finish up the last 0-31 bytes in a byte-by-byte fashion */
      unsigned char *cp = (unsigned char *) datap;
      while (bytes_read--)
        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
      if (feof (fp))
        break;
    }

  *crc_out = crc;
  *length_out = length;

  return !ferror (fp);
}