/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Authors:
 *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *  Jim Guilford    <james.guilford@intel.com>
 *  Vinodh Gopal    <vinodh.gopal@intel.com>
 *  Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "deflate.h"

#include <inttypes.h>
#include <emmintrin.h>
#include <immintrin.h>
#include <wmmintrin.h>
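
/*
 * The running CRC is kept as a 4 x 128-bit folded state (xmm_crc0..xmm_crc3),
 * with a fifth register (xmm_crc_part) holding not-yet-folded tail bytes.
 * The CRC_LOAD/CRC_SAVE macros below spill this state to and from s->crc0 so
 * it can persist in the deflate_state between calls.
 */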
#define CRC_LOAD(s) \
    do { \
        __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
        __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
        __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
        __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
        __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);

#define CRC_SAVE(s) \
        _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
        _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
        _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
        _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
        _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
    } while (0);

ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
{
    CRC_LOAD(s)

    xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
    xmm_crc1 = _mm_setzero_si128();
    xmm_crc2 = _mm_setzero_si128();
    xmm_crc3 = _mm_setzero_si128();

    CRC_SAVE(s)

    s->strm->adler = 0;
}
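
/*
 * fold_1() .. fold_4() advance the folded CRC state over 1..4 more 16-byte
 * blocks: each folded accumulator has its two 64-bit halves carry-less
 * multiplied (PCLMULQDQ) by the fold-by-4 constant pair in xmm_fold4 and the
 * products XORed together.  fold_1..fold_3 also rotate the accumulators so
 * the caller can XOR the newly read blocks into the upper slots, while
 * fold_4 folds all four accumulators in place.
 */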
local void fold_1(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc3, ps_res;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);

    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = x_tmp3;
    *xmm_crc3 = _mm_castps_si128(ps_res);
}

local void fold_2(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp3, x_tmp2;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;

    x_tmp3 = *xmm_crc3;
    x_tmp2 = *xmm_crc2;

    *xmm_crc3 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);

    *xmm_crc2 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);

    *xmm_crc0 = x_tmp2;
    *xmm_crc1 = x_tmp3;
    *xmm_crc2 = _mm_castps_si128(ps_res20);
    *xmm_crc3 = _mm_castps_si128(ps_res31);
}

local void fold_3(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc2;
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);

    *xmm_crc2 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);

    *xmm_crc1 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);

    *xmm_crc0 = x_tmp3;
    *xmm_crc1 = _mm_castps_si128(ps_res10);
    *xmm_crc2 = _mm_castps_si128(ps_res21);
    *xmm_crc3 = _mm_castps_si128(ps_res32);
}

local void fold_4(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
    __m128 ps_res0, ps_res1, ps_res2, ps_res3;

    x_tmp0 = *xmm_crc0;
    x_tmp1 = *xmm_crc1;
    x_tmp2 = *xmm_crc2;
    x_tmp3 = *xmm_crc3;

    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_t0 = _mm_castsi128_ps(x_tmp0);
    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);

    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_t1 = _mm_castsi128_ps(x_tmp1);
    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);

    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_t2 = _mm_castsi128_ps(x_tmp2);
    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);

    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_t3 = _mm_castsi128_ps(x_tmp3);
    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);

    *xmm_crc0 = _mm_castps_si128(ps_res0);
    *xmm_crc1 = _mm_castps_si128(ps_res1);
    *xmm_crc2 = _mm_castps_si128(ps_res2);
    *xmm_crc3 = _mm_castps_si128(ps_res3);
}

local const unsigned zalign(32) pshufb_shf_table[60] = {
    0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
    0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 2)/shr2 */
    0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 3)/shr3 */
    0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
    0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
    0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
    0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl  9 (16 - 7)/shr7 */
    0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl  8 (16 - 8)/shr8 */
    0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl  7 (16 - 9)/shr9 */
    0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl  6 (16 -10)/shr10*/
    0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl  5 (16 -11)/shr11*/
    0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl  4 (16 -12)/shr12*/
    0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
    0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
    0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
};
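
/*
 * Each 16-byte row of pshufb_shf_table is a PSHUFB control mask: row
 * (len - 1) shifts a register left by (16 - len) bytes, and XORing the row
 * with 0x80808080 (xmm_mask3 in partial_fold below) yields the complementary
 * shift-right-by-len mask, because PSHUFB zeroes any byte whose control value
 * has its top bit set.  partial_fold() uses these masks to advance the CRC
 * state by 1..15 bytes, folding the bytes shifted out of xmm_crc0 back into
 * xmm_crc3.
 */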
local void partial_fold(deflate_state *const s, const size_t len,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3,
        __m128i *xmm_crc_part)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);
    const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);

    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
    __m128i xmm_a0_0, xmm_a0_1;
    __m128 ps_crc3, psa0_0, psa0_1, ps_res;

    xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
    xmm_shr = xmm_shl;
    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);

    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);

    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);

    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);

    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);

    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);

    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
    psa0_1 = _mm_castsi128_ps(xmm_a0_1);

    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
    ps_res = _mm_xor_ps(ps_res, psa0_1);

    *xmm_crc3 = _mm_castps_si128(ps_res);
}
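
/*
 * crc_fold_copy() copies len bytes from src to dst while folding them into
 * the CRC state: a possible unaligned prefix is handled with partial_fold(),
 * the aligned bulk is processed 64 bytes per iteration with fold_4(), the
 * 48/32/16-byte tails use fold_3/fold_2/fold_1, and any remaining 1..15
 * bytes go through partial_fold() again.
 */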
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
        unsigned char *dst, const unsigned char *src, long len)
{
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;

    CRC_LOAD(s)

    if (len < 16) {
        if (len == 0)
            return;
        goto partial;
    }

    algn_diff = 0 - (unsigned long)src & 0xF;
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);

        dst += algn_diff;
        src += algn_diff;
        len -= algn_diff;

        partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
            &xmm_crc_part);
    }

    while ((len -= 64) >= 0) {
        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);

        fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);

        src += 64;
        dst += 64;
    }

    /*
     * len = num bytes left - 64
     */
    if (len + 16 >= 0) {
        len += 16;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);

        fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);

        if (len == 0)
            goto done;

        dst += 48;
        src += 48;
    } else if (len + 32 >= 0) {
        len += 32;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);

        fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);

        if (len == 0)
            goto done;

        dst += 32;
        src += 32;
    } else if (len + 48 >= 0) {
        len += 48;

        xmm_t0 = _mm_load_si128((__m128i *)src);

        fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);

        if (len == 0)
            goto done;

        dst += 16;
        src += 16;
    } else {
        len += 64;
        if (len == 0)
            goto done;
    }

partial:

#if defined(_MSC_VER)
    /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
    {
        int32_t parts[4] = {0, 0, 0, 0};
        memcpy(&parts, src, len);
        xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
    }
#else
    {
        int64_t parts[2] = {0, 0};
        memcpy(&parts, src, len);
        xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
    }
#endif

    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
    partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
        &xmm_crc_part);
done:
    CRC_SAVE(s)
}

local const unsigned zalign(16) crc_k[] = {
    0xccaa009e, 0x00000000, /* rk1 */
    0x751997d0, 0x00000001, /* rk2 */
    0xccaa009e, 0x00000000, /* rk5 */
    0x63cd6124, 0x00000001, /* rk6 */
    0xf7011640, 0x00000001, /* rk7 */
    0xdb710640, 0x00000001  /* rk8 */
};

local const unsigned zalign(16) crc_mask[4] = {
    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};

local const unsigned zalign(16) crc_mask2[4] = {
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
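
/*
 * crc_k holds the reduction constants (rk1/rk2 and rk5..rk8 in the white
 * paper's notation) used by crc_fold_512to32() to collapse the 512-bit folded
 * state into the final 32-bit CRC; crc_mask and crc_mask2 keep only the low
 * 64 bits and the upper 96 bits of the intermediate remainders, respectively.
 */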
unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
{
    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);

    unsigned crc;
    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

    CRC_LOAD(s)

    /*
     * k1
     */
    crc_fold = _mm_load_si128((__m128i *)crc_k);

    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);

    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);

    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);

    /*
     * k5
     */
    crc_fold = _mm_load_si128((__m128i *)crc_k + 1);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);

    /*
     * k7
     */
    xmm_crc1 = xmm_crc3;
    xmm_crc2 = xmm_crc3;
    crc_fold = _mm_load_si128((__m128i *)crc_k + 2);

    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);

    xmm_crc2 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

    crc = _mm_extract_epi32(xmm_crc3, 2);
    return ~crc;
    CRC_SAVE(s)  /* not reached; closes the block opened by CRC_LOAD */
}
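
/*
 * Usage sketch (an assumption about the caller, not part of this file): the
 * deflate code is expected to drive these helpers roughly as
 *
 *     crc_fold_init(s);                      once, when a gzip stream starts
 *     crc_fold_copy(s, dst, src, n);         for each chunk copied into the
 *                                            window, instead of memcpy+crc32
 *     strm->adler = crc_fold_512to32(s);     once, to obtain the final CRC
 */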