/*
 * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
 *
 * Copyright (C) 1996 Andrew Tridgell
 * Copyright (C) 1996 Paul Mackerras
 * Copyright (C) 2004-2020 Wayne Davison
 * Copyright (C) 2020 Jorrit Jongma
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, visit the http://fsf.org website.
 */
/*
 * Optimization target for get_checksum1() was the Intel Atom D2700, the
 * slowest CPU in the test set and the most likely to be CPU limited during
 * transfers. The combination of intrinsics was chosen specifically for the
 * most gain on that CPU; other combinations were occasionally slightly
 * faster on the others.
 *
 * While on more modern CPUs transfers are less likely to be CPU limited
 * (at least by this specific function), lower CPU usage is always better.
 * Improvements may still be seen when matching chunks from NVMe storage
 * even on newer CPUs.
 *
 * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
 * - Intel Atom D2700            550     750    1000     N/A
 * - Intel i7-7700hq            1850    2550    4050    6200
 * - AMD ThreadRipper 2950x     2900    5600    8950    8100
 *
 * Curiously the AMD is slower with AVX2 than with SSSE3, while the Intel is
 * significantly faster. AVX2 is kept because it's more likely to relieve
 * the bottleneck on the slower CPU.
 *
 * This optimization for get_checksum1() is intentionally limited to x86-64,
 * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
 * the available xmm registers, this optimized version may not be faster than
 * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
 *
 * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
 * use of the target attribute, selecting the fastest code path based on
 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
 *
 * GCC 4.x is not supported, to keep the configure.ac logic simple.
 */

#ifdef __x86_64__
#ifdef __cplusplus

#include "rsync.h"

#ifdef HAVE_SIMD

#include <immintrin.h>
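
/* Illustrative sketch (not part of the real build, kept under "#if 0"): a
 * minimal example of the GCC/clang function multi-versioning mechanism the
 * header comment above refers to. Two definitions share one name but carry
 * different target attributes; the compiler emits both bodies plus a resolver
 * that picks the best supported one (by dispatch priority on GCC 5, by runtime
 * CPU detection on GCC 6+/clang). The function name here is hypothetical.
 */
#if 0
__attribute__ ((target("default"))) int fmv_example(void) { return 0; } // fallback for any x86-64 CPU
__attribute__ ((target("ssse3")))   int fmv_example(void) { return 1; } // used when the CPU supports SSSE3
// A plain call such as fmv_example() transparently dispatches to the best version.
#endif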

/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
#ifdef __clang__
#define MVSTATIC
#else
#define MVSTATIC static
#endif

// Missing from the headers on gcc 6 and older, clang 8 and older
typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

/* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
   These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
   as needed. While that works perfectly with GCC, clang fails to inline those properly, leading to a near 50%
   performance drop; combining static and inline modifiers gets you linker errors and even compiler crashes.
*/
#define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
#define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
#define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
#define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))

#define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
#define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
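
/* Illustrative sketch (not compiled, and not part of rsync): a scalar model of
 * what SSE2_MADDUBS_EPI16 emulates, namely SSSE3's _mm_maddubs_epi16 -- each
 * pair of adjacent bytes is multiplied (unsigned bytes from 'a', signed bytes
 * from 'b') and the two products are added into one saturated int16 lane. The
 * helper name is hypothetical.
 */
#if 0
static short maddubs_model_lane(unsigned char a0, signed char b0, unsigned char a1, signed char b1)
{
	int sum = (int)a0 * b0 + (int)a1 * b1;  // widen before adding
	if (sum > 32767) sum = 32767;           // saturate to the int16 range
	if (sum < -32768) sum = -32768;
	return (short)sum;
}
#endif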

__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }

/*
  Original loop per 4 bytes:
    s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
    s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;

  SSE2/SSSE3 loop per 32 bytes:
    int16 t1[8];
    int16 t2[8];
    for (int j = 0; j < 8; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 32*s1 + (uint32)(
              28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
          ) + 528*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) + 32*CHAR_OFFSET;
 */
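
/* Illustrative sketch (not compiled): a scalar check that the per-32-byte
 * formulation in the comment above matches eight rounds of the original
 * per-4-byte loop. The helper name and layout are hypothetical; it only
 * exists to make the equivalence easy to verify by hand or in a unit test.
 */
#if 0
static int check_block32_equivalence(const schar* buf, uint32 s1, uint32 s2)
{
	uint32 a1 = s1, a2 = s2, b1 = s1, b2 = s2;
	int16 t1[8], t2[8];

	// Reference: the original per-4-byte rolling update, applied eight times.
	for (int i = 0; i < 32; i += 4) {
		a2 += 4*(a1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
		a1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;
	}

	// One-shot per-32-byte update, exactly as written in the comment above.
	for (int j = 0; j < 8; j++) {
		t1[j] = buf[j*4] + buf[j*4 + 1] + buf[j*4 + 2] + buf[j*4 + 3];
		t2[j] = 4*buf[j*4] + 3*buf[j*4 + 1] + 2*buf[j*4 + 2] + buf[j*4 + 3];
	}
	b2 += 32*b1 + (uint32)(
		28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
		t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]) + 528*CHAR_OFFSET;
	b1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) + 32*CHAR_OFFSET;

	return a1 == b1 && a2 == b2;  // both paths must agree
}
#endif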
__attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 32) {
		int aligned = ((uintptr_t)buf & 15) == 0;

		uint32 x[4] = {0};
		x[0] = *ps1;
		__m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
		x[0] = *ps2;
		__m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

		const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
		__m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

		for (; i < (len-32); i+=32) {
			// Load ... 2*[int8*16]
			__m128i in8_1, in8_2;
			if (!aligned) {
				// Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
				in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
			} else {
				in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
			}

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
			// Fastest option, even though it multiplies by 1
			__m128i mul_one = _mm_set1_epi8(1);
			__m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
			__m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
			__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
			__m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);

			// s2 += 32*s1
			ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

			// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			// Shifting left, then shifting right again and shuffling (rather than just
			// shifting right as with mul32 below) to cheaply end up with the correct sign
			// extension as we go from int16 to int32.
			__m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
			sum_add32 = _mm_srai_epi32(sum_add32, 16);
			sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

			// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			__m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
			sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

			// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
			ss1 = _mm_add_epi32(ss1, sum_add32);

			// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
			ss2 = _mm_add_epi32(ss2, sum_mul_add32);

			// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
			// We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that as being slower
			__m128i add16 = _mm_hadds_epi16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
			__m128i mul32 = _mm_madd_epi16(add16, mul_t1);

			// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

			// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
			ss2 = _mm_add_epi32(ss2, mul32);

			// s1 += 32*CHAR_OFFSET
			__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
			ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

			// s2 += 528*CHAR_OFFSET
			char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
			ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
		}

		_mm_store_si128((__m128i_u*)x, ss1);
		*ps1 = x[0];
		_mm_store_si128((__m128i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}
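
/* Illustrative sketch (not compiled): a scalar model of the horizontal-sum
 * trick used for sum_add32/sum_mul_add32 in get_checksum1_ssse3_32() above.
 * Left-shifting the whole register by 2, 4 and 8 bytes and adding builds
 * prefix sums, so the top int16 lane ends up holding the total of all eight
 * lanes; that lane is the upper half of 32-bit element 3, so the arithmetic
 * right shift by 16 sign-extends it to int32 and the final shuffle copies
 * element 3 down to element 0. The helper name is hypothetical.
 */
#if 0
static int32 hsum8_epi16_model(const int16 v[8])
{
	int16 lanes[8];
	for (int j = 0; j < 8; j++)
		lanes[j] = v[j];
	for (int shift = 1; shift <= 4; shift *= 2)         // lane shifts 1, 2, 4 == byte shifts 2, 4, 8
		for (int j = 7; j >= shift; j--)                // walk down so every read sees pre-shift values
			lanes[j] = (int16)(lanes[j] + lanes[j - shift]);
	return (int32)lanes[7];                             // what srai(16) + shuffle(3) extract, sign-extended
}
#endif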

/*
  Same as the SSSE3 version, but using the macros defined above to emulate SSSE3 calls that are not available with SSE2.
  For GCC only, the SSE2 and SSSE3 versions could be a single function calling other functions with the right
  target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly, leading
  to a near 50% performance drop.
 */
__attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 32) {
		int aligned = ((uintptr_t)buf & 15) == 0;

		uint32 x[4] = {0};
		x[0] = *ps1;
		__m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
		x[0] = *ps2;
		__m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

		const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
		__m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

		for (; i < (len-32); i+=32) {
			// Load ... 2*[int8*16]
			__m128i in8_1, in8_2;
			if (!aligned) {
				in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
			} else {
				in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
			}

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
			// Fastest option, even though it multiplies by 1
			__m128i mul_one = _mm_set1_epi8(1);
			__m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
			__m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
			__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
			__m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);

			// s2 += 32*s1
			ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

			// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			// Shifting left, then shifting right again and shuffling (rather than just
			// shifting right as with mul32 below) to cheaply end up with the correct sign
			// extension as we go from int16 to int32.
			__m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
			sum_add32 = _mm_srai_epi32(sum_add32, 16);
			sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

			// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			__m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
			sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

			// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
			ss1 = _mm_add_epi32(ss1, sum_add32);

			// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
			ss2 = _mm_add_epi32(ss2, sum_mul_add32);

			// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
			// We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that as being slower
			__m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
			__m128i mul32 = _mm_madd_epi16(add16, mul_t1);

			// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

			// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
			ss2 = _mm_add_epi32(ss2, mul32);

			// s1 += 32*CHAR_OFFSET
			__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
			ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

			// s2 += 528*CHAR_OFFSET
			char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
			ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
		}

		_mm_store_si128((__m128i_u*)x, ss1);
		*ps1 = x[0];
		_mm_store_si128((__m128i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}

/*
  AVX2 loop per 64 bytes:
    int16 t1[16];
    int16 t2[16];
    for (int j = 0; j < 16; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 64*s1 + (uint32)(
              60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
          ) + 2080*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) + 64*CHAR_OFFSET;
 */
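
/* Illustrative sketch (not compiled): why the mul_t1_buf constant in
 * get_checksum1_avx2_64() below looks "shuffled". _mm256_hadds_epi16() pairs
 * values within each 128-bit lane, so the sixteen t1 values land in the order
 * t1[0..3], t1[8..11], t1[4..7], t1[12..15]. Since t1[k] needs weight 60 - 4*k,
 * printing the weights in that lane order reproduces the constant used below.
 * The helper name is hypothetical.
 */
#if 0
static void print_avx2_t1_weights(void)
{
	static const int lane_order[16] = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15};
	for (int n = 0; n < 16; n++)
		printf("%d ", 60 - 4 * lane_order[n]);  // 60 56 52 48 28 24 20 16 44 40 36 32 12 8 4 0
	printf("\n");
}
#endif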
__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 64) {
		// Instructions reshuffled compared to SSE2 for slightly better performance
		int aligned = ((uintptr_t)buf & 31) == 0;

		uint32 x[8] = {0};
		x[0] = *ps1;
		__m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
		x[0] = *ps2;
		__m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);

		// The order gets shuffled compared to SSE2
		const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
		__m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);

		for (; i < (len-64); i+=64) {
			// Load ... 2*[int8*32]
			__m256i in8_1, in8_2;
			if (!aligned) {
				in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
				in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
			} else {
				in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
				in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
			}

			// Prefetch for the next loops. This has no observable effect on the
			// tested AMD but makes as much as a 20% difference on the Intel.
			// Curiously that same Intel sees no benefit from this with SSE2
			// or SSSE3.
			_mm_prefetch(&buf[i + 64], _MM_HINT_T0);
			_mm_prefetch(&buf[i + 96], _MM_HINT_T0);
			_mm_prefetch(&buf[i + 128], _MM_HINT_T0);
			_mm_prefetch(&buf[i + 160], _MM_HINT_T0);

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
			// Fastest option, even though it multiplies by 1
			__m256i mul_one = _mm256_set1_epi8(1);
			__m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
			__m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*16]
			__m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
			__m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);

			// s2 += 64*s1
			ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));

			// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*16]
			__m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
			__m256i mul32 = _mm256_madd_epi16(add16, mul_t1);

			// [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
			__m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
			sum_add32 = _mm256_srai_epi32(sum_add32, 16);
			sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);

			// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]
			ss1 = _mm256_add_epi32(ss1, sum_add32);

			// [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
			__m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
			sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);

			// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
			ss2 = _mm256_add_epi32(ss2, sum_mul_add32);

			// [sum(mul32), X, X, X, X, X, X, X] [int32*8]
			mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
			mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
			mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));

			// s2 += 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14]
			ss2 = _mm256_add_epi32(ss2, mul32);

			// s1 += 64*CHAR_OFFSET
			__m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
			ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);

			// s2 += 2080*CHAR_OFFSET
			char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
			ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
		}

		_mm256_store_si256((__m256i_u*)x, ss1);
		*ps1 = x[0];
		_mm256_store_si256((__m256i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}

static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	uint32 s1 = *ps1;
	uint32 s2 = *ps2;
	for (; i < (len-4); i+=4) {
		s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
		s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
	}
	for (; i < len; i++) {
		s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
	}
	*ps1 = s1;
	*ps2 = s2;
	return i;
}

/* With GCC 10, putting this implementation inside 'extern "C"' causes an
   assembler error. That worked fine on GCC 5-9 and clang 6-10...
 */
static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
{
	int32 i = 0;
	uint32 s1 = 0;
	uint32 s2 = 0;

	// multiples of 64 bytes using AVX2 (if available)
	i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);

	// multiples of 32 bytes using SSSE3 (if available)
	i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);

	// multiples of 32 bytes using SSE2 (if available)
	i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);

	// whatever is left
	i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);

	return (s1 & 0xffff) + (s2 << 16);
}

extern "C" {

uint32 get_checksum1(char *buf1, int32 len)
{
	return get_checksum1_cpp(buf1, len);
}

} // extern "C"

#ifdef BENCHMARK_SIMD_CHECKSUM1
#pragma clang optimize off
#pragma GCC push_options
#pragma GCC optimize ("O0")

#define ROUNDS 1024 /* iterations per benchmark run */
#define BLOCK_LEN 1024*1024

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
#endif

static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
	struct timespec start, end;
	unsigned long long us;
	uint32 cs, s1, s2;
	int i, next;

	clock_gettime(CLOCK_MONOTONIC_RAW, &start);
	for (i = 0; i < ROUNDS; i++) {
		s1 = s2 = 0;
		next = func((schar*)buf, len, 0, &s1, &s2);
		get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
	}
	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
	cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
	printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
}

static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
	uint32 cs = get_checksum1((char*)buf, len);
	*ps1 = cs & 0xffff;
	*ps2 = cs >> 16;
	return len;
}

int main() {
	int i;
	unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
	for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;

	benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
	benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
	benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
	benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
	benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);

	free(buf);
	return 0;
}

#pragma GCC pop_options
#pragma clang optimize on
#endif /* BENCHMARK_SIMD_CHECKSUM1 */

#endif /* HAVE_SIMD */
#endif /* __cplusplus */
#endif /* __x86_64__ */