[rsync.git] / simd-checksum-x86_64.cpp
blob 33f26e9205c5c1ac41417e59209dbf25eb3748ce
/*
 * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
 *
 * Copyright (C) 1996 Andrew Tridgell
 * Copyright (C) 1996 Paul Mackerras
 * Copyright (C) 2004-2020 Wayne Davison
 * Copyright (C) 2020 Jorrit Jongma
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, visit the http://fsf.org website.
 */
/*
 * Optimization target for get_checksum1() was the Intel Atom D2700, the
 * slowest CPU in the test set and the most likely to be CPU limited during
 * transfers. The combination of intrinsics was chosen specifically for the
 * most gain on that CPU, other combinations were occasionally slightly
 * faster on the others.
 *
 * While on more modern CPUs transfers are less likely to be CPU limited
 * (at least by this specific function), lower CPU usage is always better.
 * Improvements may still be seen when matching chunks from NVMe storage
 * even on newer CPUs.
 *
 * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
 * - Intel Atom D2700            550     750    1000     N/A
 * - Intel i7-7700hq            1850    2550    4050    6200
 * - AMD ThreadRipper 2950x     2900    5600    8950    8100
 *
 * Curiously the AMD is slower with AVX2 than SSSE3, while the Intel is
 * significantly faster. AVX2 is kept because it's more likely to relieve
 * the bottleneck on the slower CPU.
 *
 * This optimization for get_checksum1() is intentionally limited to x86-64
 * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
 * the available xmm registers, this optimized version may not be faster than
 * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
 *
 * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
 * use of the target attribute, selecting the fastest code path based on
 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
 * GCC 4.x are not supported to ease configure.ac logic.
 */
#ifdef __x86_64__ /* { */
#ifdef __cplusplus /* { */

#include "rsync.h"

#ifdef USE_ROLL_SIMD /* { */

#include <immintrin.h>

/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
#ifdef __clang__
#define MVSTATIC
#else
#define MVSTATIC static
#endif

// Missing from the headers on gcc 6 and older, clang 8 and older
typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
/* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
   These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
   as needed, but though this works perfectly with GCC, clang fails to inline those properly leading to a near 50%
   performance drop - combined with static and inline modifiers gets you linker errors and even compiler crashes...
*/

#define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
#define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
#define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
#define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))

#define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
#define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
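/* How the emulation works: SSSE3's _mm_maddubs_epi16(a, b) multiplies each
   unsigned byte of `a` by the corresponding signed byte of `b` and adds
   adjacent products into saturated int16 lanes. The SSE2 macros rebuild that
   in two halves: SSE2_MULU_EVEN_EPI8 zero-extends the even bytes of `a`
   (AND 0xFF) and sign-extends the even bytes of `b` (byte shift left, then
   arithmetic shift right by 8) before the 16-bit multiply, SSE2_MULU_ODD_EPI8
   does the same for the odd bytes, and SSE2_MADDUBS_EPI16 combines the two
   with a saturating add. SSE2_HADDS_EPI16 likewise rebuilds _mm_hadds_epi16
   by interleaving and then adding the even/odd int16 lanes of its inputs. */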
#ifndef USE_ROLL_ASM
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
#endif
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
/*
  Original loop per 4 bytes:
    s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
    s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;

  SSE2/SSSE3 loop per 32 bytes:
    int16 t1[8];
    int16 t2[8];
    for (int j = 0; j < 8; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 32*s1 + (uint32)(
              28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
          ) + 528*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
          32*CHAR_OFFSET;
*/
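/* Where the offset constants come from: for each byte processed, CHAR_OFFSET
   is added once to s1, and s2 then accumulates the running s1, so over a
   32-byte block the offsets contribute 32*CHAR_OFFSET to s1 and
   (32 + 31 + ... + 1) = 32*33/2 = 528 occurrences of CHAR_OFFSET to s2.
   The per-4-byte form above is the same sum: 4 + 3 + 2 + 1 = 10. */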
__attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    if (len > 32) {
        int aligned = ((uintptr_t)buf & 15) == 0;

        uint32 x[4] = {0};
        x[0] = *ps1;
        __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
        x[0] = *ps2;
        __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

        const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
        __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

        for (; i < (len-32); i+=32) {
            // Load ... 2*[int8*16]
            __m128i in8_1, in8_2;
            if (!aligned) {
                // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
                in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
            } else {
                in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
            }

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
            // Fastest, even though multiply by 1
            __m128i mul_one = _mm_set1_epi8(1);
            __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
            __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
            __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
            __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
            __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);

            // s2 += 32*s1
            ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

            // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            // Shifting left, then shifting right again and shuffling (rather than just
            // shifting right as with mul32 below) to cheaply end up with the correct sign
            // extension as we go from int16 to int32.
            __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
            sum_add32 = _mm_srai_epi32(sum_add32, 16);
            sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

            // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
            sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
            sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

            // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
            ss1 = _mm_add_epi32(ss1, sum_add32);

            // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
            ss2 = _mm_add_epi32(ss2, sum_mul_add32);

            // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
            // We could've combined this with generating sum_add32 above and
            // save an instruction but benchmarking shows that as being slower
            __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
            __m128i mul32 = _mm_madd_epi16(add16, mul_t1);

            // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

            // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
            ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
            // s1 += 32*CHAR_OFFSET
            __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
            ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

            // s2 += 528*CHAR_OFFSET
            char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
            ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        _mm_store_si128((__m128i_u*)x, ss1);
        *ps1 = x[0];
        _mm_store_si128((__m128i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}
/*
  Same as SSSE3 version, but using macros defined above to emulate SSSE3 calls that are not available with SSE2.
  For GCC-only the SSE2 and SSSE3 versions could be a single function calling other functions with the right
  target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly leading
  to a near 50% performance drop.
*/
__attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    if (len > 32) {
        int aligned = ((uintptr_t)buf & 15) == 0;

        uint32 x[4] = {0};
        x[0] = *ps1;
        __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
        x[0] = *ps2;
        __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

        const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
        __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

        for (; i < (len-32); i+=32) {
            // Load ... 2*[int8*16]
            __m128i in8_1, in8_2;
            if (!aligned) {
                in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
            } else {
                in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
            }

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
            // Fastest, even though multiply by 1
            __m128i mul_one = _mm_set1_epi8(1);
            __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
            __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
            __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
            __m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
            __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);

            // s2 += 32*s1
            ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

            // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            // Shifting left, then shifting right again and shuffling (rather than just
            // shifting right as with mul32 below) to cheaply end up with the correct sign
            // extension as we go from int16 to int32.
            __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
            sum_add32 = _mm_srai_epi32(sum_add32, 16);
            sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

            // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
            sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
            sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

            // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
            ss1 = _mm_add_epi32(ss1, sum_add32);

            // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
            ss2 = _mm_add_epi32(ss2, sum_mul_add32);

            // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
            // We could've combined this with generating sum_add32 above and
            // save an instruction but benchmarking shows that as being slower
            __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
            __m128i mul32 = _mm_madd_epi16(add16, mul_t1);

            // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

            // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
            ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
            // s1 += 32*CHAR_OFFSET
            __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
            ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

            // s2 += 528*CHAR_OFFSET
            char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
            ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        _mm_store_si128((__m128i_u*)x, ss1);
        *ps1 = x[0];
        _mm_store_si128((__m128i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}
#ifdef USE_ROLL_ASM /* { */

extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2_asm(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2);

#else /* } { */
/*
  AVX2 loop per 64 bytes:
    int16 t1[16];
    int16 t2[16];
    for (int j = 0; j < 16; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 64*s1 + (uint32)(
              60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
          ) + 2080*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
          64*CHAR_OFFSET;
*/
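/* As with the 32-byte loop, the offset constants follow from the block size:
   over a 64-byte block the offsets contribute 64*CHAR_OFFSET to s1 and
   (64 + 63 + ... + 1) = 64*65/2 = 2080 occurrences of CHAR_OFFSET to s2;
   the offsets accumulated in earlier blocks are carried in through the
   64*s1 term. */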
__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    if (len > 64) {

        uint32 x[4] = {0};
        __m128i ss1 = _mm_cvtsi32_si128(*ps1);
        __m128i ss2 = _mm_cvtsi32_si128(*ps2);

        const char mul_t1_buf[16] = {60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0};
        __m128i tmp = _mm_load_si128((__m128i*) mul_t1_buf);
        __m256i mul_t1 = _mm256_cvtepu8_epi16(tmp);
        __m256i mul_const = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(4 | (3 << 8) | (2 << 16) | (1 << 24)));
        __m256i mul_one;
        mul_one = _mm256_abs_epi8(_mm256_cmpeq_epi16(mul_one,mul_one)); // set all vector elements to 1

        for (; i < (len-64); i+=64) {
            // Load ... 4*[int8*16]
            __m256i in8_1, in8_2;
            __m128i in8_1_low, in8_2_low, in8_1_high, in8_2_high;
            in8_1_low = _mm_loadu_si128((__m128i_u*)&buf[i]);
            in8_2_low = _mm_loadu_si128((__m128i_u*)&buf[i+16]);
            in8_1_high = _mm_loadu_si128((__m128i_u*)&buf[i+32]);
            in8_2_high = _mm_loadu_si128((__m128i_u*)&buf[i+48]);
            in8_1 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_1_low), in8_1_high, 1);
            in8_2 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_2_low), in8_2_high, 1);

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
            // Fastest, even though multiply by 1
            __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
            __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
            __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
            __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);

            // s2 += 64*s1
            ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 6));

            // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_epi32(sum_add32, 16));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 4));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 8));

            // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_epi32(sum_mul_add32, 16));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 4));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 8));

            // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
            __m128i sum_add32_hi = _mm256_extracti128_si256(sum_add32, 0x1);
            ss1 = _mm_add_epi32(ss1, _mm256_castsi256_si128(sum_add32));
            ss1 = _mm_add_epi32(ss1, sum_add32_hi);

            // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
            __m128i sum_mul_add32_hi = _mm256_extracti128_si256(sum_mul_add32, 0x1);
            ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(sum_mul_add32));
            ss2 = _mm_add_epi32(ss2, sum_mul_add32_hi);

            // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
            // We could've combined this with generating sum_add32 above and
            // save an instruction but benchmarking shows that as being slower
            __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
            __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);

            // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
            mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
            mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
            // prefetch 2 cacheline ahead
            _mm_prefetch(&buf[i + 160], _MM_HINT_T0);

            // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
            __m128i mul32_hi = _mm256_extracti128_si256(mul32, 0x1);
            ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(mul32));
            ss2 = _mm_add_epi32(ss2, mul32_hi);
#if CHAR_OFFSET != 0
            // s1 += 64*CHAR_OFFSET
            __m128i char_offset_multiplier = _mm_set1_epi32(64 * CHAR_OFFSET);
            ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

            // s2 += 2080*CHAR_OFFSET
            char_offset_multiplier = _mm_set1_epi32(2080 * CHAR_OFFSET);
            ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        _mm_store_si128((__m128i_u*)x, ss1);
        *ps1 = x[0];
        _mm_store_si128((__m128i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}
#endif /* } !USE_ROLL_ASM */

static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    uint32 s1 = *ps1;
    uint32 s2 = *ps2;
    for (; i < (len-4); i+=4) {
        s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
        s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
    }
    for (; i < len; i++) {
        s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
    }
    *ps1 = s1;
    *ps2 = s2;
    return i;
}
/* With GCC 10 putting this implementation inside 'extern "C"' causes an
   assembler error. That worked fine on GCC 5-9 and clang 6-10...
 */
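/* Dispatch note: each helper below consumes as many whole blocks of its width
   as it can (64 bytes for AVX2, 32 for SSSE3/SSE2) and returns the index where
   it stopped; the next call picks up from that index. On CPUs that lack a
   given feature, the "default"-target stub is selected instead and simply
   returns i unchanged, so the work falls through to the next, narrower
   implementation and finally to the plain C tail loop. */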
static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
{
    int32 i = 0;
    uint32 s1 = 0;
    uint32 s2 = 0;

    // multiples of 64 bytes using AVX2 (if available)
#ifdef USE_ROLL_ASM
    i = get_checksum1_avx2_asm((schar*)buf1, len, i, &s1, &s2);
#else
    i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
#endif

    // multiples of 32 bytes using SSSE3 (if available)
    i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);

    // multiples of 32 bytes using SSE2 (if available)
    i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);

    // whatever is left
    i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);

    return (s1 & 0xffff) + (s2 << 16);
}
extern "C" {

uint32 get_checksum1(char *buf1, int32 len)
{
    return get_checksum1_cpp(buf1, len);
}

} // extern "C"
#ifdef BENCHMARK_SIMD_CHECKSUM1
#pragma clang optimize off
#pragma GCC push_options
#pragma GCC optimize ("O0")

#define ROUNDS 1024
#define BLOCK_LEN 1024*1024

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
#endif
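/* The benchmark harness below times each kernel over ROUNDS passes of a
   BLOCK_LEN buffer: func() handles the bulk of each pass, the scalar
   get_checksum1_default_1() finishes the tail so every variant computes the
   same checksum, and the elapsed CLOCK_MONOTONIC_RAW time is converted to
   MB/s. Variants whose stub returns 0 (feature not available on the running
   CPU) are reported as 0. */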
static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
    struct timespec start, end;
    uint64_t us;
    uint32_t cs, s1, s2;
    int i, next;

    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    for (i = 0; i < ROUNDS; i++) {
        s1 = s2 = 0;
        next = func((schar*)buf, len, 0, &s1, &s2);
        get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
    }
    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
    us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
    cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
    printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
}
static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
    uint32 cs = get_checksum1((char*)buf, len);
    *ps1 = cs & 0xffff;
    *ps2 = cs >> 16;
    return len;
}
int main() {
    int i;
    unsigned char* buf = (unsigned char*)aligned_alloc(64, BLOCK_LEN);
    for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;

    benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
    benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
    benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
    benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
#ifdef USE_ROLL_ASM
    benchmark("AVX2-ASM", get_checksum1_avx2_asm, (schar*)buf, BLOCK_LEN);
#else
    benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
#endif

    free(buf);
    return 0;
}
#pragma GCC pop_options
#pragma clang optimize on
#endif /* BENCHMARK_SIMD_CHECKSUM1 */

#endif /* } USE_ROLL_SIMD */
#endif /* } __cplusplus */
#endif /* } __x86_64__ */