simd-checksum-x86_64.cpp

   1 /*
   2  * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
   3  *
   4  * Copyright (C) 1996 Andrew Tridgell
   5  * Copyright (C) 1996 Paul Mackerras
   6  * Copyright (C) 2004-2020 Wayne Davison
   7  * Copyright (C) 2020 Jorrit Jongma
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 3 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, visit the http://fsf.org website.
  21  */
  22 /*
  23  * Optimization target for get_checksum1() was the Intel Atom D2700, the
  24  * slowest CPU in the test set and the most likely to be CPU limited during
  25  * transfers. The combination of intrinsics was chosen specifically for the
  26  * most gain on that CPU, other combinations were occasionally slightly
  27  * faster on the others.
  28  *
  29  * While on more modern CPUs transfers are less likely to be CPU limited
  30  * (at least by this specific function), lower CPU usage is always better.
  31  * Improvements may still be seen when matching chunks from NVMe storage
  32  * even on newer CPUs.
  33  *
  34  * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
  35  * - Intel Atom D2700            550     750    1000     N/A
  36  * - Intel i7-7700hq            1850    2550    4050    6200
  37  * - AMD ThreadRipper 2950x     2900    5600    8950    8100
  38  *
  39  * Curiously the AMD is slower with AVX2 than SSSE3, while the Intel is
  40  * significantly faster. AVX2 is kept because it's more likely to relieve
  41  * the bottleneck on the slower CPU.
  42  *
  43  * This optimization for get_checksum1() is intentionally limited to x86-64
  44  * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
  45  * the available xmm registers, this optimized version may not be faster than
  46  * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
  47  *
  48  * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
  49  * use of the target attribute, selecting the fastest code path based on
  50  * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
  51  * GCC 4.x are not supported to ease configure.ac logic.
  52  */
  53
  54 #ifdef __x86_64__ /* { */
  55 #ifdef __cplusplus /* { */
  56
  57 #include "rsync.h"
  58
  59 #ifdef USE_ROLL_SIMD /* { */
  60
  61 #include <immintrin.h>
  62
  63 /* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
  64 #ifdef __clang__
  65 #define MVSTATIC
  66 #else
  67 #define MVSTATIC static
  68 #endif
  69
  70 // Missing from the headers on gcc 6 and older, clang 8 and older
  71 typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  72 typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
  73
  74 /* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
  75    These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
  76    as needed, but though this works perfectly with GCC, clang fails to inline those properly leading to a near 50%
  77    performance drop - combined with static and inline modifiers gets you linker errors and even compiler crashes...
  78 */
  79
  80 #define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
  81 #define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
  82 #define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
  83 #define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))
  84
  85 #define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
  86 #define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
  87
  88 #ifndef USE_ROLL_ASM
  89 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  90 #endif
  91 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  92 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  93
  94 /*
  95   Original loop per 4 bytes:
  96     s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
  97     s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;
  98
  99   SSE2/SSSE3 loop per 32 bytes:
 100     int16 t1[8];
 101     int16 t2[8];
 102     for (int j = 0; j < 8; j++) {
 103       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
 104       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
 105     }
 106     s2 += 32*s1 + (uint32)(
 107               28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
 108               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 109           ) + 528*CHAR_OFFSET;
 110     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
 111           32*CHAR_OFFSET;
 112  */
 113 __attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 114 {
 115     if (len > 32) {
 116         int aligned = ((uintptr_t)buf & 15) == 0;
 117
 118         uint32 x[4] = {0};
 119         x[0] = *ps1;
 120         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
 121         x[0] = *ps2;
 122         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
 123
 124         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
 125         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
 126
 127         for (; i < (len-32); i+=32) {
 128             // Load ... 2*[int8*16]
 129             __m128i in8_1, in8_2;
 130             if (!aligned) {
 131                 // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
 132                 in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
 133                 in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
 134             } else {
 135                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
 136                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
 137             }
 138
 139             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
 140             // Fastest, even though multiply by 1
 141             __m128i mul_one = _mm_set1_epi8(1);
 142             __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
 143             __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);
 144
 145             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
 146             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
 147             __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
 148             __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);
 149
 150             // s2 += 32*s1
 151             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
 152
 153             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 154             // Shifting left, then shifting right again and shuffling (rather than just
 155             // shifting right as with mul32 below) to cheaply end up with the correct sign
 156             // extension as we go from int16 to int32.
 157             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
 158             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
 159             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
 160             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
 161             sum_add32 = _mm_srai_epi32(sum_add32, 16);
 162             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
 163
 164             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 165             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
 166             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
 167             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
 168             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
 169             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
 170             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
 171
 172             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
 173             ss1 = _mm_add_epi32(ss1, sum_add32);
 174
 175             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 176             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
 177
 178             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
 179             // We could've combined this with generating sum_add32 above and
 180             // save an instruction but benchmarking shows that as being slower
 181             __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);
 182
 183             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
 184             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
 185
 186             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
 187             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
 188             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
 189
 190             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
 191             ss2 = _mm_add_epi32(ss2, mul32);
 192
 193 #if CHAR_OFFSET != 0
 194             // s1 += 32*CHAR_OFFSET
 195             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
 196             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
 197
 198             // s2 += 528*CHAR_OFFSET
 199             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
 200             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
 201 #endif
 202         }
 203
 204         _mm_store_si128((__m128i_u*)x, ss1);
 205         *ps1 = x[0];
 206         _mm_store_si128((__m128i_u*)x, ss2);
 207         *ps2 = x[0];
 208     }
 209     return i;
 210 }
 211
 212 /*
 213   Same as SSSE3 version, but using macros defined above to emulate SSSE3 calls that are not available with SSE2.
 214   For GCC-only the SSE2 and SSSE3 versions could be a single function calling other functions with the right
 215   target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly leading
 216   to a near 50% performance drop.
 217  */
 218 __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 219 {
 220     if (len > 32) {
 221         int aligned = ((uintptr_t)buf & 15) == 0;
 222
 223         uint32 x[4] = {0};
 224         x[0] = *ps1;
 225         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
 226         x[0] = *ps2;
 227         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
 228
 229         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
 230         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
 231
 232         for (; i < (len-32); i+=32) {
 233             // Load ... 2*[int8*16]
 234             __m128i in8_1, in8_2;
 235             if (!aligned) {
 236                 in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
 237                 in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
 238             } else {
 239                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
 240                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
 241             }
 242
 243             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
 244             // Fastest, even though multiply by 1
 245             __m128i mul_one = _mm_set1_epi8(1);
 246             __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
 247             __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);
 248
 249             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
 250             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
 251             __m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
 252             __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
 253
 254             // s2 += 32*s1
 255             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
 256
 257             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 258             // Shifting left, then shifting right again and shuffling (rather than just
 259             // shifting right as with mul32 below) to cheaply end up with the correct sign
 260             // extension as we go from int16 to int32.
 261             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
 262             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
 263             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
 264             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
 265             sum_add32 = _mm_srai_epi32(sum_add32, 16);
 266             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
 267
 268             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 269             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
 270             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
 271             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
 272             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
 273             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
 274             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
 275
 276             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
 277             ss1 = _mm_add_epi32(ss1, sum_add32);
 278
 279             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 280             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
 281
 282             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
 283             // We could've combined this with generating sum_add32 above and
 284             // save an instruction but benchmarking shows that as being slower
 285             __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);
 286
 287             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
 288             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
 289
 290             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
 291             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
 292             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
 293
 294             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
 295             ss2 = _mm_add_epi32(ss2, mul32);
 296
 297 #if CHAR_OFFSET != 0
 298             // s1 += 32*CHAR_OFFSET
 299             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
 300             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
 301
 302             // s2 += 528*CHAR_OFFSET
 303             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
 304             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
 305 #endif
 306         }
 307
 308         _mm_store_si128((__m128i_u*)x, ss1);
 309         *ps1 = x[0];
 310         _mm_store_si128((__m128i_u*)x, ss2);
 311         *ps2 = x[0];
 312     }
 313     return i;
 314 }
 315
 316 #ifdef USE_ROLL_ASM /* { */
 317
 318 extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2_asm(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2);
 319
 320 #else /* } { */
 321
 322 /*
 323   AVX2 loop per 64 bytes:
 324     int16 t1[16];
 325     int16 t2[16];
 326     for (int j = 0; j < 16; j++) {
 327       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
 328       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
 329     }
 330     s2 += 64*s1 + (uint32)(
 331               60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
 332               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
 333           ) + 2080*CHAR_OFFSET;
 334     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
 335           64*CHAR_OFFSET;
 336  */
 337
 338 __attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 339 {
 340     if (len > 64) {
 341
 342         uint32 x[4] = {0};
 343         __m128i ss1 = _mm_cvtsi32_si128(*ps1);
 344         __m128i ss2 = _mm_cvtsi32_si128(*ps2);
 345
 346         const char mul_t1_buf[16] = {60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0};
 347         __m128i tmp = _mm_load_si128((__m128i*) mul_t1_buf);
 348         __m256i mul_t1 = _mm256_cvtepu8_epi16(tmp);
 349         __m256i mul_const = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(4 | (3 << 8) | (2 << 16) | (1 << 24)));
 350         __m256i mul_one;
 351             mul_one = _mm256_abs_epi8(_mm256_cmpeq_epi16(mul_one,mul_one)); // set all vector elements to 1
 352
 353         for (; i < (len-64); i+=64) {
 354             // Load ... 4*[int8*16]
 355             __m256i in8_1, in8_2;
 356             __m128i in8_1_low, in8_2_low, in8_1_high, in8_2_high;
 357             in8_1_low = _mm_loadu_si128((__m128i_u*)&buf[i]);
 358             in8_2_low = _mm_loadu_si128((__m128i_u*)&buf[i+16]);
 359             in8_1_high = _mm_loadu_si128((__m128i_u*)&buf[i+32]);
 360             in8_2_high = _mm_loadu_si128((__m128i_u*)&buf[i+48]);
 361             in8_1 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_1_low), in8_1_high,1);
 362             in8_2 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_2_low), in8_2_high,1);
 363
 364             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
 365             // Fastest, even though multiply by 1
 366             __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
 367             __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);
 368
 369             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
 370             __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
 371             __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);
 372
 373             // s2 += 64*s1
 374             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 6));
 375
 376             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 377             __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
 378             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_epi32(sum_add32, 16));
 379             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 4));
 380             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 8));
 381
 382             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 383             __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
 384             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_epi32(sum_mul_add32, 16));
 385             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 4));
 386             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 8));
 387
 388             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
 389             __m128i sum_add32_hi = _mm256_extracti128_si256(sum_add32, 0x1);
 390             ss1 = _mm_add_epi32(ss1, _mm256_castsi256_si128(sum_add32));
 391             ss1 = _mm_add_epi32(ss1, sum_add32_hi);
 392
 393             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 394             __m128i sum_mul_add32_hi = _mm256_extracti128_si256(sum_mul_add32, 0x1);
 395             ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(sum_mul_add32));
 396             ss2 = _mm_add_epi32(ss2, sum_mul_add32_hi);
 397
 398             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
 399             // We could've combined this with generating sum_add32 above and
 400             // save an instruction but benchmarking shows that as being slower
 401             __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);
 402
 403             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
 404             __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);
 405
 406             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
 407             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
 408             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
 409             // prefetch 2 cacheline ahead
 410             _mm_prefetch(&buf[i + 160], _MM_HINT_T0);
 411
 412             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
 413             __m128i mul32_hi = _mm256_extracti128_si256(mul32, 0x1);
 414             ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(mul32));
 415             ss2 = _mm_add_epi32(ss2, mul32_hi);
 416
 417 #if CHAR_OFFSET != 0
 418             // s1 += 32*CHAR_OFFSET
 419             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
 420             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
 421
 422             // s2 += 528*CHAR_OFFSET
 423             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
 424             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
 425 #endif
 426         }
 427
 428         _mm_store_si128((__m128i_u*)x, ss1);
 429         *ps1 = x[0];
 430         _mm_store_si128((__m128i_u*)x, ss2);
 431         *ps2 = x[0];
 432     }
 433     return i;
 434 }
 435
 436 #endif /* } !USE_ROLL_ASM */
 437
 438 static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 439 {
 440     uint32 s1 = *ps1;
 441     uint32 s2 = *ps2;
 442     for (; i < (len-4); i+=4) {
 443         s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
 444         s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
 445     }
 446     for (; i < len; i++) {
 447         s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
 448     }
 449     *ps1 = s1;
 450     *ps2 = s2;
 451     return i;
 452 }
 453
 454 /* With GCC 10 putting this implementation inside 'extern "C"' causes an
 455    assembler error. That worked fine on GCC 5-9 and clang 6-10...
 456   */
 457 static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
 458 {
 459     int32 i = 0;
 460     uint32 s1 = 0;
 461     uint32 s2 = 0;
 462
 463     // multiples of 64 bytes using AVX2 (if available)
 464 #ifdef USE_ROLL_ASM
 465     i = get_checksum1_avx2_asm((schar*)buf1, len, i, &s1, &s2);
 466 #else
 467     i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
 468 #endif
 469
 470     // multiples of 32 bytes using SSSE3 (if available)
 471     i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
 472
 473     // multiples of 32 bytes using SSE2 (if available)
 474     i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);
 475
 476     // whatever is left
 477     i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);
 478
 479     return (s1 & 0xffff) + (s2 << 16);
 480 }
 481
 482 extern "C" {
 483
 484 uint32 get_checksum1(char *buf1, int32 len)
 485 {
 486     return get_checksum1_cpp(buf1, len);
 487 }
 488
 489 } // extern "C"
 490
 491 #ifdef BENCHMARK_SIMD_CHECKSUM1
 492 #pragma clang optimize off
 493 #pragma GCC push_options
 494 #pragma GCC optimize ("O0")
 495
 496 #define ROUNDS 1024
 497 #define BLOCK_LEN 1024*1024
 498
 499 #ifndef CLOCK_MONOTONIC_RAW
 500 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
 501 #endif
 502
 503 static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
 504     struct timespec start, end;
 505     uint64_t us;
 506     uint32_t cs, s1, s2;
 507     int i, next;
 508
 509     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 510     for (i = 0; i < ROUNDS; i++) {
 511         s1 = s2 = 0;
 512         next = func((schar*)buf, len, 0, &s1, &s2);
 513         get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
 514     }
 515     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
 516     us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
 517     cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
 518     printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
 519 }
 520
 521 static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
 522     uint32 cs = get_checksum1((char*)buf, len);
 523     *ps1 = cs & 0xffff;
 524     *ps2 = cs >> 16;
 525     return len;
 526 }
 527
 528 int main() {
 529     int i;
 530     unsigned char* buf = (unsigned char*)aligned_alloc(64,BLOCK_LEN);
 531     for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
 532
 533     benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
 534     benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
 535     benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
 536     benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
 537 #ifdef USE_ROLL_ASM
 538     benchmark("AVX2-ASM", get_checksum1_avx2_asm, (schar*)buf, BLOCK_LEN);
 539 #else
 540     benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
 541 #endif
 542
 543     free(buf);
 544     return 0;
 545 }
 546
 547 #pragma GCC pop_options
 548 #pragma clang optimize on
 549 #endif /* BENCHMARK_SIMD_CHECKSUM1 */
 550
 551 #endif /* } USE_ROLL_SIMD */
 552 #endif /* } __cplusplus */
 553 #endif /* } __x86_64__ */