/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* The macros in this file are intended to be used for writing
 * architecture-independent SIMD intrinsics code.
 * To support a new architecture, adding macros here should be (nearly)
 * all that is needed.
 */
#ifdef _gmx_simd_macros_h_
#error "gmx_simd_macros.h included twice"
#else
#define _gmx_simd_macros_h_

/* NOTE: SSE2 acceleration does not include floor or blendv */
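
/* A minimal usage sketch (not compiled): with the macros defined below,
 * the same loop body runs on any supported architecture. Assumptions for
 * illustration only: n is a multiple of GMX_SIMD_WIDTH_HERE, x and y are
 * SIMD aligned, and the function name scale_add is hypothetical.
 */
#if 0
static void scale_add(int n, real a, const real *x, real *y)
{
    gmx_mm_pr a_S = gmx_set1_pr(a);
    int       i;

    for (i = 0; i < n; i += GMX_SIMD_WIDTH_HERE)
    {
        /* y[i:i+W] += a * x[i:i+W], with W = GMX_SIMD_WIDTH_HERE */
        gmx_store_pr(y + i, gmx_madd_pr(a_S, gmx_load_pr(x + i),
                                        gmx_load_pr(y + i)));
    }
}
#endif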
/* Uncomment the next line, without other SIMD active, for testing plain-C */
/* #define GMX_SIMD_REFERENCE_PLAIN_C */
#ifdef GMX_SIMD_REFERENCE_PLAIN_C
/* Plain C SIMD reference implementation, also serves as documentation */
#define GMX_HAVE_SIMD_MACROS
/* In general the reference SIMD supports any SIMD width, including 1.
 * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
 * The nbnxn 2xnn kernels are currently not supported.
 */
#define GMX_SIMD_REF_WIDTH  4

/* Include plain-C reference implementation, also serves as documentation */
#include "gmx_simd_ref.h"

#define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
/* float/double SIMD register type */
#define gmx_mm_pr  gmx_simd_ref_pr

/* boolean SIMD register type */
#define gmx_mm_pb  gmx_simd_ref_pb

/* integer SIMD register type, only for table indexing and exclusion masks */
#define gmx_epi32  gmx_simd_ref_epi32
#define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH

/* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
#define gmx_load_pr       gmx_simd_ref_load_pr
/* Set all SIMD register elements to *r */
#define gmx_load1_pr      gmx_simd_ref_load1_pr
#define gmx_set1_pr       gmx_simd_ref_set1_pr
#define gmx_setzero_pr    gmx_simd_ref_setzero_pr
#define gmx_store_pr      gmx_simd_ref_store_pr

#define gmx_add_pr        gmx_simd_ref_add_pr
#define gmx_sub_pr        gmx_simd_ref_sub_pr
#define gmx_mul_pr        gmx_simd_ref_mul_pr
/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
#define gmx_madd_pr       gmx_simd_ref_madd_pr
#define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
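
/* Semantics sketch, matching the non-FMA x86 fallbacks later in this file:
 * gmx_madd_pr(a, b, c) computes a*b + c and gmx_nmsub_pr(a, b, c) computes
 * c - a*b, element-wise. Variable names below are hypothetical.
 */
#if 0
fx_S  = gmx_madd_pr(fscal_S, dx_S, fx_S);   /* fx  = fscal*dx + fx */
rem_S = gmx_nmsub_pr(n_S, p_S, x_S);        /* rem = x - n*p       */
#endif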
#define gmx_max_pr        gmx_simd_ref_max_pr
#define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr

#define gmx_round_pr      gmx_simd_ref_round_pr

/* Not required, only used to speed up the nbnxn tabulated PME kernels */
#define GMX_SIMD_HAVE_FLOOR
#ifdef GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      gmx_simd_ref_floor_pr
#endif
/* Not required, only used when blendv is faster than comparison */
#define GMX_SIMD_HAVE_BLENDV
#ifdef GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     gmx_simd_ref_blendv_pr
#endif

/* Copy the sign of a to b, assumes b >= 0 for efficiency */
#define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr

/* Very specific operation required in the non-bonded kernels */
#define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
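
/* Semantics sketch for the two macros above, as implemented by the x86
 * versions later in this file: gmx_cpsgn_nonneg_pr(a, b) returns b with
 * the sign bit of a copied in (valid for b >= 0), i.e. a < 0 ? -b : b;
 * gmx_masknot_add_pr(m, b, c) adds c to b only where the boolean mask m
 * is False. Names below are hypothetical.
 */
#if 0
f_S = gmx_cpsgn_nonneg_pr(dx_S, fabs_S);      /* signed force component */
v_S = gmx_masknot_add_pr(excl_S, v_S, vp_S);  /* skip masked-out pairs  */
#endif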
/* Comparison */
#define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr

/* Logical operations on SIMD booleans */
#define gmx_and_pb        gmx_simd_ref_and_pb
#define gmx_or_pb         gmx_simd_ref_or_pb

/* Not required, gmx_anytrue_pb(x) returns whether any of the booleans in x
 * is True. If this is not present, define GMX_SIMD_IS_TRUE(real x),
 * which should return x==True, where True is True as defined in SIMD.
 */
#define GMX_SIMD_HAVE_ANYTRUE
#ifdef GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
#else
/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
#define gmx_store_pb      gmx_simd_ref_store_pb
#endif
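
/* Usage sketch: gmx_anytrue_pb allows skipping work when no SIMD element
 * passes a test. The names rsq_S and rc2_S are illustrative only.
 */
#if 0
gmx_mm_pb wco_S = gmx_cmplt_pr(rsq_S, rc2_S);

if (gmx_anytrue_pb(wco_S))
{
    /* at least one pair is within the cut-off: compute the interactions */
}
#endif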
/* For topology exclusion pair checking we need: ((a & b) ? True : False),
 * where & is a bit-wise AND between a and b.
 * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b).
 * Otherwise we do all operations, except for the set1, in reals.
 */

#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32          gmx_simd_ref_set1_epi32
#define gmx_load_si             gmx_simd_ref_load_si
#define gmx_checkbitmask_epi32  gmx_simd_ref_checkbitmask_epi32
#endif
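
/* Usage sketch for exclusion checking (names are illustrative): the filter
 * here has a single bit set per element, in which case the result is True
 * exactly where the bit-wise AND of mask and filter is non-zero, which is
 * what the SSE implementation below relies on.
 */
#if 0
gmx_epi32 filter_S = gmx_set1_epi32(1 << i);
gmx_epi32 mask_S   = gmx_load_si(excl_bits);
gmx_mm_pb int_S    = gmx_checkbitmask_epi32(mask_S, filter_S);
#endif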
/* #define GMX_SIMD_HAVE_CHECKBITMASK_PR */
#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_castsi_pr        gmx_simd_ref_castsi_pr
/* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
 * identical 32-bit masks are set in one double and one or both can be used.
 */
#define gmx_checkbitmask_pr  gmx_simd_ref_checkbitmask_pr
#endif
/* Conversions only used for PME table lookup */
#define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
#define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr

/* These two functions only need to be approximate; Newton-Raphson iteration
 * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
 */
#define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
#define gmx_rcp_pr        gmx_simd_ref_rcp_pr
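
/* Sketch of the Newton-Raphson refinement mentioned above: one iteration of
 * y' = 0.5*y*(3 - x*y*y) on the gmx_rsqrt_pr estimate roughly doubles the
 * number of correct bits. The function name is hypothetical; the actual
 * gmx_invsqrt_pr is provided elsewhere (presumably the SIMD math headers).
 */
#if 0
static gmx_inline gmx_mm_pr gmx_invsqrt_sketch_pr(gmx_mm_pr x)
{
    gmx_mm_pr half_S  = gmx_set1_pr(0.5);
    gmx_mm_pr three_S = gmx_set1_pr(3.0);
    gmx_mm_pr y       = gmx_rsqrt_pr(x);

    /* 0.5*y*(3 - x*y*y); recall gmx_nmsub_pr(a, b, c) = c - a*b */
    return gmx_mul_pr(gmx_mul_pr(half_S, y),
                      gmx_nmsub_pr(gmx_mul_pr(y, y), x, three_S));
}
#endif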
/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
#define GMX_SIMD_HAVE_EXP
#ifdef GMX_SIMD_HAVE_EXP
#define gmx_exp_pr        gmx_simd_ref_exp_pr
#endif
#define GMX_SIMD_HAVE_TRIGONOMETRIC
#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
#define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
#define gmx_sincos_pr     gmx_simd_ref_sincos_pr
#define gmx_acos_pr       gmx_simd_ref_acos_pr
#define gmx_atan2_pr      gmx_simd_ref_atan2_pr
#endif

#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 * to instructions) for different SIMD widths and float precisions.
 *
 * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 * Compiler settings will decide if 128-bit intrinsics will
 * be translated into SSE or AVX instructions.
 */
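
/* For example, with GMX_X86_SSE2 in single precision gmx_mul_pr below is
 * defined as _mm_mul_ps, while with GMX_X86_AVX_256 in double precision the
 * same macro becomes _mm256_mul_pd; kernel code written against gmx_mul_pr
 * is unchanged in both cases.
 */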
#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
#if defined GMX_X86_AVX_256
/* We have half SIMD width support, continue */
#else
#error "half SIMD width intrinsics are not supported"
#endif
#endif

#ifdef GMX_X86_SSE2
/* This is for general x86 SIMD instruction sets that also support SSE2 */
#define GMX_HAVE_SIMD_MACROS
/* Include the highest supported x86 SIMD intrinsics + math functions */
#ifdef GMX_X86_AVX_256
#include "gmx_x86_avx_256.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_256_double.h"
#else
#include "gmx_math_x86_avx_256_single.h"
#endif
#else
#ifdef GMX_X86_AVX_128_FMA
#include "gmx_x86_avx_128_fma.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_128_fma_double.h"
#else
#include "gmx_math_x86_avx_128_fma_single.h"
#endif
#else
#ifdef GMX_X86_SSE4_1
#include "gmx_x86_sse4_1.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse4_1_double.h"
#else
#include "gmx_math_x86_sse4_1_single.h"
#endif
#else
#ifdef GMX_X86_SSE2
#include "gmx_x86_sse2.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse2_double.h"
#else
#include "gmx_math_x86_sse2_single.h"
#endif
#else
#error No x86 acceleration defined
#endif
#endif
#endif
#endif

/* exp and trigonometric functions are included above */
#define GMX_SIMD_HAVE_EXP
#define GMX_SIMD_HAVE_TRIGONOMETRIC
#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE

#ifndef GMX_DOUBLE

#define GMX_SIMD_WIDTH_HERE  4

#define gmx_mm_pr  __m128

#define gmx_mm_pb  __m128

#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4

#define gmx_load_pr       _mm_load_ps
#define gmx_load1_pr      _mm_load1_ps
#define gmx_set1_pr       _mm_set1_ps
#define gmx_setzero_pr    _mm_setzero_ps
#define gmx_store_pr      _mm_store_ps

#define gmx_add_pr        _mm_add_ps
#define gmx_sub_pr        _mm_sub_ps
#define gmx_mul_pr        _mm_mul_ps
#ifdef GMX_X86_AVX_128_FMA
#define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
#else
#define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
#define gmx_max_pr        _mm_max_ps
#define gmx_blendzero_pr  _mm_and_ps

#define gmx_cmplt_pr      _mm_cmplt_ps
#define gmx_and_pb        _mm_and_ps
#define gmx_or_pb         _mm_or_ps

#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm_floor_ps
#else
#define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
#endif

#ifdef GMX_X86_SSE4_1
#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm_blendv_ps
#endif

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    /* The value -0.0 has only the sign-bit set */
    gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
    return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm_add_ps(b, _mm_andnot_ps(a, c));
}

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm_movemask_ps

#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32    _mm_set1_epi32
#define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))

#define gmx_cvttpr_epi32  _mm_cvttps_epi32
#define gmx_cvtepi32_pr   _mm_cvtepi32_ps

#define gmx_rsqrt_pr      _mm_rsqrt_ps
#define gmx_rcp_pr        _mm_rcp_ps

#define gmx_exp_pr        gmx_mm_exp_ps
#define gmx_sqrt_pr       gmx_mm_sqrt_ps
#define gmx_sincos_pr     gmx_mm_sincos_ps
#define gmx_acos_pr       gmx_mm_acos_ps
#define gmx_atan2_pr      gmx_mm_atan2_ps
#else /* ifndef GMX_DOUBLE */

#define GMX_SIMD_WIDTH_HERE  2

#define gmx_mm_pr  __m128d

#define gmx_mm_pb  __m128d

#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4

#define gmx_load_pr       _mm_load_pd
#define gmx_load1_pr      _mm_load1_pd
#define gmx_set1_pr       _mm_set1_pd
#define gmx_setzero_pr    _mm_setzero_pd
#define gmx_store_pr      _mm_store_pd

#define gmx_add_pr        _mm_add_pd
#define gmx_sub_pr        _mm_sub_pd
#define gmx_mul_pr        _mm_mul_pd
#ifdef GMX_X86_AVX_128_FMA
#define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
#else
#define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
#endif
#define gmx_max_pr        _mm_max_pd
#define gmx_blendzero_pr  _mm_and_pd

#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm_floor_pd
#else
#define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
#endif

#ifdef GMX_X86_SSE4_1
#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm_blendv_pd
#endif

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
    return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm_add_pd(b, _mm_andnot_pd(a, c));
}

#define gmx_cmplt_pr      _mm_cmplt_pd

#define gmx_and_pb        _mm_and_pd
#define gmx_or_pb         _mm_or_pd

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm_movemask_pd

#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32    _mm_set1_epi32
#define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))

#define gmx_cvttpr_epi32  _mm_cvttpd_epi32
#define gmx_cvtepi32_pr   _mm_cvtepi32_pd

#define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
#define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))

#define gmx_exp_pr        gmx_mm_exp_pd
#define gmx_sqrt_pr       gmx_mm_sqrt_pd
#define gmx_sincos_pr     gmx_mm_sincos_pd
#define gmx_acos_pr       gmx_mm_acos_pd
#define gmx_atan2_pr      gmx_mm_atan2_pd

#endif /* ifndef GMX_DOUBLE */
#else
/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 * so we use 256-bit SIMD.
 */
#ifndef GMX_DOUBLE

#define GMX_SIMD_WIDTH_HERE  8

#define gmx_mm_pr  __m256

#define gmx_mm_pb  __m256

#define gmx_epi32  __m256i
#define GMX_SIMD_EPI32_WIDTH  8

#define gmx_load_pr       _mm256_load_ps
#define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
#define gmx_set1_pr       _mm256_set1_ps
#define gmx_setzero_pr    _mm256_setzero_ps
#define gmx_store_pr      _mm256_store_ps

#define gmx_add_pr        _mm256_add_ps
#define gmx_sub_pr        _mm256_sub_ps
#define gmx_mul_pr        _mm256_mul_ps
#define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
#define gmx_max_pr        _mm256_max_ps
#define gmx_blendzero_pr  _mm256_and_ps

#define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm256_floor_ps

#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm256_blendv_ps

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
    return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
}

/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
#define gmx_and_pb        _mm256_and_ps
#define gmx_or_pb         _mm256_or_ps

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm256_movemask_ps

#define GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_set1_epi32    _mm256_set1_epi32
#define gmx_castsi_pr     _mm256_castsi256_ps
/* With <= 16 bits used the cast and conversion should not be required,
 * since only mantissa bits are set and that would give a non-zero float,
 * but with the Intel compiler this does not work correctly.
 */
#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c)

#define gmx_cvttpr_epi32  _mm256_cvttps_epi32

#define gmx_rsqrt_pr      _mm256_rsqrt_ps
#define gmx_rcp_pr        _mm256_rcp_ps

#define gmx_exp_pr        gmx_mm256_exp_ps
#define gmx_sqrt_pr       gmx_mm256_sqrt_ps
#define gmx_sincos_pr     gmx_mm256_sincos_ps
#define gmx_acos_pr       gmx_mm256_acos_ps
#define gmx_atan2_pr      gmx_mm256_atan2_ps
#else

#define GMX_SIMD_WIDTH_HERE  4

#define gmx_mm_pr  __m256d

#define gmx_mm_pb  __m256d

/* We use 128-bit integer registers because of missing 256-bit operations */
#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4

#define gmx_load_pr       _mm256_load_pd
#define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
#define gmx_set1_pr       _mm256_set1_pd
#define gmx_setzero_pr    _mm256_setzero_pd
#define gmx_store_pr      _mm256_store_pd

#define gmx_add_pr        _mm256_add_pd
#define gmx_sub_pr        _mm256_sub_pd
#define gmx_mul_pr        _mm256_mul_pd
#define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
#define gmx_max_pr        _mm256_max_pd
#define gmx_blendzero_pr  _mm256_and_pd

#define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm256_floor_pd

#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm256_blendv_pd

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
    return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
}

/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)

#define gmx_and_pb        _mm256_and_pd
#define gmx_or_pb         _mm256_or_pd

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm256_movemask_pd

#define GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_set1_epi32    _mm256_set1_epi32
#define gmx_castsi_pr     _mm256_castsi256_pd
/* With <= 16 bits used the cast and conversion should not be required,
 * since only mantissa bits are set and that would give a non-zero float,
 * but with the Intel compiler this does not work correctly.
 * Because AVX does not have int->double conversion, we convert via float.
 */
#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c)

#define gmx_cvttpr_epi32  _mm256_cvttpd_epi32

#define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
#define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))

#define gmx_exp_pr        gmx_mm256_exp_pd
#define gmx_sqrt_pr       gmx_mm256_sqrt_pd
#define gmx_sincos_pr     gmx_mm256_sincos_pd
#define gmx_acos_pr       gmx_mm256_acos_pd
#define gmx_atan2_pr      gmx_mm256_atan2_pd
#endif /* GMX_DOUBLE */

#endif /* 128- or 256-bit x86 SIMD */

#endif /* GMX_X86_SSE2 */
#ifdef GMX_HAVE_SIMD_MACROS
/* Generic functions to extract a SIMD aligned pointer from a pointer x.
 * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
 * to how many you want to use, to avoid indexing outside the aligned region.
 */

static gmx_inline real *
gmx_simd_align_real(const real *x)
{
    return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
}

static gmx_inline int *
gmx_simd_align_int(const int *x)
{
    return (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
}
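
/* Usage sketch: over-allocate by GMX_SIMD_WIDTH_HERE elements, then align.
 * N is an illustrative constant.
 */
#if 0
real  buf[N + GMX_SIMD_WIDTH_HERE];
real *buf_aligned = gmx_simd_align_real(buf);

/* buf_aligned[0..N-1] can now be used with gmx_load_pr/gmx_store_pr */
#endif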
/* Include the math functions which only need the above macros,
 * generally these are the ones that don't need masking operations.
 */
#ifdef GMX_DOUBLE
#include "gmx_simd_math_double.h"
#else
#include "gmx_simd_math_single.h"
#endif

#endif /* GMX_HAVE_SIMD_MACROS */

#endif /* _gmx_simd_macros_h_ */