/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* The macros in this file are intended to be used for writing
 * architecture-independent SIMD intrinsics code.
 * To support a new architecture, adding macros here should be (nearly)
 * all that is needed.
 */
#ifdef _gmx_simd_macros_h_
#error "gmx_simd_macros.h included twice"
#else
#define _gmx_simd_macros_h_

/* NOTE: SSE2 acceleration does not include floor or blendv */
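
/* A minimal usage sketch (not compiled): with the macros defined below,
 * the same loop body runs on any supported architecture. Assumptions for
 * illustration only: n is a multiple of GMX_SIMD_WIDTH_HERE, x and y are
 * SIMD aligned, and the function name scale_add is hypothetical.
 */
#if 0
static void scale_add(int n, real a, const real *x, real *y)
{
    gmx_mm_pr a_S = gmx_set1_pr(a);
    int       i;

    for (i = 0; i < n; i += GMX_SIMD_WIDTH_HERE)
    {
        /* y[i:i+W] += a * x[i:i+W], with W = GMX_SIMD_WIDTH_HERE */
        gmx_store_pr(y + i, gmx_madd_pr(a_S, gmx_load_pr(x + i),
                                        gmx_load_pr(y + i)));
    }
}
#endif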
/* Uncomment the next line, without other SIMD active, for testing plain-C */
/* #define GMX_SIMD_REFERENCE_PLAIN_C */
#ifdef GMX_SIMD_REFERENCE_PLAIN_C
/* Plain C SIMD reference implementation, also serves as documentation */
#define GMX_HAVE_SIMD_MACROS
/* In general the reference SIMD supports any SIMD width, including 1.
 * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
 * The nbnxn 2xnn kernels are currently not supported.
 */
#define GMX_SIMD_REF_WIDTH  4

/* Include plain-C reference implementation, also serves as documentation */
#include "gmx_simd_ref.h"

#define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
/* float/double SIMD register type */
#define gmx_mm_pr  gmx_simd_ref_pr

/* boolean SIMD register type */
#define gmx_mm_pb  gmx_simd_ref_pb

/* integer SIMD register type, only for table indexing and exclusion masks */
#define gmx_epi32  gmx_simd_ref_epi32
#define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH

/* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
#define gmx_load_pr       gmx_simd_ref_load_pr
/* Set all SIMD register elements to *r */
#define gmx_load1_pr      gmx_simd_ref_load1_pr
#define gmx_set1_pr       gmx_simd_ref_set1_pr
#define gmx_setzero_pr    gmx_simd_ref_setzero_pr
#define gmx_store_pr      gmx_simd_ref_store_pr

#define gmx_add_pr        gmx_simd_ref_add_pr
#define gmx_sub_pr        gmx_simd_ref_sub_pr
#define gmx_mul_pr        gmx_simd_ref_mul_pr
/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
#define gmx_madd_pr       gmx_simd_ref_madd_pr
#define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
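
/* Semantics sketch, matching the non-FMA x86 fallbacks later in this file:
 * gmx_madd_pr(a, b, c) computes a*b + c and gmx_nmsub_pr(a, b, c) computes
 * c - a*b, element-wise. Variable names below are hypothetical.
 */
#if 0
fx_S  = gmx_madd_pr(fscal_S, dx_S, fx_S);   /* fx  = fscal*dx + fx */
rem_S = gmx_nmsub_pr(n_S, p_S, x_S);        /* rem = x - n*p       */
#endif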
#define gmx_max_pr        gmx_simd_ref_max_pr
#define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr

#define gmx_round_pr      gmx_simd_ref_round_pr

/* Not required, only used to speed up the nbnxn tabulated PME kernels */
#define GMX_SIMD_HAVE_FLOOR
#ifdef GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      gmx_simd_ref_floor_pr
#endif
/* Not required, only used when blendv is faster than comparison */
#define GMX_SIMD_HAVE_BLENDV
#ifdef GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     gmx_simd_ref_blendv_pr
#endif

/* Copy the sign of a to b, assumes b >= 0 for efficiency */
#define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr

/* Very specific operation required in the non-bonded kernels */
#define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
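
/* Semantics sketch for the two macros above, as implemented by the x86
 * versions later in this file: gmx_cpsgn_nonneg_pr(a, b) returns b with
 * the sign bit of a copied in (valid for b >= 0), i.e. a < 0 ? -b : b;
 * gmx_masknot_add_pr(m, b, c) adds c to b only where the boolean mask m
 * is False. Names below are hypothetical.
 */
#if 0
f_S = gmx_cpsgn_nonneg_pr(dx_S, fabs_S);      /* signed force component */
v_S = gmx_masknot_add_pr(excl_S, v_S, vp_S);  /* skip masked-out pairs  */
#endif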
/* Comparison */
#define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr

/* Logical operations on SIMD booleans */
#define gmx_and_pb        gmx_simd_ref_and_pb
#define gmx_or_pb         gmx_simd_ref_or_pb

/* Not required, gmx_anytrue_pb(x) returns whether any of the booleans in x
 * is True. If this is not present, define GMX_SIMD_IS_TRUE(real x),
 * which should return x==True, where True is True as defined in SIMD.
 */
#define GMX_SIMD_HAVE_ANYTRUE
#ifdef GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
#else
/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
#define gmx_store_pb      gmx_simd_ref_store_pb
#endif
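
/* Usage sketch: gmx_anytrue_pb allows skipping work when no SIMD element
 * passes a test. The names rsq_S and rc2_S are illustrative only.
 */
#if 0
gmx_mm_pb wco_S = gmx_cmplt_pr(rsq_S, rc2_S);

if (gmx_anytrue_pb(wco_S))
{
    /* at least one pair is within the cut-off: compute the interactions */
}
#endif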
/* For topology exclusion pair checking we need: ((a & b) ? True : False),
 * where & is a bit-wise AND between a and b.
 * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b).
 * Otherwise we do all operations, except for the set1, in reals.
 */

#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32          gmx_simd_ref_set1_epi32
#define gmx_load_si             gmx_simd_ref_load_si
#define gmx_checkbitmask_epi32  gmx_simd_ref_checkbitmask_epi32
#endif
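
/* Usage sketch for exclusion checking (names are illustrative): the filter
 * here has a single bit set per element, in which case the result is True
 * exactly where the bit-wise AND of mask and filter is non-zero, which is
 * what the SSE implementation below relies on.
 */
#if 0
gmx_epi32 filter_S = gmx_set1_epi32(1 << i);
gmx_epi32 mask_S   = gmx_load_si(excl_bits);
gmx_mm_pb int_S    = gmx_checkbitmask_epi32(mask_S, filter_S);
#endif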
/* #define GMX_SIMD_HAVE_CHECKBITMASK_PR */
#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_castsi_pr        gmx_simd_ref_castsi_pr
/* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
 * identical 32-bit masks are set in one double and one or both can be used.
 */
#define gmx_checkbitmask_pr  gmx_simd_ref_checkbitmask_pr
#endif
/* Conversions only used for PME table lookup */
#define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
#define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr

/* These two functions only need to be approximate; Newton-Raphson iteration
 * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
 */
#define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
#define gmx_rcp_pr        gmx_simd_ref_rcp_pr
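
/* Sketch of the Newton-Raphson refinement mentioned above: one iteration of
 * y' = 0.5*y*(3 - x*y*y) on the gmx_rsqrt_pr estimate roughly doubles the
 * number of correct bits. The function name is hypothetical; the actual
 * gmx_invsqrt_pr is provided elsewhere (presumably the SIMD math headers).
 */
#if 0
static gmx_inline gmx_mm_pr gmx_invsqrt_sketch_pr(gmx_mm_pr x)
{
    gmx_mm_pr half_S  = gmx_set1_pr(0.5);
    gmx_mm_pr three_S = gmx_set1_pr(3.0);
    gmx_mm_pr y       = gmx_rsqrt_pr(x);

    /* 0.5*y*(3 - x*y*y); recall gmx_nmsub_pr(a, b, c) = c - a*b */
    return gmx_mul_pr(gmx_mul_pr(half_S, y),
                      gmx_nmsub_pr(gmx_mul_pr(y, y), x, three_S));
}
#endif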
/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
#define GMX_SIMD_HAVE_EXP
#ifdef GMX_SIMD_HAVE_EXP
#define gmx_exp_pr        gmx_simd_ref_exp_pr
#endif
#define GMX_SIMD_HAVE_TRIGONOMETRIC
#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
#define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
#define gmx_sincos_pr     gmx_simd_ref_sincos_pr
#define gmx_acos_pr       gmx_simd_ref_acos_pr
#define gmx_atan2_pr      gmx_simd_ref_atan2_pr
#endif

#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 * to instructions) for different SIMD widths and float precisions.
 *
 * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 * Compiler settings will decide if 128-bit intrinsics will
 * be translated into SSE or AVX instructions.
 */
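
/* For example, with GMX_X86_SSE2 in single precision gmx_mul_pr below is
 * defined as _mm_mul_ps, while with GMX_X86_AVX_256 in double precision the
 * same macro becomes _mm256_mul_pd; kernel code written against gmx_mul_pr
 * is unchanged in both cases.
 */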
#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
#if defined GMX_X86_AVX_256
/* We have half SIMD width support, continue */
#else
#error "half SIMD width intrinsics are not supported"
#endif
#endif

#ifdef GMX_X86_SSE2
/* This is for general x86 SIMD instruction sets that also support SSE2 */
#define GMX_HAVE_SIMD_MACROS
/* Include the highest supported x86 SIMD intrinsics + math functions */
#ifdef GMX_X86_AVX_256
#include "gmx_x86_avx_256.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_256_double.h"
#else
#include "gmx_math_x86_avx_256_single.h"
#endif
#else
#ifdef GMX_X86_AVX_128_FMA
#include "gmx_x86_avx_128_fma.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_128_fma_double.h"
#else
#include "gmx_math_x86_avx_128_fma_single.h"
#endif
#else
#ifdef GMX_X86_SSE4_1
#include "gmx_x86_sse4_1.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse4_1_double.h"
#else
#include "gmx_math_x86_sse4_1_single.h"
#endif
#else
#ifdef GMX_X86_SSE2
#include "gmx_x86_sse2.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse2_double.h"
#else
#include "gmx_math_x86_sse2_single.h"
#endif
#else
#error No x86 acceleration defined
#endif
#endif
#endif
#endif

/* exp and trigonometric functions are included above */
#define GMX_SIMD_HAVE_EXP
#define GMX_SIMD_HAVE_TRIGONOMETRIC
#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE

#ifndef GMX_DOUBLE

#define GMX_SIMD_WIDTH_HERE  4

#define gmx_mm_pr  __m128

#define gmx_mm_pb  __m128

#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4

#define gmx_load_pr       _mm_load_ps
#define gmx_load1_pr      _mm_load1_ps
#define gmx_set1_pr       _mm_set1_ps
#define gmx_setzero_pr    _mm_setzero_ps
#define gmx_store_pr      _mm_store_ps

#define gmx_add_pr        _mm_add_ps
#define gmx_sub_pr        _mm_sub_ps
#define gmx_mul_pr        _mm_mul_ps
#ifdef GMX_X86_AVX_128_FMA
#define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
#else
#define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
#define gmx_max_pr        _mm_max_ps
#define gmx_blendzero_pr  _mm_and_ps

#define gmx_cmplt_pr      _mm_cmplt_ps
#define gmx_and_pb        _mm_and_ps
#define gmx_or_pb         _mm_or_ps

#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm_floor_ps
#else
#define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
#endif

#ifdef GMX_X86_SSE4_1
#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm_blendv_ps
#endif

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    /* The value -0.0 has only the sign-bit set */
    gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
    return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm_add_ps(b, _mm_andnot_ps(a, c));
}

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm_movemask_ps

#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32    _mm_set1_epi32
#define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))

#define gmx_cvttpr_epi32  _mm_cvttps_epi32
#define gmx_cvtepi32_pr   _mm_cvtepi32_ps

#define gmx_rsqrt_pr      _mm_rsqrt_ps
#define gmx_rcp_pr        _mm_rcp_ps

#define gmx_exp_pr        gmx_mm_exp_ps
#define gmx_sqrt_pr       gmx_mm_sqrt_ps
#define gmx_sincos_pr     gmx_mm_sincos_ps
#define gmx_acos_pr       gmx_mm_acos_ps
#define gmx_atan2_pr      gmx_mm_atan2_ps
#else /* ifndef GMX_DOUBLE */

#define GMX_SIMD_WIDTH_HERE  2

#define gmx_mm_pr  __m128d

#define gmx_mm_pb  __m128d

#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4

#define gmx_load_pr       _mm_load_pd
#define gmx_load1_pr      _mm_load1_pd
#define gmx_set1_pr       _mm_set1_pd
#define gmx_setzero_pr    _mm_setzero_pd
#define gmx_store_pr      _mm_store_pd

#define gmx_add_pr        _mm_add_pd
#define gmx_sub_pr        _mm_sub_pd
#define gmx_mul_pr        _mm_mul_pd
#ifdef GMX_X86_AVX_128_FMA
#define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
#else
#define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
#endif
#define gmx_max_pr        _mm_max_pd
#define gmx_blendzero_pr  _mm_and_pd

#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm_floor_pd
#else
#define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
#endif

#ifdef GMX_X86_SSE4_1
#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm_blendv_pd
#endif

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
    return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm_add_pd(b, _mm_andnot_pd(a, c));
}

#define gmx_cmplt_pr      _mm_cmplt_pd

#define gmx_and_pb        _mm_and_pd
#define gmx_or_pb         _mm_or_pd

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm_movemask_pd

#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32    _mm_set1_epi32
#define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))

#define gmx_cvttpr_epi32  _mm_cvttpd_epi32
#define gmx_cvtepi32_pr   _mm_cvtepi32_pd

#define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
#define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))

#define gmx_exp_pr        gmx_mm_exp_pd
#define gmx_sqrt_pr       gmx_mm_sqrt_pd
#define gmx_sincos_pr     gmx_mm_sincos_pd
#define gmx_acos_pr       gmx_mm_acos_pd
#define gmx_atan2_pr      gmx_mm_atan2_pd

#endif /* ifndef GMX_DOUBLE */
#else
/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 * so we use 256-bit SIMD.
 */
#ifndef GMX_DOUBLE

#define GMX_SIMD_WIDTH_HERE  8

#define gmx_mm_pr  __m256

#define gmx_mm_pb  __m256

#define gmx_epi32  __m256i
#define GMX_SIMD_EPI32_WIDTH  8

#define gmx_load_pr       _mm256_load_ps
#define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
#define gmx_set1_pr       _mm256_set1_ps
#define gmx_setzero_pr    _mm256_setzero_ps
#define gmx_store_pr      _mm256_store_ps

#define gmx_add_pr        _mm256_add_ps
#define gmx_sub_pr        _mm256_sub_ps
#define gmx_mul_pr        _mm256_mul_ps
#define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
#define gmx_max_pr        _mm256_max_ps
#define gmx_blendzero_pr  _mm256_and_ps

#define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm256_floor_ps

#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm256_blendv_ps

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
    return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
}

/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
#define gmx_and_pb        _mm256_and_ps
#define gmx_or_pb         _mm256_or_ps

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm256_movemask_ps

#define GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_set1_epi32    _mm256_set1_epi32
#define gmx_castsi_pr     _mm256_castsi256_ps
/* With <= 16 bits used the cast and conversion should not be required,
 * since only mantissa bits are set and that would give a non-zero float,
 * but with the Intel compiler this does not work correctly.
 */
#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c)

#define gmx_cvttpr_epi32  _mm256_cvttps_epi32

#define gmx_rsqrt_pr      _mm256_rsqrt_ps
#define gmx_rcp_pr        _mm256_rcp_ps

#define gmx_exp_pr        gmx_mm256_exp_ps
#define gmx_sqrt_pr       gmx_mm256_sqrt_ps
#define gmx_sincos_pr     gmx_mm256_sincos_ps
#define gmx_acos_pr       gmx_mm256_acos_ps
#define gmx_atan2_pr      gmx_mm256_atan2_ps
#else

#define GMX_SIMD_WIDTH_HERE  4

#define gmx_mm_pr  __m256d

#define gmx_mm_pb  __m256d

/* We use 128-bit integer registers because of missing 256-bit operations */
#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4

#define gmx_load_pr       _mm256_load_pd
#define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
#define gmx_set1_pr       _mm256_set1_pd
#define gmx_setzero_pr    _mm256_setzero_pd
#define gmx_store_pr      _mm256_store_pd

#define gmx_add_pr        _mm256_add_pd
#define gmx_sub_pr        _mm256_sub_pd
#define gmx_mul_pr        _mm256_mul_pd
#define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
#define gmx_max_pr        _mm256_max_pd
#define gmx_blendzero_pr  _mm256_and_pd

#define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm256_floor_pd

#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm256_blendv_pd

static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
    return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
}

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
{
    return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
}

/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)

#define gmx_and_pb        _mm256_and_pd
#define gmx_or_pb         _mm256_or_pd

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm256_movemask_pd

#define GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_set1_epi32    _mm256_set1_epi32
#define gmx_castsi_pr     _mm256_castsi256_pd
/* With <= 16 bits used the cast and conversion should not be required,
 * since only mantissa bits are set and that would give a non-zero float,
 * but with the Intel compiler this does not work correctly.
 * Because AVX does not have int->double conversion, we convert via float.
 */
#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c)

#define gmx_cvttpr_epi32  _mm256_cvttpd_epi32

#define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
#define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))

#define gmx_exp_pr        gmx_mm256_exp_pd
#define gmx_sqrt_pr       gmx_mm256_sqrt_pd
#define gmx_sincos_pr     gmx_mm256_sincos_pd
#define gmx_acos_pr       gmx_mm256_acos_pd
#define gmx_atan2_pr      gmx_mm256_atan2_pd
#endif /* GMX_DOUBLE */

#endif /* 128- or 256-bit x86 SIMD */

#endif /* GMX_X86_SSE2 */
#ifdef GMX_HAVE_SIMD_MACROS
/* Generic functions to extract a SIMD aligned pointer from a pointer x.
 * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
 * to how many you want to use, to avoid indexing outside the aligned region.
 */

static gmx_inline real *
gmx_simd_align_real(const real *x)
{
    return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
}

static gmx_inline int *
gmx_simd_align_int(const int *x)
{
    return (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
}
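
/* Usage sketch: over-allocate by GMX_SIMD_WIDTH_HERE elements, then align.
 * N is an illustrative constant.
 */
#if 0
real  buf[N + GMX_SIMD_WIDTH_HERE];
real *buf_aligned = gmx_simd_align_real(buf);

/* buf_aligned[0..N-1] can now be used with gmx_load_pr/gmx_store_pr */
#endif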
/* Include the math functions which only need the above macros,
 * generally these are the ones that don't need masking operations.
 */
#ifdef GMX_DOUBLE
#include "gmx_simd_math_double.h"
#else
#include "gmx_simd_math_single.h"
#endif

#endif /* GMX_HAVE_SIMD_MACROS */

#endif /* _gmx_simd_macros_h_ */