From b608083a6f73474bbc02b486c3792947128d8cff Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Thu, 1 Aug 2013 14:34:22 +0200 Subject: [PATCH] introduced general 4-wide SIMD support PME spread+gather and the nbnxn search bounding box checks use 4-wide SIMD (as opposed to arbitrary width SIMD). This SSE code has now been replaced by macros from gmx_simd4_macros.h. pme_sse_single.h has been renamed to pme_simd4.h This change is mainly refactoring; it only adds PME spread+gather AVX acceleration in double precision plus a few FMA instructions. Change-Id: Ia5e02295bb281a2e23d57f4c165f555de6744064 --- include/gmx_simd4_macros.h | 309 +++++++++++++++++++++++++ include/gmx_simd4_ref.h | 308 +++++++++++++++++++++++++ include/gmx_simd_macros.h | 15 +- include/gmx_simd_ref.h | 17 +- src/mdlib/nbnxn_internal.h | 45 ++-- src/mdlib/nbnxn_search.c | 445 ++++++++++++++++++------------------- src/mdlib/nbnxn_search_simd_2xnn.h | 134 +++++------ src/mdlib/nbnxn_search_simd_4xn.h | 218 ++++++++---------- src/mdlib/pme.c | 148 +++++++----- src/mdlib/pme_simd4.h | 356 +++++++++++++++++++++++++++++ src/mdlib/pme_sse_single.h | 325 --------------------------- 11 files changed, 1451 insertions(+), 869 deletions(-) create mode 100644 include/gmx_simd4_macros.h create mode 100644 include/gmx_simd4_ref.h create mode 100644 src/mdlib/pme_simd4.h delete mode 100644 src/mdlib/pme_sse_single.h diff --git a/include/gmx_simd4_macros.h b/include/gmx_simd4_macros.h new file mode 100644 index 0000000000..7ee1581807 --- /dev/null +++ b/include/gmx_simd4_macros.h @@ -0,0 +1,309 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * Copyright (c) 2012,2013, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/* The macros in this file are intended to be used for writing + * architecture-independent SIMD intrinsics code with a SIMD width of 4. 
+ * To support a new architecture, adding macros here should be all + * that is needed. + * + * Note that this file is intended only for SIMD operations that require + * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware + * support, more functionality and higher performance, but the SIMD width is + * not necessarily equal to 4. + */ + +#ifdef _gmx_simd4_macros_h_ +#error "gmx_simd4_macros.h included twice" +#else +#define _gmx_simd4_macros_h_ + + +/* The SIMD width here is always 4, since that is the whole point */ +#define GMX_SIMD4_WIDTH 4 + + +#if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE +/* Precision set before inclusion, honour that request */ +#else +/* Match precision to the Gromacs real precision */ +#ifdef GMX_DOUBLE +#define GMX_SIMD4_DOUBLE +#else +#define GMX_SIMD4_SINGLE +#endif +#endif + +#ifdef GMX_SIMD4_DOUBLE +typedef double gmx_simd4_real; +#endif +#ifdef GMX_SIMD4_SINGLE +typedef float gmx_simd4_real; +#endif + +/* Uncomment the next line, without other SIMD active, for testing plain-C */ +/* #define GMX_SIMD4_REFERENCE_PLAIN_C */ +#ifdef GMX_SIMD4_REFERENCE_PLAIN_C +/* Plain C SIMD reference implementation, also serves as documentation */ +#define GMX_HAVE_SIMD4_MACROS + +/* Include plain-C reference implementation, also serves as documentation */ +#include "gmx_simd4_ref.h" + +/* float/double SIMD register type */ +#define gmx_simd4_pr gmx_simd4_ref_pr + +/* boolean SIMD register type */ +#define gmx_simd4_pb gmx_simd4_ref_pb + +#define gmx_simd4_load_pr gmx_simd4_ref_load_pr +#define gmx_simd4_set1_pr gmx_simd4_ref_set1_pr +#define gmx_simd4_setzero_pr gmx_simd4_ref_setzero_pr +#define gmx_simd4_store_pr gmx_simd4_ref_store_pr + +/* Unaligned load+store are not required, + * but they can speed up the PME spread+gather operations. + */ +#define GMX_SIMD4_HAVE_UNALIGNED +#ifdef GMX_SIMD4_HAVE_UNALIGNED +#define gmx_simd4_loadu_pr gmx_simd4_ref_load_pr +#define gmx_simd4_storeu_pr gmx_simd4_ref_store_pr +#endif + +#define gmx_simd4_add_pr gmx_simd4_ref_add_pr +#define gmx_simd4_sub_pr gmx_simd4_ref_sub_pr +#define gmx_simd4_mul_pr gmx_simd4_ref_mul_pr +/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */ +#define gmx_simd4_madd_pr gmx_simd4_ref_madd_pr +#define gmx_simd4_nmsub_pr gmx_simd4_ref_nmsub_pr + +#define gmx_simd4_dotproduct3 gmx_simd4_ref_dotproduct3 + +#define gmx_simd4_min_pr gmx_simd4_ref_min_pr +#define gmx_simd4_max_pr gmx_simd4_ref_max_pr + +#define gmx_simd4_blendzero_pr gmx_simd4_ref_blendzero_pr + +/* Comparison */ +#define gmx_simd4_cmplt_pr gmx_simd4_ref_cmplt_pr + +/* Logical operations on SIMD booleans */ +#define gmx_simd4_and_pb gmx_simd4_ref_and_pb +#define gmx_simd4_or_pb gmx_simd4_ref_or_pb + +/* Returns a single int (0/1) which tells if any of the 4 booleans is True */ +#define gmx_simd4_anytrue_pb gmx_simd4_ref_anytrue_pb + +#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */ + + +/* The same SIMD macros can be translated to SIMD intrinsics (and compiled + * to instructions for) different SIMD width and float precision. + * + * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX). + * The _pr suffix is replaced by _ps or _pd (for single or double precision). + * Compiler settings will decide if 128-bit intrinsics will + * be translated into SSE or AVX instructions. 
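
A minimal usage sketch of the 4-wide layer defined in this file (illustrative only; simd4_madd_example is not a function from this patch). It assumes GMX_HAVE_SIMD4_MACROS ends up defined and that all pointers are SIMD4-aligned:

    /* c[0..3] += a[0..3]*b[0..3] for one quadruplet of reals */
    static gmx_inline void simd4_madd_example(const gmx_simd4_real *a,
                                              const gmx_simd4_real *b,
                                              gmx_simd4_real       *c)
    {
        gmx_simd4_pr a_S, b_S, c_S;

        a_S = gmx_simd4_load_pr(a);
        b_S = gmx_simd4_load_pr(b);
        /* FMA form a*b + c; storing back into c follows the FMA usage note above */
        c_S = gmx_simd4_madd_pr(a_S, b_S, gmx_simd4_load_pr(c));
        gmx_simd4_store_pr(c, c_S);
    }

The same source compiles to SSE, AVX-128/FMA or AVX-256 instructions, depending on which of the blocks below is selected at compile time.
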
+ */ + + +#ifdef GMX_X86_SSE2 +/* This is for general x86 SIMD instruction sets that also support SSE2 */ + +#ifdef GMX_SIMD4_SINGLE +#define GMX_HAVE_SIMD4_MACROS +#endif + +#ifdef GMX_SIMD4_DOUBLE +/* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA. + * This is inconsistent naming wise, but should give the best performance. + */ +#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 +#define GMX_HAVE_SIMD4_MACROS +#endif +#endif + +#ifdef GMX_HAVE_SIMD4_MACROS + +#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256 + +#include +#ifdef HAVE_X86INTRIN_H +#include /* FMA */ +#endif +#ifdef HAVE_INTRIN_H +#include /* FMA MSVC */ +#endif + +#else +#ifdef GMX_X86_SSE4_1 +#include +#else +/* We only have SSE2 */ +#include +#endif +#endif + +#ifdef GMX_SIMD4_SINGLE + +#define gmx_simd4_pr __m128 + +#define gmx_simd4_pb __m128 + +#define gmx_simd4_load_pr _mm_load_ps +#define gmx_simd4_set1_pr _mm_set1_ps +#define gmx_simd4_setzero_pr _mm_setzero_ps +#define gmx_simd4_store_pr _mm_store_ps + +/* Some old AMD processors could have problems with unaligned loads+stores */ +#ifndef GMX_FAHCORE +#define GMX_SIMD4_HAVE_UNALIGNED +#endif +#ifdef GMX_SIMD4_HAVE_UNALIGNED +#define gmx_simd4_loadu_pr _mm_loadu_ps +#define gmx_simd4_storeu_pr _mm_storeu_ps +#endif + +#define gmx_simd4_add_pr _mm_add_ps +#define gmx_simd4_sub_pr _mm_sub_ps +#define gmx_simd4_mul_pr _mm_mul_ps + +#ifdef GMX_X86_AVX_128_FMA +#define gmx_simd4_madd_pr(a, b, c) _mm_macc_ps(a, b, c) +#define gmx_simd4_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c) +#else +#define gmx_simd4_madd_pr(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) +#define gmx_simd4_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) +#endif + +static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b) +#ifdef GMX_X86_SSE4_1 +{ + float dp; + + /* SSE4.1 dot product of components 0,1,2, stored in component 0 */ + _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71)); + + return dp; +} +#else +{ + float dp_array[7], *dp; + + /* Generate an aligned pointer */ + dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15))); + + _mm_store_ps(dp, _mm_mul_ps(a, b)); + + return dp[0] + dp[1] + dp[2]; +} +#endif + +#define gmx_simd4_min_pr _mm_min_ps +#define gmx_simd4_max_pr _mm_max_ps + +#define gmx_simd4_blendzero_pr _mm_and_ps + +#define gmx_simd4_cmplt_pr _mm_cmplt_ps +#define gmx_simd4_and_pb _mm_and_ps +#define gmx_simd4_or_pb _mm_or_ps + +#define gmx_simd4_anytrue_pb _mm_movemask_ps + +#endif /* GMX_SIMD4_SINGLE */ + + +#ifdef GMX_SIMD4_DOUBLE + +#define gmx_simd4_pr __m256d + +#define gmx_simd4_pb __m256d + +#define gmx_simd4_load_pr _mm256_load_pd +#define gmx_simd4_set1_pr _mm256_set1_pd +#define gmx_simd4_setzero_pr _mm256_setzero_pd +#define gmx_simd4_store_pr _mm256_store_pd + +#define GMX_SIMD4_HAVE_UNALIGNED +#define gmx_simd4_loadu_pr _mm256_loadu_pd +#define gmx_simd4_storeu_pr _mm256_storeu_pd + +#define gmx_simd4_add_pr _mm256_add_pd +#define gmx_simd4_sub_pr _mm256_sub_pd +#define gmx_simd4_mul_pr _mm256_mul_pd +#ifdef GMX_X86_AVX_128_FMA +#define gmx_simd4_madd_pr(a, b, c) _mm256_macc_pd(a, b, c) +#define gmx_simd4_nmsub_pr(a, b, c) _mm256_nmacc_pd(a, b, c) +#else +#define gmx_simd4_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b)) +#define gmx_simd4_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b)) +#endif +#define gmx_simd4_min_pr _mm256_min_pd +#define gmx_simd4_max_pr _mm256_max_pd + +#define gmx_simd4_blendzero_pr _mm256_and_pd + +/* Less-than (we use ordered, non-signaling, but that's not required) */ +#define gmx_simd4_cmplt_pr(x, y) 
_mm256_cmp_pd(x, y, 0x11) +#define gmx_simd4_and_pb _mm256_and_pd +#define gmx_simd4_or_pb _mm256_or_pd + +#define gmx_simd4_anytrue_pb _mm256_movemask_pd + +#endif /* GMX_SIMD4_DOUBLE */ + + +#endif /* GMX_HAVE_SIMD4_MACROS */ + + +#endif /* GMX_X86_SSE2 */ + + +#ifdef GMX_HAVE_SIMD4_MACROS +/* Generic functions to extract a SIMD4 aligned pointer from a pointer x. + * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared + * to how many you want to use, to avoid indexing outside the aligned region. + */ + +static gmx_inline gmx_simd4_real * +gmx_simd4_align_real(const gmx_simd4_real *x) +{ + return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1)))); +} +#endif + + +#endif /* _gmx_simd4_macros_h_ */ diff --git a/include/gmx_simd4_ref.h b/include/gmx_simd4_ref.h new file mode 100644 index 0000000000..192a6105a9 --- /dev/null +++ b/include/gmx_simd4_ref.h @@ -0,0 +1,308 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * Copyright (c) 2012,2013, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef _gmx_simd4_ref_h_ +#define _gmx_simd4_ref_h_ + +/* This file contains a reference plain-C implementation of 4-wide SIMD. + * This code is only useful for testing and documentation. 
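
The usual way to obtain the aligned pointer that gmx_simd4_align_real() (defined just above, at the end of gmx_simd4_macros.h) provides is to over-allocate by GMX_SIMD4_WIDTH elements, as its comment requires. A sketch with illustrative names (buf_mem, buf):

    gmx_simd4_real  buf_mem[16 + GMX_SIMD4_WIDTH];
    gmx_simd4_real *buf;

    buf = gmx_simd4_align_real(buf_mem);   /* 16 usable, SIMD4-aligned reals */
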
+ * Either float or double precision is supported through gmx_simd4_real, + * which is set in gmx_simd4_macros.h + */ + + +#include + +/* float/double SIMD register type */ +typedef struct { + gmx_simd4_real r[GMX_SIMD4_WIDTH]; +} gmx_simd4_ref_pr; + +/* boolean SIMD register type */ +typedef struct { + char r[GMX_SIMD4_WIDTH]; +} gmx_simd4_ref_pb; + + +/* Load GMX_SIMD4_WIDTH reals for memory starting at r */ +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_load_pr(const gmx_simd4_real *r) +{ + gmx_simd4_ref_pr a; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + a.r[i] = r[i]; + } + + return a; +} + +/* Set all SIMD register elements to r */ +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_set1_pr(gmx_simd4_real r) +{ + gmx_simd4_ref_pr a; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + a.r[i] = r; + } + + return a; +} + +/* Set all SIMD register elements to 0 */ +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_setzero_pr() +{ + gmx_simd4_ref_pr a; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + a.r[i] = 0.0; + } + + return a; +} + +static gmx_inline void +gmx_simd4_ref_store_pr(gmx_simd4_real *dest, gmx_simd4_ref_pr src) +{ + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + dest[i] = src.r[i]; + } +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_add_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = a.r[i] + b.r[i]; + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_sub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = a.r[i] - b.r[i]; + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_mul_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = a.r[i]*b.r[i]; + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_madd_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c) +{ + gmx_simd4_ref_pr d; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + d.r[i] = a.r[i]*b.r[i] + c.r[i]; + } + + return d; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_nmsub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c) +{ + gmx_simd4_ref_pr d; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + d.r[i] = -a.r[i]*b.r[i] + c.r[i]; + } + + return d; +} + +static gmx_inline gmx_simd4_real +gmx_simd4_ref_dotproduct3(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_real dp; + int i; + + dp = 0.0; + for (i = 0; i < 3; i++) + { + dp += a.r[i]*b.r[i]; + } + + return dp; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_min_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]); + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_max_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]); + } + + return c; +} + +static gmx_inline gmx_simd4_ref_pr +gmx_simd4_ref_blendzero_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pb b) +{ + gmx_simd4_ref_pr c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (b.r[i] ? 
a.r[i] : 0.0); + } + + return c; +} + +/* Comparison */ +static gmx_inline gmx_simd4_ref_pb +gmx_simd4_ref_cmplt_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b) +{ + gmx_simd4_ref_pb c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] < b.r[i]); + } + + return c; +} + +/* Logical AND on SIMD booleans */ +static gmx_inline gmx_simd4_ref_pb +gmx_simd4_ref_and_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b) +{ + gmx_simd4_ref_pb c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] && b.r[i]); + } + + return c; +} + +/* Logical OR on SIMD booleans */ +static gmx_inline gmx_simd4_ref_pb +gmx_simd4_ref_or_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b) +{ + gmx_simd4_ref_pb c; + int i; + + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + c.r[i] = (a.r[i] || b.r[i]); + } + + return c; +} + +/* gmx_anytrue_pb(x) returns if any of the boolean is x is True */ +static gmx_inline int +gmx_simd4_ref_anytrue_pb(gmx_simd4_ref_pb a) +{ + int anytrue; + int i; + + anytrue = 0; + for (i = 0; i < GMX_SIMD4_WIDTH; i++) + { + if (a.r[i]) + { + anytrue = 1; + } + } + + return anytrue; +} + +#endif /* _gmx_simd4_ref_h_ */ diff --git a/include/gmx_simd_macros.h b/include/gmx_simd_macros.h index c997b39b51..d487565b9a 100644 --- a/include/gmx_simd_macros.h +++ b/include/gmx_simd_macros.h @@ -121,17 +121,8 @@ #define gmx_and_pb gmx_simd_ref_and_pb #define gmx_or_pb gmx_simd_ref_or_pb -/* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True. - * If this is not present, define GMX_SIMD_IS_TRUE(real x), - * which should return x==True, where True is True as defined in SIMD. - */ -#define GMX_SIMD_HAVE_ANYTRUE -#ifdef GMX_SIMD_HAVE_ANYTRUE +/* Returns a single int (0/1) which tells if any of the 4 booleans is True */ #define gmx_anytrue_pb gmx_simd_ref_anytrue_pb -#else -/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */ -#define gmx_store_pb gmx_simd_ref_store_pb -#endif /* Conversions only used for PME table lookup */ #define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32 @@ -283,7 +274,6 @@ static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }; -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm_movemask_ps #define gmx_cvttpr_epi32 _mm_cvttps_epi32 @@ -356,7 +346,6 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_and_pb _mm_and_pd #define gmx_or_pb _mm_or_pd -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm_movemask_pd #define gmx_cvttpr_epi32 _mm_cvttpd_epi32 @@ -423,7 +412,6 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_and_pb _mm256_and_ps #define gmx_or_pb _mm256_or_ps -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm256_movemask_ps #define gmx_cvttpr_epi32 _mm256_cvttps_epi32 @@ -484,7 +472,6 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_and_pb _mm256_and_pd #define gmx_or_pb _mm256_or_pd -#define GMX_SIMD_HAVE_ANYTRUE #define gmx_anytrue_pb _mm256_movemask_pd #define gmx_cvttpr_epi32 _mm256_cvttpd_epi32 diff --git a/include/gmx_simd_ref.h b/include/gmx_simd_ref.h index 3a3900a821..082132ed5c 100644 --- a/include/gmx_simd_ref.h +++ b/include/gmx_simd_ref.h @@ -377,10 +377,7 @@ gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b) return c; } -/* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True. 
- * If this is not present, define GMX_SIMD_IS_TRUE(real x), - * which should return x==True, where True is True as defined in SIMD. - */ +/* Returns a single int (0/1) which tells if any of the booleans is True */ static gmx_inline int gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a) { @@ -399,18 +396,6 @@ gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a) return anytrue; } -/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */ -static gmx_inline void -gmx_simd_ref_store_pb(real *dest, gmx_simd_ref_pb src) -{ - int i; - - for (i = 0; i < GMX_SIMD_REF_WIDTH; i++) - { - dest[i] = src.r[i]; - } -}; - /* Conversions only used for PME table lookup */ static gmx_inline gmx_simd_ref_epi32 gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a) diff --git a/src/mdlib/nbnxn_internal.h b/src/mdlib/nbnxn_internal.h index 833407e1dc..ae76083faf 100644 --- a/src/mdlib/nbnxn_internal.h +++ b/src/mdlib/nbnxn_internal.h @@ -53,14 +53,21 @@ #include "gmx_simd_macros.h" #endif -#ifdef __cplusplus -extern "C" { + +/* Bounding box calculations are (currently) always in single precision. + * This uses less (cache-)memory and SIMD is faster, at least on x86. + */ +#define GMX_SIMD4_SINGLE +/* Include the 4-wide SIMD macro file */ +#include "gmx_simd4_macros.h" +/* Check if we have 4-wide SIMD macro support */ +#ifdef GMX_HAVE_SIMD4_MACROS +#define NBNXN_SEARCH_BB_SIMD4 #endif -#ifdef GMX_X86_SSE2 -/* Use 4-way SIMD for, always, single precision bounding box calculations */ -#define NBNXN_SEARCH_BB_SSE +#ifdef __cplusplus +extern "C" { #endif @@ -73,9 +80,18 @@ extern "C" { #endif +#ifdef NBNXN_SEARCH_BB_SIMD4 +/* Memory alignment in bytes as required by SIMD aligned loads/stores */ +#define NBNXN_SEARCH_BB_MEM_ALIGN (GMX_SIMD4_WIDTH*sizeof(float)) +#else +/* No alignment required, but set it so we can call the same routines */ +#define NBNXN_SEARCH_BB_MEM_ALIGN 32 +#endif + + /* Pair search box lower and upper corner in x,y,z. - * Store this in 4 iso 3 reals, which is useful with SSE. - * To avoid complicating the code we also use 4 without SSE. + * Store this in 4 iso 3 reals, which is useful with 4-wide SIMD. + * To avoid complicating the code we also use 4 without 4-wide SIMD. */ #define NNBSBB_C 4 /* Pair search box lower and upper bound in z only. */ @@ -122,7 +138,8 @@ typedef struct { int *nsubc; /* The number of sub cells for each super cell */ float *bbcz; /* Bounding boxes in z for the super cells */ nbnxn_bb_t *bb; /* 3D bounding boxes for the sub cells */ - nbnxn_bb_t *bbj; /* 3D j-b.boxes for SSE-double or AVX-single */ + nbnxn_bb_t *bbj; /* 3D j-bounding boxes for the case where * + * the i- and j-cluster sizes are different */ float *pbb; /* 3D b. 
boxes in xxxx format per super cell */ int *flags; /* Flag for the super cells */ int nc_nalloc; /* Allocation size for the pointers above */ @@ -139,16 +156,16 @@ typedef struct { typedef struct nbnxn_x_ci_simd_4xn { /* The i-cluster coordinates for simple search */ - gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0; - gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1; - gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2; - gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3; + gmx_mm_pr ix_S0, iy_S0, iz_S0; + gmx_mm_pr ix_S1, iy_S1, iz_S1; + gmx_mm_pr ix_S2, iy_S2, iz_S2; + gmx_mm_pr ix_S3, iy_S3, iz_S3; } nbnxn_x_ci_simd_4xn_t; typedef struct nbnxn_x_ci_simd_2xnn { /* The i-cluster coordinates for simple search */ - gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0; - gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2; + gmx_mm_pr ix_S0, iy_S0, iz_S0; + gmx_mm_pr ix_S2, iy_S2, iz_S2; } nbnxn_x_ci_simd_2xnn_t; #endif diff --git a/src/mdlib/nbnxn_search.c b/src/mdlib/nbnxn_search.c index 738ffac644..ac23d9bc59 100644 --- a/src/mdlib/nbnxn_search.c +++ b/src/mdlib/nbnxn_search.c @@ -62,29 +62,26 @@ #include "nrnb.h" -#ifdef NBNXN_SEARCH_BB_SSE -/* We use SSE or AVX-128bit for bounding box calculations */ +#ifdef NBNXN_SEARCH_BB_SIMD4 +/* We use 4-wide SIMD for bounding box calculations */ #ifndef GMX_DOUBLE -/* Single precision BBs + coordinates, we can also load coordinates using SSE */ -#define NBNXN_SEARCH_SSE_SINGLE +/* Single precision BBs + coordinates, we can also load coordinates with SIMD */ +#define NBNXN_SEARCH_SIMD4_FLOAT_X_BB #endif -/* Include basic SSE2 stuff */ -#include - -#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8) +#if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8) /* Store bounding boxes with x, y and z coordinates in packs of 4 */ -#define NBNXN_PBB_SSE +#define NBNXN_PBB_SIMD4 #endif -/* The width of SSE/AVX128 with single precision for bounding boxes with GPU. - * Here AVX-256 turns out to be slightly slower than AVX-128. +/* The packed bounding box coordinate stride is always set to 4. + * With AVX we could use 8, but that turns out not to be faster. */ #define STRIDE_PBB 4 #define STRIDE_PBB_2LOG 2 -#endif /* NBNXN_SEARCH_BB_SSE */ +#endif /* NBNXN_SEARCH_BB_SIMD4 */ #ifdef GMX_NBNXN_SIMD @@ -145,7 +142,7 @@ #endif /* GMX_NBNXN_SIMD */ -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */ #define NBNXN_BBXXXX /* Size of bounding box corners quadruplet */ @@ -497,7 +494,7 @@ static int set_grid_size_xy(const nbnxn_search_t nbs, sfree_aligned(grid->bb); /* This snew also zeros the contents, this avoid possible - * floating exceptions in SSE with the unused bb elements. + * floating exceptions in SIMD with the unused bb elements. */ if (grid->bSimple) { @@ -805,19 +802,21 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x, /* Set the "empty" bounding box to the same as the first one, * so we don't need to treat special cases in the rest of the code. 
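
For reference, the packed xxxxyyyyzzzz layout (NBNXN_BBXXXX) interleaves the corners of STRIDE_PBB = 4 sub-cells, so that subc_bb_dist2_simd4_xxxx() can process four sub-cells per 4-wide operation. A sketch of reading one sub-cell back from the layout written by calc_bounding_box_xxxx() below (unpack_pbb_corner is an illustrative helper, not part of the patch):

    static void unpack_pbb_corner(const float *pbb, int s, nbnxn_bb_t *bb)
    {
        /* pbb points at the first of the 6*STRIDE_PBB floats of one quadruplet,
         * s is the sub-cell index within the quadruplet, 0 <= s < STRIDE_PBB.
         */
        bb->lower[BB_X] = pbb[0*STRIDE_PBB + s];
        bb->lower[BB_Y] = pbb[1*STRIDE_PBB + s];
        bb->lower[BB_Z] = pbb[2*STRIDE_PBB + s];
        bb->upper[BB_X] = pbb[3*STRIDE_PBB + s];
        bb->upper[BB_Y] = pbb[4*STRIDE_PBB + s];
        bb->upper[BB_Z] = pbb[5*STRIDE_PBB + s];
    }
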
*/ -#ifdef NBNXN_SEARCH_BB_SSE - _mm_store_ps(&bbj[1].lower[0], _mm_load_ps(&bbj[0].lower[0])); - _mm_store_ps(&bbj[1].upper[0], _mm_load_ps(&bbj[0].upper[0])); +#ifdef NBNXN_SEARCH_BB_SIMD4 + gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_pr(&bbj[0].lower[0])); + gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_pr(&bbj[0].upper[0])); #else bbj[1] = bbj[0]; #endif } -#ifdef NBNXN_SEARCH_BB_SSE - _mm_store_ps(&bb->lower[0], _mm_min_ps(_mm_load_ps(&bbj[0].lower[0]), - _mm_load_ps(&bbj[1].lower[0]))); - _mm_store_ps(&bb->upper[0], _mm_max_ps(_mm_load_ps(&bbj[0].upper[0]), - _mm_load_ps(&bbj[1].upper[0]))); +#ifdef NBNXN_SEARCH_BB_SIMD4 + gmx_simd4_store_pr(&bb->lower[0], + gmx_simd4_min_pr(gmx_simd4_load_pr(&bbj[0].lower[0]), + gmx_simd4_load_pr(&bbj[1].lower[0]))); + gmx_simd4_store_pr(&bb->upper[0], + gmx_simd4_max_pr(gmx_simd4_load_pr(&bbj[0].upper[0]), + gmx_simd4_load_pr(&bbj[1].upper[0]))); #else { int i; @@ -831,7 +830,7 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x, #endif } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 /* Coordinate order xyz, bb order xxxxyyyyzzzz */ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb) @@ -866,38 +865,38 @@ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb) bb[5*STRIDE_PBB] = R2F_U(zh); } -#endif /* NBNXN_SEARCH_BB_SSE */ +#endif /* NBNXN_SEARCH_BB_SIMD4 */ -#ifdef NBNXN_SEARCH_SSE_SINGLE +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB /* Coordinate order xyz?, bb order xyz0 */ -static void calc_bounding_box_sse(int na, const float *x, nbnxn_bb_t *bb) +static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb) { - __m128 bb_0_SSE, bb_1_SSE; - __m128 x_SSE; + gmx_simd4_pr bb_0_S, bb_1_S; + gmx_simd4_pr x_S; int i; - bb_0_SSE = _mm_load_ps(x); - bb_1_SSE = bb_0_SSE; + bb_0_S = gmx_simd4_load_pr(x); + bb_1_S = bb_0_S; for (i = 1; i < na; i++) { - x_SSE = _mm_load_ps(x+i*NNBSBB_C); - bb_0_SSE = _mm_min_ps(bb_0_SSE, x_SSE); - bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE); + x_S = gmx_simd4_load_pr(x+i*NNBSBB_C); + bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S); + bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S); } - _mm_store_ps(&bb->lower[0], bb_0_SSE); - _mm_store_ps(&bb->upper[0], bb_1_SSE); + gmx_simd4_store_pr(&bb->lower[0], bb_0_S); + gmx_simd4_store_pr(&bb->upper[0], bb_1_S); } /* Coordinate order xyz?, bb order xxxxyyyyzzzz */ -static void calc_bounding_box_xxxx_sse(int na, const float *x, - nbnxn_bb_t *bb_work_aligned, - real *bb) +static void calc_bounding_box_xxxx_simd4(int na, const float *x, + nbnxn_bb_t *bb_work_aligned, + real *bb) { - calc_bounding_box_sse(na, x, bb_work_aligned); + calc_bounding_box_simd4(na, x, bb_work_aligned); bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X]; bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y]; @@ -907,7 +906,7 @@ static void calc_bounding_box_xxxx_sse(int na, const float *x, bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z]; } -#endif /* NBNXN_SEARCH_SSE_SINGLE */ +#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */ /* Combines pairs of consecutive bounding boxes */ @@ -923,15 +922,15 @@ static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb) nc2 = (grid->cxy_na[i]+3)>>(2+1); for (c2 = sc2; c2 < sc2+nc2; c2++) { -#ifdef NBNXN_SEARCH_BB_SSE - __m128 min_SSE, max_SSE; - - min_SSE = _mm_min_ps(_mm_load_ps(&bb[c2*2+0].lower[0]), - _mm_load_ps(&bb[c2*2+1].lower[0])); - max_SSE = _mm_max_ps(_mm_load_ps(&bb[c2*2+0].upper[0]), - _mm_load_ps(&bb[c2*2+1].upper[0])); - _mm_store_ps(&grid->bbj[c2].lower[0], min_SSE); - 
_mm_store_ps(&grid->bbj[c2].upper[0], max_SSE); +#ifdef NBNXN_SEARCH_BB_SIMD4 + gmx_simd4_pr min_S, max_S; + + min_S = gmx_simd4_min_pr(gmx_simd4_load_pr(&bb[c2*2+0].lower[0]), + gmx_simd4_load_pr(&bb[c2*2+1].lower[0])); + max_S = gmx_simd4_max_pr(gmx_simd4_load_pr(&bb[c2*2+0].upper[0]), + gmx_simd4_load_pr(&bb[c2*2+1].upper[0])); + gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S); + gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S); #else for (j = 0; j < NNBSBB_C; j++) { @@ -1176,18 +1175,18 @@ void fill_cell(const nbnxn_search_t nbs, else if (!grid->bSimple) { /* Store the bounding boxes in a format convenient - * for SSE calculations: xxxxyyyyzzzz... + * for SIMD4 calculations: xxxxyyyyzzzz... */ pbb_ptr = grid->pbb + ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX + (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1)); -#ifdef NBNXN_SEARCH_SSE_SINGLE +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB if (nbat->XFormat == nbatXYZQ) { - calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride, - bb_work_aligned, pbb_ptr); + calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride, + bb_work_aligned, pbb_ptr); } else #endif @@ -2065,134 +2064,114 @@ static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci, return d2; } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 -/* SSE code for bb distance for bb format xyz0 */ -static float subc_bb_dist2_sse(int si, const nbnxn_bb_t *bb_i_ci, - int csj, const nbnxn_bb_t *bb_j_all) +/* 4-wide SIMD code for bb distance for bb format xyz0 */ +static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci, + int csj, const nbnxn_bb_t *bb_j_all) { - __m128 bb_i_SSE0, bb_i_SSE1; - __m128 bb_j_SSE0, bb_j_SSE1; - __m128 dl_SSE; - __m128 dh_SSE; - __m128 dm_SSE; - __m128 dm0_SSE; - __m128 d2_SSE; -#ifndef GMX_X86_SSE4_1 - float d2_array[7], *d2_align; - - d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15))); -#else - float d2; -#endif - - bb_i_SSE0 = _mm_load_ps(&bb_i_ci[si].lower[0]); - bb_i_SSE1 = _mm_load_ps(&bb_i_ci[si].upper[0]); - bb_j_SSE0 = _mm_load_ps(&bb_j_all[csj].lower[0]); - bb_j_SSE1 = _mm_load_ps(&bb_j_all[csj].upper[0]); - - dl_SSE = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1); - dh_SSE = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1); + gmx_simd4_pr bb_i_S0, bb_i_S1; + gmx_simd4_pr bb_j_S0, bb_j_S1; + gmx_simd4_pr dl_S; + gmx_simd4_pr dh_S; + gmx_simd4_pr dm_S; + gmx_simd4_pr dm0_S; - dm_SSE = _mm_max_ps(dl_SSE, dh_SSE); - dm0_SSE = _mm_max_ps(dm_SSE, _mm_setzero_ps()); -#ifndef GMX_X86_SSE4_1 - d2_SSE = _mm_mul_ps(dm0_SSE, dm0_SSE); + bb_i_S0 = gmx_simd4_load_pr(&bb_i_ci[si].lower[0]); + bb_i_S1 = gmx_simd4_load_pr(&bb_i_ci[si].upper[0]); + bb_j_S0 = gmx_simd4_load_pr(&bb_j_all[csj].lower[0]); + bb_j_S1 = gmx_simd4_load_pr(&bb_j_all[csj].upper[0]); - _mm_store_ps(d2_align, d2_SSE); + dl_S = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1); + dh_S = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1); - return d2_align[0] + d2_align[1] + d2_align[2]; -#else - /* SSE4.1 dot product of components 0,1,2 */ - d2_SSE = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71); - - _mm_store_ss(&d2, d2_SSE); + dm_S = gmx_simd4_max_pr(dl_S, dh_S); + dm0_S = gmx_simd4_max_pr(dm_S, gmx_simd4_setzero_pr()); - return d2; -#endif + return gmx_simd4_dotproduct3(dm0_S, dm0_S); } /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */ -#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \ +#define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \ { \ int shi; \ \ - __m128 dx_0, dy_0, dz_0; \ - __m128 dx_1, dy_1, dz_1; \ + gmx_simd4_pr dx_0, dy_0, dz_0; \ 
+ gmx_simd4_pr dx_1, dy_1, dz_1; \ \ - __m128 mx, my, mz; \ - __m128 m0x, m0y, m0z; \ + gmx_simd4_pr mx, my, mz; \ + gmx_simd4_pr m0x, m0y, m0z; \ \ - __m128 d2x, d2y, d2z; \ - __m128 d2s, d2t; \ + gmx_simd4_pr d2x, d2y, d2z; \ + gmx_simd4_pr d2s, d2t; \ \ shi = si*NNBSBB_D*DIM; \ \ - xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB); \ - yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB); \ - zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB); \ - xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB); \ - yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB); \ - zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB); \ + xi_l = gmx_simd4_load_pr(bb_i+shi+0*STRIDE_PBB); \ + yi_l = gmx_simd4_load_pr(bb_i+shi+1*STRIDE_PBB); \ + zi_l = gmx_simd4_load_pr(bb_i+shi+2*STRIDE_PBB); \ + xi_h = gmx_simd4_load_pr(bb_i+shi+3*STRIDE_PBB); \ + yi_h = gmx_simd4_load_pr(bb_i+shi+4*STRIDE_PBB); \ + zi_h = gmx_simd4_load_pr(bb_i+shi+5*STRIDE_PBB); \ \ - dx_0 = _mm_sub_ps(xi_l, xj_h); \ - dy_0 = _mm_sub_ps(yi_l, yj_h); \ - dz_0 = _mm_sub_ps(zi_l, zj_h); \ + dx_0 = gmx_simd4_sub_pr(xi_l, xj_h); \ + dy_0 = gmx_simd4_sub_pr(yi_l, yj_h); \ + dz_0 = gmx_simd4_sub_pr(zi_l, zj_h); \ \ - dx_1 = _mm_sub_ps(xj_l, xi_h); \ - dy_1 = _mm_sub_ps(yj_l, yi_h); \ - dz_1 = _mm_sub_ps(zj_l, zi_h); \ + dx_1 = gmx_simd4_sub_pr(xj_l, xi_h); \ + dy_1 = gmx_simd4_sub_pr(yj_l, yi_h); \ + dz_1 = gmx_simd4_sub_pr(zj_l, zi_h); \ \ - mx = _mm_max_ps(dx_0, dx_1); \ - my = _mm_max_ps(dy_0, dy_1); \ - mz = _mm_max_ps(dz_0, dz_1); \ + mx = gmx_simd4_max_pr(dx_0, dx_1); \ + my = gmx_simd4_max_pr(dy_0, dy_1); \ + mz = gmx_simd4_max_pr(dz_0, dz_1); \ \ - m0x = _mm_max_ps(mx, zero); \ - m0y = _mm_max_ps(my, zero); \ - m0z = _mm_max_ps(mz, zero); \ + m0x = gmx_simd4_max_pr(mx, zero); \ + m0y = gmx_simd4_max_pr(my, zero); \ + m0z = gmx_simd4_max_pr(mz, zero); \ \ - d2x = _mm_mul_ps(m0x, m0x); \ - d2y = _mm_mul_ps(m0y, m0y); \ - d2z = _mm_mul_ps(m0z, m0z); \ + d2x = gmx_simd4_mul_pr(m0x, m0x); \ + d2y = gmx_simd4_mul_pr(m0y, m0y); \ + d2z = gmx_simd4_mul_pr(m0z, m0z); \ \ - d2s = _mm_add_ps(d2x, d2y); \ - d2t = _mm_add_ps(d2s, d2z); \ + d2s = gmx_simd4_add_pr(d2x, d2y); \ + d2t = gmx_simd4_add_pr(d2s, d2z); \ \ - _mm_store_ps(d2+si, d2t); \ + gmx_simd4_store_pr(d2+si, d2t); \ } -/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */ -static void subc_bb_dist2_sse_xxxx(const float *bb_j, - int nsi, const float *bb_i, - float *d2) +/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */ +static void subc_bb_dist2_simd4_xxxx(const float *bb_j, + int nsi, const float *bb_i, + float *d2) { - __m128 xj_l, yj_l, zj_l; - __m128 xj_h, yj_h, zj_h; - __m128 xi_l, yi_l, zi_l; - __m128 xi_h, yi_h, zi_h; + gmx_simd4_pr xj_l, yj_l, zj_l; + gmx_simd4_pr xj_h, yj_h, zj_h; + gmx_simd4_pr xi_l, yi_l, zi_l; + gmx_simd4_pr xi_h, yi_h, zi_h; - __m128 zero; + gmx_simd4_pr zero; - zero = _mm_setzero_ps(); + zero = gmx_simd4_setzero_pr(); - xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]); - yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]); - zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]); - xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]); - yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]); - zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]); + xj_l = gmx_simd4_set1_pr(bb_j[0*STRIDE_PBB]); + yj_l = gmx_simd4_set1_pr(bb_j[1*STRIDE_PBB]); + zj_l = gmx_simd4_set1_pr(bb_j[2*STRIDE_PBB]); + xj_h = gmx_simd4_set1_pr(bb_j[3*STRIDE_PBB]); + yj_h = gmx_simd4_set1_pr(bb_j[4*STRIDE_PBB]); + zj_h = gmx_simd4_set1_pr(bb_j[5*STRIDE_PBB]); /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB. 
* But as we know the number of iterations is 1 or 2, we unroll manually. */ - SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2); + SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2); if (STRIDE_PBB < nsi) { - SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2); + SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2); } } -#endif /* NBNXN_SEARCH_BB_SSE */ +#endif /* NBNXN_SEARCH_BB_SIMD4 */ /* Plain C function which determines if any atom pair between two cells * is within distance sqrt(rl2). @@ -2226,44 +2205,44 @@ static gmx_bool subc_in_range_x(int na_c, return FALSE; } -#ifdef NBNXN_SEARCH_SSE_SINGLE +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB /* When we make seperate single/double precision SIMD vector operation * include files, this function should be moved there (also using FMA). */ -static inline __m128 -gmx_mm_calc_rsq_ps(__m128 x, __m128 y, __m128 z) +static inline gmx_simd4_pr +gmx_simd4_calc_rsq_pr(gmx_simd4_pr x, gmx_simd4_pr y, gmx_simd4_pr z) { - return _mm_add_ps( _mm_add_ps( _mm_mul_ps(x, x), _mm_mul_ps(y, y) ), _mm_mul_ps(z, z) ); + return gmx_simd4_add_pr( gmx_simd4_add_pr( gmx_simd4_mul_pr(x, x), gmx_simd4_mul_pr(y, y) ), gmx_simd4_mul_pr(z, z) ); } #endif -/* SSE function which determines if any atom pair between two cells, +/* 4-wide SIMD function which determines if any atom pair between two cells, * both with 8 atoms, is within distance sqrt(rl2). - * Not performance critical, so only uses plain SSE. + * Using 8-wide AVX is not faster on Intel Sandy Bridge. */ -static gmx_bool subc_in_range_sse8(int na_c, - int si, const real *x_i, - int csj, int stride, const real *x_j, - real rl2) +static gmx_bool subc_in_range_simd4(int na_c, + int si, const real *x_i, + int csj, int stride, const real *x_j, + real rl2) { -#ifdef NBNXN_SEARCH_SSE_SINGLE - __m128 ix_SSE0, iy_SSE0, iz_SSE0; - __m128 ix_SSE1, iy_SSE1, iz_SSE1; +#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB + gmx_simd4_pr ix_S0, iy_S0, iz_S0; + gmx_simd4_pr ix_S1, iy_S1, iz_S1; - __m128 rc2_SSE; + gmx_simd4_pr rc2_S; - int na_c_sse; + int dim_stride; int j0, j1; - rc2_SSE = _mm_set1_ps(rl2); + rc2_S = gmx_simd4_set1_pr(rl2); - na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB; - ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB); - iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB); - iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB); - ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB); - iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB); - iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB); + dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM; + ix_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+0)*STRIDE_PBB); + iy_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+1)*STRIDE_PBB); + iz_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+2)*STRIDE_PBB); + ix_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+3)*STRIDE_PBB); + iy_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+4)*STRIDE_PBB); + iz_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+5)*STRIDE_PBB); /* We loop from the outer to the inner particles to maximize * the chance that we find a pair in range quickly and return. 
@@ -2272,63 +2251,63 @@ static gmx_bool subc_in_range_sse8(int na_c, j1 = j0 + na_c - 1; while (j0 < j1) { - __m128 jx0_SSE, jy0_SSE, jz0_SSE; - __m128 jx1_SSE, jy1_SSE, jz1_SSE; + gmx_simd4_pr jx0_S, jy0_S, jz0_S; + gmx_simd4_pr jx1_S, jy1_S, jz1_S; - __m128 dx_SSE0, dy_SSE0, dz_SSE0; - __m128 dx_SSE1, dy_SSE1, dz_SSE1; - __m128 dx_SSE2, dy_SSE2, dz_SSE2; - __m128 dx_SSE3, dy_SSE3, dz_SSE3; + gmx_simd4_pr dx_S0, dy_S0, dz_S0; + gmx_simd4_pr dx_S1, dy_S1, dz_S1; + gmx_simd4_pr dx_S2, dy_S2, dz_S2; + gmx_simd4_pr dx_S3, dy_S3, dz_S3; - __m128 rsq_SSE0; - __m128 rsq_SSE1; - __m128 rsq_SSE2; - __m128 rsq_SSE3; + gmx_simd4_pr rsq_S0; + gmx_simd4_pr rsq_S1; + gmx_simd4_pr rsq_S2; + gmx_simd4_pr rsq_S3; - __m128 wco_SSE0; - __m128 wco_SSE1; - __m128 wco_SSE2; - __m128 wco_SSE3; - __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE; + gmx_simd4_pb wco_S0; + gmx_simd4_pb wco_S1; + gmx_simd4_pb wco_S2; + gmx_simd4_pb wco_S3; + gmx_simd4_pb wco_any_S01, wco_any_S23, wco_any_S; - jx0_SSE = _mm_load1_ps(x_j+j0*stride+0); - jy0_SSE = _mm_load1_ps(x_j+j0*stride+1); - jz0_SSE = _mm_load1_ps(x_j+j0*stride+2); + jx0_S = gmx_simd4_set1_pr(x_j[j0*stride+0]); + jy0_S = gmx_simd4_set1_pr(x_j[j0*stride+1]); + jz0_S = gmx_simd4_set1_pr(x_j[j0*stride+2]); - jx1_SSE = _mm_load1_ps(x_j+j1*stride+0); - jy1_SSE = _mm_load1_ps(x_j+j1*stride+1); - jz1_SSE = _mm_load1_ps(x_j+j1*stride+2); + jx1_S = gmx_simd4_set1_pr(x_j[j1*stride+0]); + jy1_S = gmx_simd4_set1_pr(x_j[j1*stride+1]); + jz1_S = gmx_simd4_set1_pr(x_j[j1*stride+2]); /* Calculate distance */ - dx_SSE0 = _mm_sub_ps(ix_SSE0, jx0_SSE); - dy_SSE0 = _mm_sub_ps(iy_SSE0, jy0_SSE); - dz_SSE0 = _mm_sub_ps(iz_SSE0, jz0_SSE); - dx_SSE1 = _mm_sub_ps(ix_SSE1, jx0_SSE); - dy_SSE1 = _mm_sub_ps(iy_SSE1, jy0_SSE); - dz_SSE1 = _mm_sub_ps(iz_SSE1, jz0_SSE); - dx_SSE2 = _mm_sub_ps(ix_SSE0, jx1_SSE); - dy_SSE2 = _mm_sub_ps(iy_SSE0, jy1_SSE); - dz_SSE2 = _mm_sub_ps(iz_SSE0, jz1_SSE); - dx_SSE3 = _mm_sub_ps(ix_SSE1, jx1_SSE); - dy_SSE3 = _mm_sub_ps(iy_SSE1, jy1_SSE); - dz_SSE3 = _mm_sub_ps(iz_SSE1, jz1_SSE); + dx_S0 = gmx_simd4_sub_pr(ix_S0, jx0_S); + dy_S0 = gmx_simd4_sub_pr(iy_S0, jy0_S); + dz_S0 = gmx_simd4_sub_pr(iz_S0, jz0_S); + dx_S1 = gmx_simd4_sub_pr(ix_S1, jx0_S); + dy_S1 = gmx_simd4_sub_pr(iy_S1, jy0_S); + dz_S1 = gmx_simd4_sub_pr(iz_S1, jz0_S); + dx_S2 = gmx_simd4_sub_pr(ix_S0, jx1_S); + dy_S2 = gmx_simd4_sub_pr(iy_S0, jy1_S); + dz_S2 = gmx_simd4_sub_pr(iz_S0, jz1_S); + dx_S3 = gmx_simd4_sub_pr(ix_S1, jx1_S); + dy_S3 = gmx_simd4_sub_pr(iy_S1, jy1_S); + dz_S3 = gmx_simd4_sub_pr(iz_S1, jz1_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3); + rsq_S0 = gmx_simd4_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_simd4_calc_rsq_pr(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_simd4_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_simd4_calc_rsq_pr(dx_S3, dy_S3, dz_S3); - wco_SSE0 = _mm_cmplt_ps(rsq_SSE0, rc2_SSE); - wco_SSE1 = _mm_cmplt_ps(rsq_SSE1, rc2_SSE); - wco_SSE2 = _mm_cmplt_ps(rsq_SSE2, rc2_SSE); - wco_SSE3 = _mm_cmplt_ps(rsq_SSE3, rc2_SSE); + wco_S0 = gmx_simd4_cmplt_pr(rsq_S0, rc2_S); + wco_S1 = gmx_simd4_cmplt_pr(rsq_S1, rc2_S); + wco_S2 = gmx_simd4_cmplt_pr(rsq_S2, rc2_S); + wco_S3 = gmx_simd4_cmplt_pr(rsq_S3, rc2_S); - wco_any_SSE01 = _mm_or_ps(wco_SSE0, wco_SSE1); - wco_any_SSE23 = _mm_or_ps(wco_SSE2, wco_SSE3); - wco_any_SSE = _mm_or_ps(wco_any_SSE01, 
wco_any_SSE23); + wco_any_S01 = gmx_simd4_or_pb(wco_S0, wco_S1); + wco_any_S23 = gmx_simd4_or_pb(wco_S2, wco_S3); + wco_any_S = gmx_simd4_or_pb(wco_any_S01, wco_any_S23); - if (_mm_movemask_ps(wco_any_SSE)) + if (gmx_simd4_anytrue_pb(wco_any_S)) { return TRUE; } @@ -2339,8 +2318,8 @@ static gmx_bool subc_in_range_sse8(int na_c, return FALSE; #else - /* No SSE */ - gmx_incons("SSE function called without SSE support"); + /* No SIMD4 */ + gmx_incons("SIMD4 function called without 4-wide SIMD support"); return TRUE; #endif @@ -2493,22 +2472,22 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl, snew(nbl->work, 1); if (nbl->bSimple) { - snew_aligned(nbl->work->bb_ci, 1, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->bb_ci, 1, NBNXN_SEARCH_BB_MEM_ALIGN); } else { #ifdef NBNXN_BBXXXX - snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_SEARCH_BB_MEM_ALIGN); #else - snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN); #endif } - snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_SEARCH_BB_MEM_ALIGN); #ifdef GMX_NBNXN_SIMD snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN); snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN); #endif - snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN); + snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN); nbl->work->sort = NULL; nbl->work->sort_nalloc = 0; @@ -2901,7 +2880,7 @@ static void make_cluster_list_simple(const nbnxn_grid_t *gridj, #include "nbnxn_search_simd_2xnn.h" #endif -/* Plain C or SSE code for making a pair list of super-cell sci vs scj. +/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj. * Checks bounding box distances and possibly atom pair distances. 
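
In scalar form, the per-pair acceptance test used inside make_cluster_list_supersub() below reads as follows (cluster_pair_accepted is an illustrative helper, not part of the patch; in the code the test is written inline and the atom-pair check is only evaluated when the bounding-box distance allows it):

    static gmx_bool cluster_pair_accepted(float d2, float rbb2, real rl2,
                                          gmx_bool atoms_in_range)
    {
        /* Accept on the bounding-box distance alone when it is below rbb2,
         * otherwise require an actual atom pair within the list cut-off rl2.
         */
        return (d2 < rbb2 || (d2 < rl2 && atoms_in_range));
    }
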
*/ static void make_cluster_list_supersub(const nbnxn_search_t nbs, @@ -2968,8 +2947,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs, } #ifdef NBNXN_BBXXXX - /* Determine all ci1 bb distances in one call with SSE */ - subc_bb_dist2_sse_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)), + /* Determine all ci1 bb distances in one call with SIMD4 */ + subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)), ci1, pbb_ci, d2l); *ndistc += na_c*2; #endif @@ -2999,8 +2978,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs, *ndistc += na_c*na_c; if (d2 < rbb2 || (d2 < rl2 && -#ifdef NBNXN_PBB_SSE - subc_in_range_sse8 +#ifdef NBNXN_PBB_SIMD4 + subc_in_range_simd4 #else subc_in_range_x #endif @@ -3031,8 +3010,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs, { /* Avoid using function pointers here, as it's slower */ if ( -#ifdef NBNXN_PBB_SSE - !subc_in_range_sse8 +#ifdef NBNXN_PBB_SIMD4 + !subc_in_range_simd4 #else !subc_in_range_x #endif @@ -3128,7 +3107,7 @@ static void set_ci_top_excls(const nbnxn_search_t nbs, ndirect++; } } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 else { while (cj_ind_first + ndirect <= cj_ind_last && @@ -3734,13 +3713,13 @@ static void icell_set_x_supersub(int ci, } } -#ifdef NBNXN_SEARCH_BB_SSE +#ifdef NBNXN_SEARCH_BB_SIMD4 /* Copies PBC shifted super-cell packed atom coordinates to working array */ -static void icell_set_x_supersub_sse8(int ci, - real shx, real shy, real shz, - int na_c, - int stride, const real *x, - nbnxn_list_work_t *work) +static void icell_set_x_supersub_simd4(int ci, + real shx, real shy, real shz, + int na_c, + int stride, const real *x, + nbnxn_list_work_t *work) { int si, io, ia, i, j; real *x_ci; @@ -4989,8 +4968,8 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, } else { -#ifdef NBNXN_SEARCH_BB_SSE - nbs->icell_set_x = icell_set_x_supersub_sse8; +#ifdef NBNXN_SEARCH_BB_SIMD4 + nbs->icell_set_x = icell_set_x_supersub_simd4; #else nbs->icell_set_x = icell_set_x_supersub; #endif diff --git a/src/mdlib/nbnxn_search_simd_2xnn.h b/src/mdlib/nbnxn_search_simd_2xnn.h index 3c649df2d5..f611791397 100644 --- a/src/mdlib/nbnxn_search_simd_2xnn.h +++ b/src/mdlib/nbnxn_search_simd_2xnn.h @@ -86,40 +86,14 @@ icell_set_x_simd_2xnn(int ci, ia = X_IND_CI_SIMD_2XNN(ci); - x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx); - x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy); - x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz); - x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx); - x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy); - x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz); + x_ci->ix_S0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx); + x_ci->iy_S0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy); + x_ci->iz_S0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz); + x_ci->ix_S2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx); + x_ci->iy_S2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy); + x_ci->iz_S2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz); } -#ifndef GMX_SIMD_HAVE_ANYTRUE -/* Fallback function in case gmx_anytrue_pr is not present */ -static gmx_inline gmx_bool -gmx_anytrue_2xn_pb(gmx_mm_pb bool_S) -{ - real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools; - gmx_bool any; - int s; - - bools = gmx_simd_align_real(bools_array); - - gmx_store_pb(bools, bool_S); - - any 
= FALSE; - for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++) - { - if (GMX_SIMD_IS_TRUE(s)) - { - any = TRUE; - } - } - - return any; -} -#endif - /* SIMD code for making a pair list of cell ci vs cell cjf-cjl * for coordinates in packed format. * Checks bouding box distances and possibly atom pair distances. @@ -137,19 +111,19 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, const nbnxn_x_ci_simd_2xnn_t *work; const nbnxn_bb_t *bb_ci; - gmx_mm_pr jx_SSE, jy_SSE, jz_SSE; + gmx_mm_pr jx_S, jy_S, jz_S; - gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0; - gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2; + gmx_mm_pr dx_S0, dy_S0, dz_S0; + gmx_mm_pr dx_S2, dy_S2, dz_S2; - gmx_mm_pr rsq_SSE0; - gmx_mm_pr rsq_SSE2; + gmx_mm_pr rsq_S0; + gmx_mm_pr rsq_S2; - gmx_mm_pb wco_SSE0; - gmx_mm_pb wco_SSE2; - gmx_mm_pb wco_any_SSE; + gmx_mm_pb wco_S0; + gmx_mm_pb wco_S2; + gmx_mm_pb wco_any_S; - gmx_mm_pr rc2_SSE; + gmx_mm_pr rc2_S; gmx_bool InRange; float d2; @@ -162,13 +136,13 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, bb_ci = nbl->work->bb_ci; - rc2_SSE = gmx_set1_pr(rl2); + rc2_S = gmx_set1_pr(rl2); InRange = FALSE; while (!InRange && cjf <= cjl) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjf, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj); #endif @@ -187,32 +161,28 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, { xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf); - jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S); - jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S); - jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S); + jx_S = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S); + jy_S = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S); + jz_S = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); - wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2); + wco_any_S = gmx_or_pb(wco_S0, wco_S2); -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_2xn_pb(wco_any_SSE); -#endif + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 2*GMX_SIMD_WIDTH_HERE; } @@ -229,8 +199,8 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, InRange = FALSE; while (!InRange && cjl > cjf) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjl, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj); #endif @@ -249,32 +219,28 @@ make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, { xind_l = 
X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl); - jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S); - jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S); - jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S); + jx_S = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S); + jy_S = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S); + jz_S = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); - wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2); + wco_any_S = gmx_or_pb(wco_S0, wco_S2); -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_2xn_pb(wco_any_SSE); -#endif + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 2*GMX_SIMD_WIDTH_HERE; } diff --git a/src/mdlib/nbnxn_search_simd_4xn.h b/src/mdlib/nbnxn_search_simd_4xn.h index a7a19579bb..9d5c638c43 100644 --- a/src/mdlib/nbnxn_search_simd_4xn.h +++ b/src/mdlib/nbnxn_search_simd_4xn.h @@ -58,46 +58,20 @@ icell_set_x_simd_4xn(int ci, ia = X_IND_CI_SIMD_4XN(ci); - x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); - x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); - x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz); - x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx); - x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy); - x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz); - x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx); - x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy); - x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz); - x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx); - x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy); - x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); + x_ci->ix_S0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); + x_ci->iy_S0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); + x_ci->iz_S0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz); + x_ci->ix_S1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx); + x_ci->iy_S1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy); + x_ci->iz_S1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz); + x_ci->ix_S2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx); + x_ci->iy_S2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy); + x_ci->iz_S2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz); + x_ci->ix_S3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx); + x_ci->iy_S3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy); + x_ci->iz_S3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); } -#ifndef GMX_SIMD_HAVE_ANYTRUE -/* Fallback function in case gmx_anytrue_pr is not present */ -static gmx_inline gmx_bool -gmx_anytrue_4xn_pb(gmx_mm_pb bool_S) -{ - real 
bools_array[2*GMX_SIMD_WIDTH_HERE], *bools; - gmx_bool any; - int s; - - bools = gmx_simd_align_real(bools_array); - - gmx_store_pb(bools, bool_S); - - any = FALSE; - for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++) - { - if (GMX_SIMD_IS_TRUE(bools[s])) - { - any = TRUE; - } - } - - return any; -} -#endif - /* SIMD code for making a pair list of cell ci vs cell cjf-cjl * for coordinates in packed format. * Checks bouding box distances and possibly atom pair distances. @@ -115,25 +89,25 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, const nbnxn_x_ci_simd_4xn_t *work; const nbnxn_bb_t *bb_ci; - gmx_mm_pr jx_SSE, jy_SSE, jz_SSE; + gmx_mm_pr jx_S, jy_S, jz_S; - gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0; - gmx_mm_pr dx_SSE1, dy_SSE1, dz_SSE1; - gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2; - gmx_mm_pr dx_SSE3, dy_SSE3, dz_SSE3; + gmx_mm_pr dx_S0, dy_S0, dz_S0; + gmx_mm_pr dx_S1, dy_S1, dz_S1; + gmx_mm_pr dx_S2, dy_S2, dz_S2; + gmx_mm_pr dx_S3, dy_S3, dz_S3; - gmx_mm_pr rsq_SSE0; - gmx_mm_pr rsq_SSE1; - gmx_mm_pr rsq_SSE2; - gmx_mm_pr rsq_SSE3; + gmx_mm_pr rsq_S0; + gmx_mm_pr rsq_S1; + gmx_mm_pr rsq_S2; + gmx_mm_pr rsq_S3; - gmx_mm_pb wco_SSE0; - gmx_mm_pb wco_SSE1; - gmx_mm_pb wco_SSE2; - gmx_mm_pb wco_SSE3; - gmx_mm_pb wco_any_SSE01, wco_any_SSE23, wco_any_SSE; + gmx_mm_pb wco_S0; + gmx_mm_pb wco_S1; + gmx_mm_pb wco_S2; + gmx_mm_pb wco_S3; + gmx_mm_pb wco_any_S01, wco_any_S23, wco_any_S; - gmx_mm_pr rc2_SSE; + gmx_mm_pr rc2_S; gmx_bool InRange; float d2; @@ -146,13 +120,13 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, bb_ci = nbl->work->bb_ci; - rc2_SSE = gmx_set1_pr(rl2); + rc2_S = gmx_set1_pr(rl2); InRange = FALSE; while (!InRange && cjf <= cjl) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjf, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj); #endif @@ -171,45 +145,41 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, { xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf); - jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S); - jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S); - jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S); + jx_S = gmx_load_pr(x_j+xind_f+0*STRIDE_S); + jy_S = gmx_load_pr(x_j+xind_f+1*STRIDE_S); + jz_S = gmx_load_pr(x_j+xind_f+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE); - dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE); - dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); - dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE); - dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE); - dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S1 = gmx_sub_pr(work->ix_S1, jx_S); + dy_S1 = gmx_sub_pr(work->iy_S1, jy_S); + dz_S1 = gmx_sub_pr(work->iz_S1, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); + dx_S3 = gmx_sub_pr(work->ix_S3, jx_S); + dy_S3 = gmx_sub_pr(work->iy_S3, jy_S); + dz_S3 = gmx_sub_pr(work->iz_S3, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1); - 
rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3); - - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); - wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE); - - wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1); - wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3); - wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23); - -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_4xn_pb(wco_any_SSE); -#endif + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); + wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S); + + wco_any_S01 = gmx_or_pb(wco_S0, wco_S1); + wco_any_S23 = gmx_or_pb(wco_S2, wco_S3); + wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23); + + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 4*GMX_SIMD_WIDTH_HERE; } @@ -226,8 +196,8 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, InRange = FALSE; while (!InRange && cjl > cjf) { -#ifdef NBNXN_SEARCH_BB_SSE - d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj); +#ifdef NBNXN_SEARCH_BB_SIMD4 + d2 = subc_bb_dist2_simd4(0, bb_ci, cjl, gridj->bbj); #else d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj); #endif @@ -246,44 +216,40 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, { xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl); - jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S); - jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S); - jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S); + jx_S = gmx_load_pr(x_j+xind_l+0*STRIDE_S); + jy_S = gmx_load_pr(x_j+xind_l+1*STRIDE_S); + jz_S = gmx_load_pr(x_j+xind_l+2*STRIDE_S); /* Calculate distance */ - dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE); - dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE); - dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE); - dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE); - dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE); - dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE); - dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE); - dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE); - dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE); - dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE); - dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE); - dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE); + dx_S0 = gmx_sub_pr(work->ix_S0, jx_S); + dy_S0 = gmx_sub_pr(work->iy_S0, jy_S); + dz_S0 = gmx_sub_pr(work->iz_S0, jz_S); + dx_S1 = gmx_sub_pr(work->ix_S1, jx_S); + dy_S1 = gmx_sub_pr(work->iy_S1, jy_S); + dz_S1 = gmx_sub_pr(work->iz_S1, jz_S); + dx_S2 = gmx_sub_pr(work->ix_S2, jx_S); + dy_S2 = gmx_sub_pr(work->iy_S2, jy_S); + dz_S2 = gmx_sub_pr(work->iz_S2, jz_S); + dx_S3 = gmx_sub_pr(work->ix_S3, jx_S); + dy_S3 = gmx_sub_pr(work->iy_S3, jy_S); + dz_S3 = gmx_sub_pr(work->iz_S3, jz_S); /* rsq = dx*dx+dy*dy+dz*dz */ - rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0); - rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1); - rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2); - rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3); - - wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE); - wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE); - wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE); - wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE); - - wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1); - wco_any_SSE23 = gmx_or_pb(wco_SSE2, 
wco_SSE3); - wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23); - -#ifdef GMX_SIMD_HAVE_ANYTRUE - InRange = gmx_anytrue_pb(wco_any_SSE); -#else - InRange = gmx_anytrue_4xn_pb(wco_any_SSE); -#endif + rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0); + rsq_S1 = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1); + rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2); + rsq_S3 = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3); + + wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S); + wco_S1 = gmx_cmplt_pr(rsq_S1, rc2_S); + wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S); + wco_S3 = gmx_cmplt_pr(rsq_S3, rc2_S); + + wco_any_S01 = gmx_or_pb(wco_S0, wco_S1); + wco_any_S23 = gmx_or_pb(wco_S2, wco_S3); + wco_any_S = gmx_or_pb(wco_any_S01, wco_any_S23); + + InRange = gmx_anytrue_pb(wco_any_S); *ndistc += 4*GMX_SIMD_WIDTH_HERE; } diff --git a/src/mdlib/pme.c b/src/mdlib/pme.c index c641e46821..3316546344 100644 --- a/src/mdlib/pme.c +++ b/src/mdlib/pme.c @@ -95,23 +95,28 @@ /* Include the SIMD macro file and then check for support */ #include "gmx_simd_macros.h" #if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP -/* Turn on SIMD intrinsics for PME solve */ +/* Turn on arbitrary width SIMD intrinsics for PME solve */ #define PME_SIMD #endif -/* SIMD spread+gather only in single precision with SSE2 or higher available. - * We might want to switch to use gmx_simd_macros.h, but this is somewhat - * complicated, as we use unaligned and/or 4-wide only loads. +/* Include the 4-wide SIMD macro file */ +#include "gmx_simd4_macros.h" +/* Check if we have 4-wide SIMD macro support */ +#ifdef GMX_HAVE_SIMD4_MACROS +/* Do PME spread and gather with 4-wide SIMD. + * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common). */ -#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE) -#define PME_SSE_SPREAD_GATHER -#include -/* Some old AMD processors could have problems with unaligned loads+stores */ -#ifndef GMX_FAHCORE -#define PME_SSE_UNALIGNED +#define PME_SIMD4_SPREAD_GATHER + +#ifdef GMX_SIMD4_HAVE_UNALIGNED +/* With PME-order=4 on x86, unaligned load+store is slightly faster + * than doubling all SIMD operations when using aligned load+store. + */ +#define PME_SIMD4_UNALIGNED #endif #endif + #include "mpelogging.h" #define DFT_TOL 1e-7 @@ -128,7 +133,16 @@ #define mpi_type MPI_FLOAT #endif -/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */ +#ifdef PME_SIMD4_SPREAD_GATHER +#define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real)) +#else +/* We can use any alignment, apart from 0, so we use 4 reals */ +#define SIMD4_ALIGNMENT (4*sizeof(real)) +#endif + +/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size + * to preserve alignment. + */ #define GMX_CACHE_SEP 64 /* We only define a maximum to be able to use local arrays without allocation. 
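The SIMD4_ALIGNMENT value defined above is used later in this patch for heap allocations (snew_aligned(..., SIMD4_ALIGNMENT)), while small over-sized stack buffers such as thz_buffer[12] are aligned with gmx_simd4_align_real before the 4-wide loads. A rough plain-C sketch of that pointer-alignment idiom follows; align_real4 and SIMD4_WIDTH_LOCAL are local to the sketch and only illustrate what such a helper plausibly does, they are not GROMACS API:

#include <stdint.h>
#include <stdio.h>

#define SIMD4_WIDTH_LOCAL 4

/* Round a float pointer up to the next 4-float (16-byte) boundary. */
static float *align_real4(float *p)
{
    uintptr_t mask = (uintptr_t)(SIMD4_WIDTH_LOCAL*sizeof(float)) - 1;

    return (float *)(((uintptr_t)p + mask) & ~mask);
}

int main(void)
{
    /* Over-allocate by one SIMD4 width so the aligned pointer still has
     * room for the data, as the thz_buffer[12] pattern further down does.
     */
    float  buffer[8 + SIMD4_WIDTH_LOCAL];
    float *aligned = align_real4(buffer);

    printf("raw %p -> aligned %p\n", (void *)buffer, (void *)aligned);

    return 0;
}
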
@@ -239,9 +253,9 @@ typedef struct { typedef struct { -#ifdef PME_SSE_SPREAD_GATHER - /* Masks for SSE aligned spreading and gathering */ - __m128 mask_SSE0[6], mask_SSE1[6]; +#ifdef PME_SIMD4_SPREAD_GATHER + /* Masks for 4-wide SIMD aligned spreading and gathering */ + gmx_simd4_pb mask_S0[6], mask_S1[6]; #else int dummy; /* C89 requires that struct has at least one member */ #endif @@ -589,8 +603,9 @@ static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc) srenew(th[XX], nalloc); srenew(th[YY], nalloc); - /* In z we add padding, this is only required for the aligned SSE code */ - srenew(*ptr_z, nalloc+2*padding); + /* In z we add padding, this is only required for the aligned SIMD code */ + sfree_aligned(*ptr_z); + snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT); th[ZZ] = *ptr_z + padding; for (i = 0; i < padding; i++) @@ -1514,6 +1529,12 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, int pnx, pny, pnz, ndatatot; int offx, offy, offz; +#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED + real thz_buffer[12], *thz_aligned; + + thz_aligned = gmx_simd4_align_real(thz_buffer); +#endif + pnx = pmegrid->s[XX]; pny = pmegrid->s[YY]; pnz = pmegrid->s[ZZ]; @@ -1552,23 +1573,23 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, switch (order) { case 4: -#ifdef PME_SSE_SPREAD_GATHER -#ifdef PME_SSE_UNALIGNED -#define PME_SPREAD_SSE_ORDER4 +#ifdef PME_SIMD4_SPREAD_GATHER +#ifdef PME_SIMD4_UNALIGNED +#define PME_SPREAD_SIMD4_ORDER4 #else -#define PME_SPREAD_SSE_ALIGNED +#define PME_SPREAD_SIMD4_ALIGNED #define PME_ORDER 4 #endif -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_BSPLINE(4); #endif break; case 5: -#ifdef PME_SSE_SPREAD_GATHER -#define PME_SPREAD_SSE_ALIGNED +#ifdef PME_SIMD4_SPREAD_GATHER +#define PME_SPREAD_SIMD4_ALIGNED #define PME_ORDER 5 -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_BSPLINE(5); #endif @@ -1583,9 +1604,9 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, static void set_grid_alignment(int *pmegrid_nz, int pme_order) { -#ifdef PME_SSE_SPREAD_GATHER +#ifdef PME_SIMD4_SPREAD_GATHER if (pme_order == 5 -#ifndef PME_SSE_UNALIGNED +#ifndef PME_SIMD4_UNALIGNED || pme_order == 4 #endif ) @@ -1598,8 +1619,8 @@ static void set_grid_alignment(int *pmegrid_nz, int pme_order) static void set_gridsize_alignment(int *gridsize, int pme_order) { -#ifdef PME_SSE_SPREAD_GATHER -#ifndef PME_SSE_UNALIGNED +#ifdef PME_SIMD4_SPREAD_GATHER +#ifndef PME_SIMD4_UNALIGNED if (pme_order == 4) { /* Add extra elements to ensured aligned operations do not go @@ -1650,7 +1671,7 @@ static void pmegrid_init(pmegrid_t *grid, { gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ]; set_gridsize_alignment(&gridsize, pme_order); - snew_aligned(grid->grid, gridsize, 16); + snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT); } else { @@ -1769,7 +1790,7 @@ static void pmegrids_init(pmegrids_t *grids, set_gridsize_alignment(&gridsize, pme_order); snew_aligned(grids->grid_all, grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP, - 16); + SIMD4_ALIGNMENT); for (x = 0; x < grids->nc[XX]; x++) { @@ -1868,6 +1889,8 @@ static void pmegrids_destroy(pmegrids_t *grids) static void realloc_work(pme_work_t *work, int nkx) { + int simd_width; + if (nkx > work->nalloc) { work->nalloc = nkx; @@ -1879,17 +1902,17 @@ static void realloc_work(pme_work_t *work, int nkx) * elements at the end for padding. 
*/ #ifdef PME_SIMD -#define ALIGN_HERE GMX_SIMD_WIDTH_HERE + simd_width = GMX_SIMD_WIDTH_HERE; #else -/* We can use any alignment, apart from 0, so we use 4 */ -#define ALIGN_HERE 4 + /* We can use any alignment, apart from 0, so we use 4 */ + simd_width = 4; #endif sfree_aligned(work->denom); sfree_aligned(work->tmp1); sfree_aligned(work->eterm); - snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); - snew_aligned(work->tmp1, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); - snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); + snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real)); + snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real)); + snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real)); srenew(work->m2inv, work->nalloc); } } @@ -1918,7 +1941,7 @@ inline static void calc_exponentials(int start, int end, real f, real *d_aligned gmx_mm_pr lu; gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e; int kx; - f_simd = gmx_load1_pr(&f); + f_simd = gmx_set1_pr(f); for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE) { tmp_d1 = gmx_load_pr(d_aligned+kx); @@ -2267,6 +2290,14 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid, pme_spline_work_t *work; +#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED + real thz_buffer[12], *thz_aligned; + real dthz_buffer[12], *dthz_aligned; + + thz_aligned = gmx_simd4_align_real(thz_buffer); + dthz_aligned = gmx_simd4_align_real(dthz_buffer); +#endif + work = pme->spline_work; order = pme->pme_order; @@ -2324,23 +2355,23 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid, switch (order) { case 4: -#ifdef PME_SSE_SPREAD_GATHER -#ifdef PME_SSE_UNALIGNED -#define PME_GATHER_F_SSE_ORDER4 +#ifdef PME_SIMD4_SPREAD_GATHER +#ifdef PME_SIMD4_UNALIGNED +#define PME_GATHER_F_SIMD4_ORDER4 #else -#define PME_GATHER_F_SSE_ALIGNED +#define PME_GATHER_F_SIMD4_ALIGNED #define PME_ORDER 4 #endif -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_FSPLINE(4); #endif break; case 5: -#ifdef PME_SSE_SPREAD_GATHER -#define PME_GATHER_F_SSE_ALIGNED +#ifdef PME_SIMD4_SPREAD_GATHER +#define PME_GATHER_F_SIMD4_ALIGNED #define PME_ORDER 5 -#include "pme_sse_single.h" +#include "pme_simd4.h" #else DO_FSPLINE(5); #endif @@ -3014,29 +3045,32 @@ static pme_spline_work_t *make_pme_spline_work(int order) { pme_spline_work_t *work; -#ifdef PME_SSE_SPREAD_GATHER - float tmp[8]; - __m128 zero_SSE; - int of, i; +#ifdef PME_SIMD4_SPREAD_GATHER + real tmp[12], *tmp_aligned; + gmx_simd4_pr zero_S; + gmx_simd4_pr real_mask_S0, real_mask_S1; + int of, i; + + snew_aligned(work, 1, SIMD4_ALIGNMENT); - snew_aligned(work, 1, 16); + tmp_aligned = gmx_simd4_align_real(tmp); - zero_SSE = _mm_setzero_ps(); + zero_S = gmx_simd4_setzero_pr(); /* Generate bit masks to mask out the unused grid entries, * as we only operate on order of the 8 grid entries that are - * load into 2 SSE float registers. + * load into 2 SIMD registers. */ for (of = 0; of < 8-(order-1); of++) { for (i = 0; i < 8; i++) { - tmp[i] = (i >= of && i < of+order ? 1 : 0); + tmp_aligned[i] = (i >= of && i < of+order ? 
-1.0 : 1.0); } - work->mask_SSE0[of] = _mm_loadu_ps(tmp); - work->mask_SSE1[of] = _mm_loadu_ps(tmp+4); - work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE); - work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE); + real_mask_S0 = gmx_simd4_load_pr(tmp_aligned); + real_mask_S1 = gmx_simd4_load_pr(tmp_aligned+4); + work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S); + work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S); } #else work = NULL; diff --git a/src/mdlib/pme_simd4.h b/src/mdlib/pme_simd4.h new file mode 100644 index 0000000000..c66f8e99f6 --- /dev/null +++ b/src/mdlib/pme_simd4.h @@ -0,0 +1,356 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * Copyright (c) 2012,2013, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/* This include file has code between ifdef's to make sure + * that this performance sensitive code is inlined + * and to remove conditionals and variable loop bounds at compile time. + */ + +#ifdef PME_SPREAD_SIMD4_ORDER4 +/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store. + * This code does not assume any memory alignment for the grid. 
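+ * Each pass of the ithx loop below updates a 4x4 block of grid points in
+ * the y,z-plane: for each of the 4 y-rows, the z-line at k0..k0+3 is
+ * loaded unaligned, incremented by qn*thx[ithx]*thy[j]*thz (using a fused
+ * multiply-add where the architecture provides one) and stored back
+ * unaligned.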
+ */ +{ + gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3; + gmx_simd4_pr tz_S; + gmx_simd4_pr vx_S; + gmx_simd4_pr vx_tz_S; + gmx_simd4_pr sum_S0, sum_S1, sum_S2, sum_S3; + gmx_simd4_pr gri_S0, gri_S1, gri_S2, gri_S3; + + ty_S0 = gmx_simd4_set1_pr(thy[0]); + ty_S1 = gmx_simd4_set1_pr(thy[1]); + ty_S2 = gmx_simd4_set1_pr(thy[2]); + ty_S3 = gmx_simd4_set1_pr(thy[3]); + + /* With order 4 the z-spline is actually aligned */ + tz_S = gmx_simd4_load_pr(thz); + + for (ithx = 0; (ithx < 4); ithx++) + { + index_x = (i0+ithx)*pny*pnz; + valx = qn*thx[ithx]; + + vx_S = gmx_simd4_set1_pr(valx); + + vx_tz_S = gmx_simd4_mul_pr(vx_S, tz_S); + + gri_S0 = gmx_simd4_loadu_pr(grid+index_x+(j0+0)*pnz+k0); + gri_S1 = gmx_simd4_loadu_pr(grid+index_x+(j0+1)*pnz+k0); + gri_S2 = gmx_simd4_loadu_pr(grid+index_x+(j0+2)*pnz+k0); + gri_S3 = gmx_simd4_loadu_pr(grid+index_x+(j0+3)*pnz+k0); + + sum_S0 = gmx_simd4_madd_pr(vx_tz_S, ty_S0, gri_S0); + sum_S1 = gmx_simd4_madd_pr(vx_tz_S, ty_S1, gri_S1); + sum_S2 = gmx_simd4_madd_pr(vx_tz_S, ty_S2, gri_S2); + sum_S3 = gmx_simd4_madd_pr(vx_tz_S, ty_S3, gri_S3); + + gmx_simd4_storeu_pr(grid+index_x+(j0+0)*pnz+k0, sum_S0); + gmx_simd4_storeu_pr(grid+index_x+(j0+1)*pnz+k0, sum_S1); + gmx_simd4_storeu_pr(grid+index_x+(j0+2)*pnz+k0, sum_S2); + gmx_simd4_storeu_pr(grid+index_x+(j0+3)*pnz+k0, sum_S3); + } +} +#undef PME_SPREAD_SIMD4_ORDER4 +#endif + + +#ifdef PME_GATHER_F_SIMD4_ORDER4 +/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store. + * This code does not assume any memory alignment for the grid. + */ +{ + real fx_tmp[4], fy_tmp[4], fz_tmp[4]; + + gmx_simd4_pr fx_S, fy_S, fz_S; + + gmx_simd4_pr tx_S, ty_S, tz_S; + gmx_simd4_pr dx_S, dy_S, dz_S; + + gmx_simd4_pr gval_S; + + gmx_simd4_pr fxy1_S; + gmx_simd4_pr fz1_S; + + fx_S = gmx_simd4_setzero_pr(); + fy_S = gmx_simd4_setzero_pr(); + fz_S = gmx_simd4_setzero_pr(); + + /* With order 4 the z-spline is actually aligned */ + tz_S = gmx_simd4_load_pr(thz); + dz_S = gmx_simd4_load_pr(dthz); + + for (ithx = 0; (ithx < 4); ithx++) + { + index_x = (i0+ithx)*pny*pnz; + tx_S = gmx_simd4_set1_pr(thx[ithx]); + dx_S = gmx_simd4_set1_pr(dthx[ithx]); + + for (ithy = 0; (ithy < 4); ithy++) + { + index_xy = index_x+(j0+ithy)*pnz; + ty_S = gmx_simd4_set1_pr(thy[ithy]); + dy_S = gmx_simd4_set1_pr(dthy[ithy]); + + gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0); + + fxy1_S = gmx_simd4_mul_pr(tz_S, gval_S); + fz1_S = gmx_simd4_mul_pr(dz_S, gval_S); + + fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S); + fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S); + fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S); + } + } + + gmx_simd4_storeu_pr(fx_tmp, fx_S); + gmx_simd4_storeu_pr(fy_tmp, fy_S); + gmx_simd4_storeu_pr(fz_tmp, fz_S); + + fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; + fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; + fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; +} +#undef PME_GATHER_F_SIMD4_ORDER4 +#endif + + +#ifdef PME_SPREAD_SIMD4_ALIGNED +/* This code assumes that the grid is allocated 4-real aligned + * and that pnz is a multiple of 4. + * This code supports pme_order <= 5. 
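+ * The start index k0 is rounded down to a multiple of 4 (offset = k0 & 3),
+ * so each y-row of the grid is accessed with two aligned 4-wide loads and
+ * stores. The thz coefficients are shifted into lanes offset..offset+order-1
+ * and the remaining lanes are zeroed with the precomputed masks, so the
+ * extra grid entries touched by the aligned accesses are written back
+ * unchanged.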
+ */ +{ + int offset; + int index; + gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4; + gmx_simd4_pr tz_S0; + gmx_simd4_pr tz_S1; + gmx_simd4_pr vx_S; + gmx_simd4_pr vx_tz_S0; + gmx_simd4_pr vx_tz_S1; + gmx_simd4_pr sum_S00, sum_S01, sum_S02, sum_S03, sum_S04; + gmx_simd4_pr sum_S10, sum_S11, sum_S12, sum_S13, sum_S14; + gmx_simd4_pr gri_S00, gri_S01, gri_S02, gri_S03, gri_S04; + gmx_simd4_pr gri_S10, gri_S11, gri_S12, gri_S13, gri_S14; + + offset = k0 & 3; + + ty_S0 = gmx_simd4_set1_pr(thy[0]); + ty_S1 = gmx_simd4_set1_pr(thy[1]); + ty_S2 = gmx_simd4_set1_pr(thy[2]); + ty_S3 = gmx_simd4_set1_pr(thy[3]); +#if PME_ORDER == 5 + ty_S4 = gmx_simd4_set1_pr(thy[4]); +#endif + +#ifdef GMX_SIMD4_HAVE_UNALIGNED + tz_S0 = gmx_simd4_loadu_pr(thz-offset); + tz_S1 = gmx_simd4_loadu_pr(thz-offset+4); +#else + { + int i; + /* Copy thz to an aligned buffer (unused buffer parts are masked) */ + for (i = 0; i < PME_ORDER; i++) + { + thz_aligned[offset+i] = thz[i]; + } + tz_S0 = gmx_simd4_load_pr(thz_aligned); + tz_S1 = gmx_simd4_load_pr(thz_aligned+4); + } +#endif + tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]); + tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]); + + for (ithx = 0; (ithx < PME_ORDER); ithx++) + { + index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset; + valx = qn*thx[ithx]; + + vx_S = gmx_simd4_set1_pr(valx); + + vx_tz_S0 = gmx_simd4_mul_pr(vx_S, tz_S0); + vx_tz_S1 = gmx_simd4_mul_pr(vx_S, tz_S1); + + gri_S00 = gmx_simd4_load_pr(grid+index+0*pnz); + gri_S01 = gmx_simd4_load_pr(grid+index+1*pnz); + gri_S02 = gmx_simd4_load_pr(grid+index+2*pnz); + gri_S03 = gmx_simd4_load_pr(grid+index+3*pnz); +#if PME_ORDER == 5 + gri_S04 = gmx_simd4_load_pr(grid+index+4*pnz); +#endif + gri_S10 = gmx_simd4_load_pr(grid+index+0*pnz+4); + gri_S11 = gmx_simd4_load_pr(grid+index+1*pnz+4); + gri_S12 = gmx_simd4_load_pr(grid+index+2*pnz+4); + gri_S13 = gmx_simd4_load_pr(grid+index+3*pnz+4); +#if PME_ORDER == 5 + gri_S14 = gmx_simd4_load_pr(grid+index+4*pnz+4); +#endif + + sum_S00 = gmx_simd4_madd_pr(vx_tz_S0, ty_S0, gri_S00); + sum_S01 = gmx_simd4_madd_pr(vx_tz_S0, ty_S1, gri_S01); + sum_S02 = gmx_simd4_madd_pr(vx_tz_S0, ty_S2, gri_S02); + sum_S03 = gmx_simd4_madd_pr(vx_tz_S0, ty_S3, gri_S03); +#if PME_ORDER == 5 + sum_S04 = gmx_simd4_madd_pr(vx_tz_S0, ty_S4, gri_S04); +#endif + sum_S10 = gmx_simd4_madd_pr(vx_tz_S1, ty_S0, gri_S10); + sum_S11 = gmx_simd4_madd_pr(vx_tz_S1, ty_S1, gri_S11); + sum_S12 = gmx_simd4_madd_pr(vx_tz_S1, ty_S2, gri_S12); + sum_S13 = gmx_simd4_madd_pr(vx_tz_S1, ty_S3, gri_S13); +#if PME_ORDER == 5 + sum_S14 = gmx_simd4_madd_pr(vx_tz_S1, ty_S4, gri_S14); +#endif + + gmx_simd4_store_pr(grid+index+0*pnz, sum_S00); + gmx_simd4_store_pr(grid+index+1*pnz, sum_S01); + gmx_simd4_store_pr(grid+index+2*pnz, sum_S02); + gmx_simd4_store_pr(grid+index+3*pnz, sum_S03); +#if PME_ORDER == 5 + gmx_simd4_store_pr(grid+index+4*pnz, sum_S04); +#endif + gmx_simd4_store_pr(grid+index+0*pnz+4, sum_S10); + gmx_simd4_store_pr(grid+index+1*pnz+4, sum_S11); + gmx_simd4_store_pr(grid+index+2*pnz+4, sum_S12); + gmx_simd4_store_pr(grid+index+3*pnz+4, sum_S13); +#if PME_ORDER == 5 + gmx_simd4_store_pr(grid+index+4*pnz+4, sum_S14); +#endif + } +} +#undef PME_ORDER +#undef PME_SPREAD_SIMD4_ALIGNED +#endif + + +#ifdef PME_GATHER_F_SIMD4_ALIGNED +/* This code assumes that the grid is allocated 4-real aligned + * and that pnz is a multiple of 4. + * This code supports pme_order <= 5. 
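+ * The same offset and masking scheme as in the aligned spread above is
+ * used here: two aligned 4-wide loads per y-row cover the order grid
+ * points along z, with thz and dthz shifted and masked. The loop
+ * accumulates fx += dthx*thy*thz*grid, fy += thx*dthy*thz*grid and
+ * fz += thx*thy*dthz*grid over the stencil and reduces the four lanes
+ * to scalars at the end.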
+ */ +{ + int offset; + + real fx_tmp[4], fy_tmp[4], fz_tmp[4]; + + gmx_simd4_pr fx_S, fy_S, fz_S; + + gmx_simd4_pr tx_S, ty_S, tz_S0, tz_S1; + gmx_simd4_pr dx_S, dy_S, dz_S0, dz_S1; + + gmx_simd4_pr gval_S0; + gmx_simd4_pr gval_S1; + + gmx_simd4_pr fxy1_S0; + gmx_simd4_pr fz1_S0; + gmx_simd4_pr fxy1_S1; + gmx_simd4_pr fz1_S1; + gmx_simd4_pr fxy1_S; + gmx_simd4_pr fz1_S; + + offset = k0 & 3; + + fx_S = gmx_simd4_setzero_pr(); + fy_S = gmx_simd4_setzero_pr(); + fz_S = gmx_simd4_setzero_pr(); + +#ifdef GMX_SIMD4_HAVE_UNALIGNED + tz_S0 = gmx_simd4_loadu_pr(thz-offset); + tz_S1 = gmx_simd4_loadu_pr(thz-offset+4); + dz_S0 = gmx_simd4_loadu_pr(dthz-offset); + dz_S1 = gmx_simd4_loadu_pr(dthz-offset+4); +#else + { + int i; + /* Copy (d)thz to an aligned buffer (unused buffer parts are masked) */ + for (i = 0; i < PME_ORDER; i++) + { + thz_aligned[offset+i] = thz[i]; + dthz_aligned[offset+i] = dthz[i]; + } + tz_S0 = gmx_simd4_load_pr(thz_aligned); + tz_S1 = gmx_simd4_load_pr(thz_aligned+4); + dz_S0 = gmx_simd4_load_pr(dthz_aligned); + dz_S1 = gmx_simd4_load_pr(dthz_aligned+4); + } +#endif + tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]); + dz_S0 = gmx_simd4_blendzero_pr(dz_S0, work->mask_S0[offset]); + tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]); + dz_S1 = gmx_simd4_blendzero_pr(dz_S1, work->mask_S1[offset]); + + for (ithx = 0; (ithx < PME_ORDER); ithx++) + { + index_x = (i0+ithx)*pny*pnz; + tx_S = gmx_simd4_set1_pr(thx[ithx]); + dx_S = gmx_simd4_set1_pr(dthx[ithx]); + + for (ithy = 0; (ithy < PME_ORDER); ithy++) + { + index_xy = index_x+(j0+ithy)*pnz; + ty_S = gmx_simd4_set1_pr(thy[ithy]); + dy_S = gmx_simd4_set1_pr(dthy[ithy]); + + gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset); + gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4); + + fxy1_S0 = gmx_simd4_mul_pr(tz_S0, gval_S0); + fz1_S0 = gmx_simd4_mul_pr(dz_S0, gval_S0); + fxy1_S1 = gmx_simd4_mul_pr(tz_S1, gval_S1); + fz1_S1 = gmx_simd4_mul_pr(dz_S1, gval_S1); + + fxy1_S = gmx_simd4_add_pr(fxy1_S0, fxy1_S1); + fz1_S = gmx_simd4_add_pr(fz1_S0, fz1_S1); + + fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S); + fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S); + fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S); + } + } + + gmx_simd4_store_pr(fx_tmp, fx_S); + gmx_simd4_store_pr(fy_tmp, fy_S); + gmx_simd4_store_pr(fz_tmp, fz_S); + + fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; + fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; + fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; +} +#undef PME_ORDER +#undef PME_GATHER_F_SIMD4_ALIGNED +#endif diff --git a/src/mdlib/pme_sse_single.h b/src/mdlib/pme_sse_single.h deleted file mode 100644 index ae03f54f46..0000000000 --- a/src/mdlib/pme_sse_single.h +++ /dev/null @@ -1,325 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2004, The GROMACS development team, - * check out http://www.gromacs.org for more information. - * Copyright (c) 2012,2013, by the GROMACS development team, led by - * David van der Spoel, Berk Hess, Erik Lindahl, and including many - * others, as listed in the AUTHORS file in the top-level source - * directory and at http://www.gromacs.org. 
- * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ - -/* This include file has code between ifdef's to make sure - * that this performance sensitive code is inlined - * and to remove conditionals and variable loop bounds at compile time. - */ - -#ifdef PME_SPREAD_SSE_ORDER4 -/* This code does not assume any memory alignment. - * This code only works for pme_order = 4. - */ -{ - __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3; - __m128 tz_SSE; - __m128 vx_SSE; - __m128 vx_tz_SSE; - __m128 sum_SSE0, sum_SSE1, sum_SSE2, sum_SSE3; - __m128 gri_SSE0, gri_SSE1, gri_SSE2, gri_SSE3; - - ty_SSE0 = _mm_load1_ps(&thy[0]); - ty_SSE1 = _mm_load1_ps(&thy[1]); - ty_SSE2 = _mm_load1_ps(&thy[2]); - ty_SSE3 = _mm_load1_ps(&thy[3]); - - tz_SSE = _mm_loadu_ps(thz); - - for (ithx = 0; (ithx < 4); ithx++) - { - index_x = (i0+ithx)*pny*pnz; - valx = qn*thx[ithx]; - - vx_SSE = _mm_load1_ps(&valx); - - vx_tz_SSE = _mm_mul_ps(vx_SSE, tz_SSE); - - gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0); - gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0); - gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0); - gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0); - - sum_SSE0 = _mm_add_ps(gri_SSE0, _mm_mul_ps(vx_tz_SSE, ty_SSE0)); - sum_SSE1 = _mm_add_ps(gri_SSE1, _mm_mul_ps(vx_tz_SSE, ty_SSE1)); - sum_SSE2 = _mm_add_ps(gri_SSE2, _mm_mul_ps(vx_tz_SSE, ty_SSE2)); - sum_SSE3 = _mm_add_ps(gri_SSE3, _mm_mul_ps(vx_tz_SSE, ty_SSE3)); - - _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0, sum_SSE0); - _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0, sum_SSE1); - _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0, sum_SSE2); - _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0, sum_SSE3); - } -} -#undef PME_SPREAD_SSE_ORDER4 -#endif - - -#ifdef PME_GATHER_F_SSE_ORDER4 -/* This code does not assume any memory alignment. - * This code only works for pme_order = 4. 
- */ -{ - float fx_tmp[4], fy_tmp[4], fz_tmp[4]; - - __m128 fx_SSE, fy_SSE, fz_SSE; - - __m128 tx_SSE, ty_SSE, tz_SSE; - __m128 dx_SSE, dy_SSE, dz_SSE; - - __m128 gval_SSE; - - __m128 fxy1_SSE; - __m128 fz1_SSE; - - fx_SSE = _mm_setzero_ps(); - fy_SSE = _mm_setzero_ps(); - fz_SSE = _mm_setzero_ps(); - - tz_SSE = _mm_loadu_ps(thz); - dz_SSE = _mm_loadu_ps(dthz); - - for (ithx = 0; (ithx < 4); ithx++) - { - index_x = (i0+ithx)*pny*pnz; - tx_SSE = _mm_load1_ps(thx+ithx); - dx_SSE = _mm_load1_ps(dthx+ithx); - - for (ithy = 0; (ithy < 4); ithy++) - { - index_xy = index_x+(j0+ithy)*pnz; - ty_SSE = _mm_load1_ps(thy+ithy); - dy_SSE = _mm_load1_ps(dthy+ithy); - - gval_SSE = _mm_loadu_ps(grid+index_xy+k0); - - fxy1_SSE = _mm_mul_ps(tz_SSE, gval_SSE); - fz1_SSE = _mm_mul_ps(dz_SSE, gval_SSE); - - fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE)); - fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE)); - fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE)); - } - } - - _mm_storeu_ps(fx_tmp, fx_SSE); - _mm_storeu_ps(fy_tmp, fy_SSE); - _mm_storeu_ps(fz_tmp, fz_SSE); - - fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; - fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; - fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; -} -#undef PME_GATHER_F_SSE_ORDER4 -#endif - - -#ifdef PME_SPREAD_SSE_ALIGNED -/* This code assumes that the grid is allocated 16 bit aligned - * and that pnz is a multiple of 4. - * This code supports pme_order <= 5. - */ -{ - int offset; - int index; - __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3, ty_SSE4; - __m128 tz_SSE0; - __m128 tz_SSE1; - __m128 vx_SSE; - __m128 vx_tz_SSE0; - __m128 vx_tz_SSE1; - __m128 sum_SSE00, sum_SSE01, sum_SSE02, sum_SSE03, sum_SSE04; - __m128 sum_SSE10, sum_SSE11, sum_SSE12, sum_SSE13, sum_SSE14; - __m128 gri_SSE00, gri_SSE01, gri_SSE02, gri_SSE03, gri_SSE04; - __m128 gri_SSE10, gri_SSE11, gri_SSE12, gri_SSE13, gri_SSE14; - - offset = k0 & 3; - - ty_SSE0 = _mm_load1_ps(&thy[0]); - ty_SSE1 = _mm_load1_ps(&thy[1]); - ty_SSE2 = _mm_load1_ps(&thy[2]); - ty_SSE3 = _mm_load1_ps(&thy[3]); -#if PME_ORDER == 5 - ty_SSE4 = _mm_load1_ps(&thy[4]); -#endif - - tz_SSE0 = _mm_loadu_ps(thz-offset); - tz_SSE1 = _mm_loadu_ps(thz-offset+4); - tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]); - tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]); - - for (ithx = 0; (ithx < PME_ORDER); ithx++) - { - index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset; - valx = qn*thx[ithx]; - - vx_SSE = _mm_load1_ps(&valx); - - vx_tz_SSE0 = _mm_mul_ps(vx_SSE, tz_SSE0); - vx_tz_SSE1 = _mm_mul_ps(vx_SSE, tz_SSE1); - - gri_SSE00 = _mm_load_ps(grid+index+0*pnz); - gri_SSE01 = _mm_load_ps(grid+index+1*pnz); - gri_SSE02 = _mm_load_ps(grid+index+2*pnz); - gri_SSE03 = _mm_load_ps(grid+index+3*pnz); -#if PME_ORDER == 5 - gri_SSE04 = _mm_load_ps(grid+index+4*pnz); -#endif - gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4); - gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4); - gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4); - gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4); -#if PME_ORDER == 5 - gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4); -#endif - - sum_SSE00 = _mm_add_ps(gri_SSE00, _mm_mul_ps(vx_tz_SSE0, ty_SSE0)); - sum_SSE01 = _mm_add_ps(gri_SSE01, _mm_mul_ps(vx_tz_SSE0, ty_SSE1)); - sum_SSE02 = _mm_add_ps(gri_SSE02, _mm_mul_ps(vx_tz_SSE0, ty_SSE2)); - sum_SSE03 = _mm_add_ps(gri_SSE03, _mm_mul_ps(vx_tz_SSE0, ty_SSE3)); -#if PME_ORDER == 5 - sum_SSE04 = _mm_add_ps(gri_SSE04, _mm_mul_ps(vx_tz_SSE0, ty_SSE4)); -#endif - sum_SSE10 = 
_mm_add_ps(gri_SSE10, _mm_mul_ps(vx_tz_SSE1, ty_SSE0)); - sum_SSE11 = _mm_add_ps(gri_SSE11, _mm_mul_ps(vx_tz_SSE1, ty_SSE1)); - sum_SSE12 = _mm_add_ps(gri_SSE12, _mm_mul_ps(vx_tz_SSE1, ty_SSE2)); - sum_SSE13 = _mm_add_ps(gri_SSE13, _mm_mul_ps(vx_tz_SSE1, ty_SSE3)); -#if PME_ORDER == 5 - sum_SSE14 = _mm_add_ps(gri_SSE14, _mm_mul_ps(vx_tz_SSE1, ty_SSE4)); -#endif - - _mm_store_ps(grid+index+0*pnz, sum_SSE00); - _mm_store_ps(grid+index+1*pnz, sum_SSE01); - _mm_store_ps(grid+index+2*pnz, sum_SSE02); - _mm_store_ps(grid+index+3*pnz, sum_SSE03); -#if PME_ORDER == 5 - _mm_store_ps(grid+index+4*pnz, sum_SSE04); -#endif - _mm_store_ps(grid+index+0*pnz+4, sum_SSE10); - _mm_store_ps(grid+index+1*pnz+4, sum_SSE11); - _mm_store_ps(grid+index+2*pnz+4, sum_SSE12); - _mm_store_ps(grid+index+3*pnz+4, sum_SSE13); -#if PME_ORDER == 5 - _mm_store_ps(grid+index+4*pnz+4, sum_SSE14); -#endif - } -} -#undef PME_ORDER -#undef PME_SPREAD_SSE_ALIGNED -#endif - - -#ifdef PME_GATHER_F_SSE_ALIGNED -/* This code assumes that the grid is allocated 16 bit aligned - * and that pnz is a multiple of 4. - * This code supports pme_order <= 5. - */ -{ - int offset; - - float fx_tmp[4], fy_tmp[4], fz_tmp[4]; - - __m128 fx_SSE, fy_SSE, fz_SSE; - - __m128 tx_SSE, ty_SSE, tz_SSE0, tz_SSE1; - __m128 dx_SSE, dy_SSE, dz_SSE0, dz_SSE1; - - __m128 gval_SSE0; - __m128 gval_SSE1; - - __m128 fxy1_SSE0; - __m128 fz1_SSE0; - __m128 fxy1_SSE1; - __m128 fz1_SSE1; - __m128 fxy1_SSE; - __m128 fz1_SSE; - - offset = k0 & 3; - - fx_SSE = _mm_setzero_ps(); - fy_SSE = _mm_setzero_ps(); - fz_SSE = _mm_setzero_ps(); - - tz_SSE0 = _mm_loadu_ps(thz-offset); - dz_SSE0 = _mm_loadu_ps(dthz-offset); - tz_SSE1 = _mm_loadu_ps(thz-offset+4); - dz_SSE1 = _mm_loadu_ps(dthz-offset+4); - tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]); - dz_SSE0 = _mm_and_ps(dz_SSE0, work->mask_SSE0[offset]); - tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]); - dz_SSE1 = _mm_and_ps(dz_SSE1, work->mask_SSE1[offset]); - - for (ithx = 0; (ithx < PME_ORDER); ithx++) - { - index_x = (i0+ithx)*pny*pnz; - tx_SSE = _mm_load1_ps(thx+ithx); - dx_SSE = _mm_load1_ps(dthx+ithx); - - for (ithy = 0; (ithy < PME_ORDER); ithy++) - { - index_xy = index_x+(j0+ithy)*pnz; - ty_SSE = _mm_load1_ps(thy+ithy); - dy_SSE = _mm_load1_ps(dthy+ithy); - - gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset); - gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4); - - fxy1_SSE0 = _mm_mul_ps(tz_SSE0, gval_SSE0); - fz1_SSE0 = _mm_mul_ps(dz_SSE0, gval_SSE0); - fxy1_SSE1 = _mm_mul_ps(tz_SSE1, gval_SSE1); - fz1_SSE1 = _mm_mul_ps(dz_SSE1, gval_SSE1); - - fxy1_SSE = _mm_add_ps(fxy1_SSE0, fxy1_SSE1); - fz1_SSE = _mm_add_ps(fz1_SSE0, fz1_SSE1); - - fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE)); - fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE)); - fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE)); - } - } - - _mm_store_ps(fx_tmp, fx_SSE); - _mm_store_ps(fy_tmp, fy_SSE); - _mm_store_ps(fz_tmp, fz_SSE); - - fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3]; - fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3]; - fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3]; -} -#undef PME_ORDER -#undef PME_GATHER_F_SSE_ALIGNED -#endif -- 2.11.4.GIT
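
A small stand-alone C illustration of the offset and masking bookkeeping used by the aligned SIMD4 spread/gather paths above (offset = k0 & 3, spline coefficients shifted so that masked lanes line up with 4-aligned grid addresses). Everything in it is example-local scalar code with arbitrary numbers, not GROMACS API:

#include <stdio.h>

#define SIMD4_WIDTH 4

int main(void)
{
    const int order = 5;                        /* pme_order, 4 or 5 in the SIMD4 paths */
    const int k0    = 6;                        /* example starting z index             */
    int       offset = k0 & 3;                  /* distance of k0 from a 4-aligned index */
    double    thz[5] = {0.1, 0.2, 0.4, 0.2, 0.1};  /* z B-spline coefficients           */
    double    tz[2*SIMD4_WIDTH] = {0};          /* two 4-wide registers worth of lanes  */
    int       i;

    /* Shift the coefficients so lane (offset+i) corresponds to grid index
     * (k0 - offset) + offset + i = k0 + i; the other lanes stay zero, which
     * is what the mask_S0/mask_S1 blendzero operations achieve in the patch.
     */
    for (i = 0; i < order; i++)
    {
        tz[offset + i] = thz[i];
    }

    /* The SIMD4 code then performs two aligned 4-wide accesses per y-row,
     * starting at grid index k0 - offset, and the zero lanes leave the
     * corresponding grid entries unchanged.
     */
    for (i = 0; i < 2*SIMD4_WIDTH; i++)
    {
        printf("lane %d -> grid z index %d, weight %g\n",
               i, k0 - offset + i, tz[i]);
    }

    return 0;
}

With order 5 and the worst-case offset of 3, the shifted coefficients still fit in the 8 lanes, which is why two 4-wide registers per y-row are always sufficient in the aligned code paths.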