From 868b62c12b8aa5c68a82d7ec154e8d215864446f Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Mon, 11 Dec 2017 03:07:51 -0800 Subject: [PATCH] Support Simd4N for SimdRealWidth<4 If the SIMD with is smaller 4 but Simd4N is supported then use Simd4 for Simd4N. Also move Traits into internal namespace to signify that they are not intended for usage outside of the simd module. Fixes #2327 Change-Id: I3d49c57cebc5d565df442d01e322c89312771699 --- docs/doxygen/lib/simd.md | 21 ++- src/gromacs/ewald/pme-gather.cpp | 30 ++-- src/gromacs/simd/simd.h | 155 +++++++++++++++------ src/gromacs/simd/tests/simd_floatingpoint_util.cpp | 19 +-- src/gromacs/simd/tests/simd_memory.cpp | 4 +- 5 files changed, 163 insertions(+), 66 deletions(-) diff --git a/docs/doxygen/lib/simd.md b/docs/doxygen/lib/simd.md index 66f717d732..e597d1479c 100644 --- a/docs/doxygen/lib/simd.md +++ b/docs/doxygen/lib/simd.md @@ -374,7 +374,26 @@ Unfortunately reality is not that simple. Some algorithms like lattice summation need quartets of elements, so even when the SIMD width is >4 we need width-4 SIMD if it is supported. The availability of SIMD4 is indicated by \ref GMX_SIMD4_HAVE_FLOAT and \ref GMX_SIMD4_HAVE_DOUBLE. For now we only -support a small subset of SIMD operations for SIMD4. +support a small subset of SIMD operations for SIMD4. Because SIMD4 doesn't +scale with increasingly large SIMD width it should be avoided for all new +code and SIMD4N should be used instead. + +SIMD4N implementation +--------------------- + +Some code, like lattice summation, has inner loops which are smaller +than the full SIMD width. In GROMACS algorithms 3 and 4 iterations are common +because of PME order and three dimensions. This makes 4 an important special +case. Vectorizing such loops efficiently requires to collapse the two +most inner loops and using e.g. one 8-wide SIMD vector for 2 outer +and 4 inner iterations or one 16-wide SIMD vector for 4 outer and 4 inner +iterations. For this SIMD4N functions are +provided. The availability of these function is indicated by +\ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT and +\ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE. +These functions return the type alias Simd4NFloat / Simd4NDouble which is +either the normal SIMD type or the SIMD4 type and thus only supports +the operations the SIMD4 type supports. Predefined SIMD preprocessor macros =================================== diff --git a/src/gromacs/ewald/pme-gather.cpp b/src/gromacs/ewald/pme-gather.cpp index d94c6fab84..2c9e961fee 100644 --- a/src/gromacs/ewald/pme-gather.cpp +++ b/src/gromacs/ewald/pme-gather.cpp @@ -148,32 +148,32 @@ struct do_fspline const real *const gmx_restrict dthy = spline->dtheta[YY] + norder; const real *const gmx_restrict dthz = spline->dtheta[ZZ] + norder; - SimdReal fx_S = setZero(); - SimdReal fy_S = setZero(); - SimdReal fz_S = setZero(); + Simd4NReal fx_S = setZero(); + Simd4NReal fy_S = setZero(); + Simd4NReal fz_S = setZero(); /* With order 4 the z-spline is actually aligned */ - const SimdReal tz_S = load4DuplicateN(thz); - const SimdReal dz_S = load4DuplicateN(dthz); + const Simd4NReal tz_S = load4DuplicateN(thz); + const Simd4NReal dz_S = load4DuplicateN(dthz); for (int ithx = 0; ithx < 4; ithx++) { - const int index_x = (idxX + ithx)*gridNY*gridNZ; - const SimdReal tx_S = SimdReal(thx[ithx]); - const SimdReal dx_S = SimdReal(dthx[ithx]); + const int index_x = (idxX + ithx)*gridNY*gridNZ; + const Simd4NReal tx_S = Simd4NReal(thx[ithx]); + const Simd4NReal dx_S = Simd4NReal(dthx[ithx]); - for (int ithy = 0; ithy < 4; ithy += GMX_SIMD_REAL_WIDTH/4) + for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH/4) { - const int index_xy = index_x + (idxY+ithy)*gridNZ; + const int index_xy = index_x + (idxY+ithy)*gridNZ; - const SimdReal ty_S = loadUNDuplicate4(thy +ithy); - const SimdReal dy_S = loadUNDuplicate4(dthy+ithy); + const Simd4NReal ty_S = loadUNDuplicate4(thy +ithy); + const Simd4NReal dy_S = loadUNDuplicate4(dthy+ithy); - const SimdReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ); + const Simd4NReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ); - const SimdReal fxy1_S = tz_S * gval_S; - const SimdReal fz1_S = dz_S * gval_S; + const Simd4NReal fxy1_S = tz_S * gval_S; + const Simd4NReal fz1_S = dz_S * gval_S; fx_S = fma(dx_S * ty_S, fxy1_S, fx_S); fy_S = fma(tx_S * dy_S, fxy1_S, fy_S); diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h index 45d4baf6da..673cfa3369 100644 --- a/src/gromacs/simd/simd.h +++ b/src/gromacs/simd/simd.h @@ -402,7 +402,17 @@ typedef Simd4FBool Simd4Bool; * \{ */ -/*! \libinternal \brief Simd traits */ +namespace internal +{ +/*! \libinternal \brief Simd traits + * + * These traits are used to query data about SIMD types. Currently provided + * data is useful for SIMD loads (load function and helper classes for + * ArrayRef<> in simd_memory.h). Provided data: + * - type: scalar type corresponding to the SIMD type + * - width: SIMD width + * - tag: tag used for type dispatch of load function + */ template struct SimdTraits {}; @@ -450,6 +460,7 @@ struct SimdTraits static constexpr int width = SimdTraits::width; using tag = typename SimdTraits::tag; }; +} //namespace internal /*! \brief Load function that returns SIMD or scalar * @@ -459,9 +470,9 @@ struct SimdTraits */ template static inline T -load(const typename SimdTraits::type *m) //disabled by SFINAE for non-SIMD types +load(const typename internal::SimdTraits::type *m) //disabled by SFINAE for non-SIMD types { - return simdLoad(m, typename SimdTraits::tag()); + return simdLoad(m, typename internal::SimdTraits::tag()); } template @@ -478,9 +489,9 @@ load(const typename std::enable_if::value, T>::type *m) template static inline T gmx_simdcall -load(const AlignedArray::type, N> &m) +load(const AlignedArray::type, N> &m) { - return simdLoad(m.data(), typename SimdTraits::tag()); + return simdLoad(m.data(), typename internal::SimdTraits::tag()); } /*! \brief Load function that returns SIMD or scalar based on template argument @@ -491,9 +502,9 @@ load(const AlignedArray::type, N> &m) */ template static inline T -loadU(const typename SimdTraits::type *m) +loadU(const typename internal::SimdTraits::type *m) { - return simdLoadU(m, typename SimdTraits::tag()); + return simdLoadU(m, typename internal::SimdTraits::tag()); } template @@ -505,9 +516,9 @@ loadU(const typename std::enable_if::value, T>::type *m) template static inline T gmx_simdcall -loadU(const AlignedArray::type, N> &m) +loadU(const AlignedArray::type, N> &m) { - return simdLoadU(m.data(), typename SimdTraits::tag()); + return simdLoadU(m.data(), typename internal::SimdTraits::tag()); } class SimdSetZeroProxyInternal; @@ -572,89 +583,145 @@ setZero() return {}; } +namespace internal +{ +//TODO: Don't foward function but properly rename them and use proper traits +template +struct Simd4Traits {}; + +#if GMX_SIMD4_HAVE_FLOAT +template<> +struct Simd4Traits +{ + using type = float; +}; +#endif + +#if GMX_SIMD4_HAVE_DOUBLE +template<> +struct Simd4Traits +{ + using type = double; +}; +#endif +} //namespace internal + +#if GMX_SIMD4_HAVE_REAL +template +T load(const typename internal::Simd4Traits::type* m) +{ + return load4(m); +} +template +T loadU(const typename internal::Simd4Traits::type* m) +{ + return load4U(m); +} +#endif + /* Implement most of 4xn functions by forwarding them to other functions when possible. * The functions forwarded here don't need to be implemented by each implementation. * For width=4 all functions are forwarded and for width=8 all but loadU4NOffset are forwarded. */ #if GMX_SIMD_HAVE_FLOAT -#if GMX_SIMD_FLOAT_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU -#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0 -#elif GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU -#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1 +#if GMX_SIMD_FLOAT_WIDTH < 4 +#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_FLOAT) +#elif GMX_SIMD_FLOAT_WIDTH == 4 +#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT GMX_SIMD_HAVE_LOADU //For GMX_SIMD_FLOAT_WIDTH>4 it is the reponsibility of the implementation to set //GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT #endif -#if GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU -static inline SimdFloat gmx_simdcall +#if GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT +#if GMX_SIMD_FLOAT_WIDTH < 4 +using Simd4NFloat = Simd4Float; +#define GMX_SIMD4N_FLOAT_WIDTH 4 +#else +using Simd4NFloat = SimdFloat; +#define GMX_SIMD4N_FLOAT_WIDTH GMX_SIMD_FLOAT_WIDTH +#endif + +#if GMX_SIMD_FLOAT_WIDTH <= 4 +static inline Simd4NFloat gmx_simdcall loadUNDuplicate4(const float* f) { - return SimdFloat(*f); + return Simd4NFloat(*f); } -static inline SimdFloat gmx_simdcall +static inline Simd4NFloat gmx_simdcall load4DuplicateN(const float* f) { - return load(f); + return load(f); } -static inline SimdFloat gmx_simdcall +static inline Simd4NFloat gmx_simdcall loadU4NOffset(const float* f, int) { - return loadU(f); + return loadU(f); } -#elif GMX_SIMD_FLOAT_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT && GMX_SIMD_HAVE_LOADU -static inline SimdFloat gmx_simdcall +#elif GMX_SIMD_FLOAT_WIDTH == 8 +static inline Simd4NFloat gmx_simdcall loadUNDuplicate4(const float* f) { return loadU1DualHsimd(f); } -static inline SimdFloat gmx_simdcall +static inline Simd4NFloat gmx_simdcall load4DuplicateN(const float* f) { return loadDuplicateHsimd(f); } #endif -#else //GMX_SIMD_HAVE_FLOAT +#endif //GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT +#else //GMX_SIMD_HAVE_FLOAT #define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0 #endif #if GMX_SIMD_HAVE_DOUBLE -#if GMX_SIMD_DOUBLE_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU -#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0 -#elif GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU -#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1 +#if GMX_SIMD_DOUBLE_WIDTH < 4 +#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_DOUBLE) +#elif GMX_SIMD_DOUBLE_WIDTH == 4 +#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE GMX_SIMD_HAVE_LOADU //For GMX_SIMD_DOUBLE_WIDTH>4 it is the reponsibility of the implementation to set //GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE #endif -#if GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU -static inline SimdDouble gmx_simdcall +#if GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE +#if GMX_SIMD_DOUBLE_WIDTH < 4 +using Simd4NDouble = Simd4Double; +#define GMX_SIMD4N_DOUBLE_WIDTH 4 +#else +using Simd4NDouble = SimdDouble; +#define GMX_SIMD4N_DOUBLE_WIDTH GMX_SIMD_DOUBLE_WIDTH +#endif + +#if GMX_SIMD_DOUBLE_WIDTH <= 4 +static inline Simd4NDouble gmx_simdcall loadUNDuplicate4(const double* f) { - return SimdDouble(*f); + return Simd4NDouble(*f); } -static inline SimdDouble gmx_simdcall +static inline Simd4NDouble gmx_simdcall load4DuplicateN(const double* f) { - return load(f); + return load(f); } -static inline SimdDouble gmx_simdcall +static inline Simd4NDouble gmx_simdcall loadU4NOffset(const double* f, int) { - return loadU(f); + return loadU(f); } -#elif GMX_SIMD_DOUBLE_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE && GMX_SIMD_HAVE_LOADU -static inline SimdDouble gmx_simdcall +#elif GMX_SIMD_DOUBLE_WIDTH == 8 +static inline Simd4NDouble gmx_simdcall loadUNDuplicate4(const double* f) { return loadU1DualHsimd(f); } -static inline SimdDouble gmx_simdcall +static inline Simd4NDouble gmx_simdcall load4DuplicateN(const double* f) { return loadDuplicateHsimd(f); } #endif -#else //GMX_SIMD_HAVE_DOUBLE +#endif //GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE +#else //GMX_SIMD_HAVE_DOUBLE #define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0 #endif @@ -664,6 +731,16 @@ load4DuplicateN(const double* f) #define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT #endif +#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL +#if GMX_DOUBLE +using Simd4NReal = Simd4NDouble; +#define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_DOUBLE_WIDTH +#else +using Simd4NReal = Simd4NFloat; +#define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_FLOAT_WIDTH +#endif +#endif + //! \} end of name-group proxy objects } // namespace gmx diff --git a/src/gromacs/simd/tests/simd_floatingpoint_util.cpp b/src/gromacs/simd/tests/simd_floatingpoint_util.cpp index 78338938a1..a43dfc5d7f 100644 --- a/src/gromacs/simd/tests/simd_floatingpoint_util.cpp +++ b/src/gromacs/simd/tests/simd_floatingpoint_util.cpp @@ -919,11 +919,12 @@ TEST_F(SimdFloatingpointUtilTest, reduceIncr4SumHsimd) #endif // GMX_SIMD_HAVE_HSIMD_UTIL_REAL -#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL +//Test Currently doesn't work for GMX_SIMD_REAL_WIDTH<4. Should be fixed by having GMX_EXPECT_SIMD_REAL_EQ which works for both Simd and Simd4 +#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL && GMX_SIMD_REAL_WIDTH >= 4 TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4) { - SimdReal v0, v1; + Simd4NReal v0, v1; int i; real data[GMX_SIMD_REAL_WIDTH/4]; std::iota(data, data+GMX_SIMD_REAL_WIDTH/4, 1); @@ -936,7 +937,7 @@ TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4) val0_[i*4] = val0_[i*4+1] = val0_[i*4+2] = val0_[i*4+3] = data[i]; } - v0 = load(val0_); + v0 = load(val0_); v1 = loadUNDuplicate4(data); GMX_EXPECT_SIMD_REAL_EQ(v0, v1); @@ -944,9 +945,9 @@ TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4) TEST_F(SimdFloatingpointUtilTest, load4DuplicateN) { - SimdReal v0, v1; - int i; - real data[4] = { 1, 2, 3, 4}; + Simd4NReal v0, v1; + int i; + real data[4] = { 1, 2, 3, 4}; for (i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++) { @@ -956,7 +957,7 @@ TEST_F(SimdFloatingpointUtilTest, load4DuplicateN) val0_[i*4+3] = data[3]; } - v0 = load(val0_); + v0 = load(val0_); v1 = load4DuplicateN(val0_); GMX_EXPECT_SIMD_REAL_EQ(v0, v1); @@ -977,8 +978,8 @@ TEST_F(SimdFloatingpointUtilTest, loadU4NOffset) val0_[i*4+3] = data[3+offset*i]; } - const SimdReal v0 = load(val0_); - const SimdReal v1 = loadU4NOffset(data, offset); + const Simd4NReal v0 = load(val0_); + const Simd4NReal v1 = loadU4NOffset(data, offset); GMX_EXPECT_SIMD_REAL_EQ(v0, v1); } diff --git a/src/gromacs/simd/tests/simd_memory.cpp b/src/gromacs/simd/tests/simd_memory.cpp index 55b1893adf..24aab65dbc 100644 --- a/src/gromacs/simd/tests/simd_memory.cpp +++ b/src/gromacs/simd/tests/simd_memory.cpp @@ -79,8 +79,8 @@ class ArrayRefTest : public test::SimdTest using ArrayRefType = TypeParam; using PointerType = typename ArrayRefType::pointer; using ValueType = typename ArrayRefType::value_type; - using ElementType = typename std::remove_const::type>::type; - static constexpr int width = SimdTraits::width; + using ElementType = typename std::remove_const::type>::type; + static constexpr int width = gmx::internal::SimdTraits::width; /*! \brief Run the same tests all the time * -- 2.11.4.GIT