From 868b62c12b8aa5c68a82d7ec154e8d215864446f Mon Sep 17 00:00:00 2001
From: Roland Schulz <roland.schulz@intel.com>
Date: Mon, 11 Dec 2017 03:07:51 -0800
Subject: [PATCH] Support Simd4N for SimdRealWidth<4

If the SIMD with is smaller 4 but Simd4N is supported
then use Simd4 for Simd4N.

Also move Traits into internal namespace to signify that they
are not intended for usage outside of the simd module.

Fixes #2327

Change-Id: I3d49c57cebc5d565df442d01e322c89312771699
---
 docs/doxygen/lib/simd.md                           |  21 ++-
 src/gromacs/ewald/pme-gather.cpp                   |  30 ++--
 src/gromacs/simd/simd.h                            | 155 +++++++++++++++------
 src/gromacs/simd/tests/simd_floatingpoint_util.cpp |  19 +--
 src/gromacs/simd/tests/simd_memory.cpp             |   4 +-
 5 files changed, 163 insertions(+), 66 deletions(-)

diff --git a/docs/doxygen/lib/simd.md b/docs/doxygen/lib/simd.md
index 66f717d732..e597d1479c 100644
--- a/docs/doxygen/lib/simd.md
+++ b/docs/doxygen/lib/simd.md
@@ -374,7 +374,26 @@ Unfortunately reality is not that simple. Some algorithms like lattice
 summation need quartets of elements, so even when the SIMD width is >4 we
 need width-4 SIMD if it is supported. The availability of SIMD4 is indicated
 by \ref GMX_SIMD4_HAVE_FLOAT and \ref GMX_SIMD4_HAVE_DOUBLE. For now we only
-support a small subset of SIMD operations for SIMD4.
+support a small subset of SIMD operations for SIMD4. Because SIMD4 doesn't
+scale with increasingly large SIMD width it should be avoided for all new
+code and SIMD4N should be used instead.
+
+SIMD4N implementation
+---------------------
+
+Some code, like lattice summation, has inner loops which are smaller
+than the full SIMD width. In GROMACS algorithms 3 and 4 iterations are common
+because of PME order and three dimensions. This makes 4 an important special
+case. Vectorizing such loops efficiently requires to collapse the two
+most inner loops and using e.g. one 8-wide SIMD vector for 2 outer
+and 4 inner iterations or one 16-wide SIMD vector for 4 outer and 4 inner
+iterations. For this SIMD4N functions are
+provided. The availability of these function is indicated by
+\ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT and
+\ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE.
+These functions return the type alias Simd4NFloat / Simd4NDouble which is
+either the normal SIMD type or the SIMD4 type and thus only supports
+the operations the SIMD4 type supports.
 
 Predefined SIMD preprocessor macros
 ===================================
diff --git a/src/gromacs/ewald/pme-gather.cpp b/src/gromacs/ewald/pme-gather.cpp
index d94c6fab84..2c9e961fee 100644
--- a/src/gromacs/ewald/pme-gather.cpp
+++ b/src/gromacs/ewald/pme-gather.cpp
@@ -148,32 +148,32 @@ struct do_fspline
         const real *const gmx_restrict dthy = spline->dtheta[YY] + norder;
         const real *const gmx_restrict dthz = spline->dtheta[ZZ] + norder;
 
-        SimdReal                       fx_S = setZero();
-        SimdReal                       fy_S = setZero();
-        SimdReal                       fz_S = setZero();
+        Simd4NReal                     fx_S = setZero();
+        Simd4NReal                     fy_S = setZero();
+        Simd4NReal                     fz_S = setZero();
 
         /* With order 4 the z-spline is actually aligned */
-        const SimdReal tz_S = load4DuplicateN(thz);
-        const SimdReal dz_S = load4DuplicateN(dthz);
+        const Simd4NReal tz_S = load4DuplicateN(thz);
+        const Simd4NReal dz_S = load4DuplicateN(dthz);
 
         for (int ithx = 0; ithx < 4; ithx++)
         {
-            const int      index_x = (idxX + ithx)*gridNY*gridNZ;
-            const SimdReal tx_S    = SimdReal(thx[ithx]);
-            const SimdReal dx_S    = SimdReal(dthx[ithx]);
+            const int        index_x = (idxX + ithx)*gridNY*gridNZ;
+            const Simd4NReal tx_S    = Simd4NReal(thx[ithx]);
+            const Simd4NReal dx_S    = Simd4NReal(dthx[ithx]);
 
-            for (int ithy = 0; ithy < 4; ithy += GMX_SIMD_REAL_WIDTH/4)
+            for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH/4)
             {
-                const int      index_xy = index_x + (idxY+ithy)*gridNZ;
+                const int        index_xy = index_x + (idxY+ithy)*gridNZ;
 
-                const SimdReal ty_S = loadUNDuplicate4(thy +ithy);
-                const SimdReal dy_S = loadUNDuplicate4(dthy+ithy);
+                const Simd4NReal ty_S = loadUNDuplicate4(thy +ithy);
+                const Simd4NReal dy_S = loadUNDuplicate4(dthy+ithy);
 
-                const SimdReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ);
+                const Simd4NReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ);
 
 
-                const SimdReal fxy1_S = tz_S * gval_S;
-                const SimdReal fz1_S  = dz_S * gval_S;
+                const Simd4NReal fxy1_S = tz_S * gval_S;
+                const Simd4NReal fz1_S  = dz_S * gval_S;
 
                 fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
                 fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h
index 45d4baf6da..673cfa3369 100644
--- a/src/gromacs/simd/simd.h
+++ b/src/gromacs/simd/simd.h
@@ -402,7 +402,17 @@ typedef Simd4FBool                Simd4Bool;
  * \{
  */
 
-/*! \libinternal \brief Simd traits */
+namespace internal
+{
+/*! \libinternal \brief Simd traits
+ *
+ * These traits are used to query data about SIMD types. Currently provided
+ * data is useful for SIMD loads (load function and helper classes for
+ * ArrayRef<> in simd_memory.h). Provided data:
+ *  - type: scalar type corresponding to the SIMD type
+ *  - width: SIMD width
+ *  - tag: tag used for type dispatch of load function
+ */
 template<typename T>
 struct SimdTraits {};
 
@@ -450,6 +460,7 @@ struct SimdTraits<const T>
     static constexpr int width = SimdTraits<T>::width;
     using tag = typename SimdTraits<T>::tag;
 };
+}   //namespace internal
 
 /*! \brief Load function that returns SIMD or scalar
  *
@@ -459,9 +470,9 @@ struct SimdTraits<const T>
  */
 template<typename T>
 static inline T
-load(const typename SimdTraits<T>::type *m) //disabled by SFINAE for non-SIMD types
+load(const typename internal::SimdTraits<T>::type *m) //disabled by SFINAE for non-SIMD types
 {
-    return simdLoad(m, typename SimdTraits<T>::tag());
+    return simdLoad(m, typename internal::SimdTraits<T>::tag());
 }
 
 template<typename T>
@@ -478,9 +489,9 @@ load(const typename std::enable_if<std::is_arithmetic<T>::value, T>::type *m)
 
 template <typename T, size_t N>
 static inline T gmx_simdcall
-load(const AlignedArray<typename SimdTraits<T>::type, N> &m)
+load(const AlignedArray<typename internal::SimdTraits<T>::type, N> &m)
 {
-    return simdLoad(m.data(), typename SimdTraits<T>::tag());
+    return simdLoad(m.data(), typename internal::SimdTraits<T>::tag());
 }
 
 /*! \brief Load function that returns SIMD or scalar based on template argument
@@ -491,9 +502,9 @@ load(const AlignedArray<typename SimdTraits<T>::type, N> &m)
  */
 template<typename T>
 static inline T
-loadU(const typename SimdTraits<T>::type *m)
+loadU(const typename internal::SimdTraits<T>::type *m)
 {
-    return simdLoadU(m, typename SimdTraits<T>::tag());
+    return simdLoadU(m, typename internal::SimdTraits<T>::tag());
 }
 
 template<typename T>
@@ -505,9 +516,9 @@ loadU(const typename std::enable_if<std::is_arithmetic<T>::value, T>::type *m)
 
 template <typename T, size_t N>
 static inline T gmx_simdcall
-loadU(const AlignedArray<typename SimdTraits<T>::type, N> &m)
+loadU(const AlignedArray<typename internal::SimdTraits<T>::type, N> &m)
 {
-    return simdLoadU(m.data(), typename SimdTraits<T>::tag());
+    return simdLoadU(m.data(), typename internal::SimdTraits<T>::tag());
 }
 
 class SimdSetZeroProxyInternal;
@@ -572,89 +583,145 @@ setZero()
     return {};
 }
 
+namespace internal
+{
+//TODO: Don't foward function but properly rename them and use proper traits
+template<typename T>
+struct Simd4Traits {};
+
+#if GMX_SIMD4_HAVE_FLOAT
+template<>
+struct Simd4Traits<Simd4Float>
+{
+    using type = float;
+};
+#endif
+
+#if GMX_SIMD4_HAVE_DOUBLE
+template<>
+struct Simd4Traits<Simd4Double>
+{
+    using type = double;
+};
+#endif
+}   //namespace internal
+
+#if GMX_SIMD4_HAVE_REAL
+template<typename T>
+T load(const typename internal::Simd4Traits<T>::type* m)
+{
+    return load4(m);
+}
+template<typename T>
+T loadU(const typename internal::Simd4Traits<T>::type* m)
+{
+    return load4U(m);
+}
+#endif
+
 /* Implement most of 4xn functions by forwarding them to other functions when possible.
  * The functions forwarded here don't need to be implemented by each implementation.
  * For width=4 all functions are forwarded and for width=8 all but loadU4NOffset are forwarded.
  */
 #if GMX_SIMD_HAVE_FLOAT
-#if GMX_SIMD_FLOAT_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0
-#elif GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
+#if GMX_SIMD_FLOAT_WIDTH < 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_FLOAT)
+#elif GMX_SIMD_FLOAT_WIDTH == 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT GMX_SIMD_HAVE_LOADU
 //For GMX_SIMD_FLOAT_WIDTH>4 it is the reponsibility of the implementation to set
 //GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
 #endif
 
-#if GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-static inline SimdFloat gmx_simdcall
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
+#if GMX_SIMD_FLOAT_WIDTH < 4
+using Simd4NFloat = Simd4Float;
+#define GMX_SIMD4N_FLOAT_WIDTH 4
+#else
+using Simd4NFloat = SimdFloat;
+#define GMX_SIMD4N_FLOAT_WIDTH GMX_SIMD_FLOAT_WIDTH
+#endif
+
+#if GMX_SIMD_FLOAT_WIDTH <= 4
+static inline Simd4NFloat gmx_simdcall
 loadUNDuplicate4(const float* f)
 {
-    return SimdFloat(*f);
+    return Simd4NFloat(*f);
 }
-static inline SimdFloat gmx_simdcall
+static inline Simd4NFloat gmx_simdcall
 load4DuplicateN(const float* f)
 {
-    return load<SimdFloat>(f);
+    return load<Simd4NFloat>(f);
 }
-static inline SimdFloat gmx_simdcall
+static inline Simd4NFloat gmx_simdcall
 loadU4NOffset(const float* f, int)
 {
-    return loadU<SimdFloat>(f);
+    return loadU<Simd4NFloat>(f);
 }
-#elif GMX_SIMD_FLOAT_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT && GMX_SIMD_HAVE_LOADU
-static inline SimdFloat gmx_simdcall
+#elif GMX_SIMD_FLOAT_WIDTH == 8
+static inline Simd4NFloat gmx_simdcall
 loadUNDuplicate4(const float* f)
 {
     return loadU1DualHsimd(f);
 }
-static inline SimdFloat gmx_simdcall
+static inline Simd4NFloat gmx_simdcall
 load4DuplicateN(const float* f)
 {
     return loadDuplicateHsimd(f);
 }
 #endif
-#else //GMX_SIMD_HAVE_FLOAT
+#endif //GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
+#else  //GMX_SIMD_HAVE_FLOAT
 #define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0
 #endif
 
 #if GMX_SIMD_HAVE_DOUBLE
-#if GMX_SIMD_DOUBLE_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
-#elif GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
+#if GMX_SIMD_DOUBLE_WIDTH < 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_DOUBLE)
+#elif GMX_SIMD_DOUBLE_WIDTH == 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE GMX_SIMD_HAVE_LOADU
 //For GMX_SIMD_DOUBLE_WIDTH>4 it is the reponsibility of the implementation to set
 //GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
 #endif
 
-#if GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-static inline SimdDouble gmx_simdcall
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
+#if GMX_SIMD_DOUBLE_WIDTH < 4
+using Simd4NDouble = Simd4Double;
+#define GMX_SIMD4N_DOUBLE_WIDTH 4
+#else
+using Simd4NDouble = SimdDouble;
+#define GMX_SIMD4N_DOUBLE_WIDTH GMX_SIMD_DOUBLE_WIDTH
+#endif
+
+#if GMX_SIMD_DOUBLE_WIDTH <= 4
+static inline Simd4NDouble gmx_simdcall
 loadUNDuplicate4(const double* f)
 {
-    return SimdDouble(*f);
+    return Simd4NDouble(*f);
 }
-static inline SimdDouble gmx_simdcall
+static inline Simd4NDouble gmx_simdcall
 load4DuplicateN(const double* f)
 {
-    return load<SimdDouble>(f);
+    return load<Simd4NDouble>(f);
 }
-static inline SimdDouble gmx_simdcall
+static inline Simd4NDouble gmx_simdcall
 loadU4NOffset(const double* f, int)
 {
-    return loadU<SimdDouble>(f);
+    return loadU<Simd4NDouble>(f);
 }
-#elif GMX_SIMD_DOUBLE_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE && GMX_SIMD_HAVE_LOADU
-static inline SimdDouble gmx_simdcall
+#elif GMX_SIMD_DOUBLE_WIDTH == 8
+static inline Simd4NDouble gmx_simdcall
 loadUNDuplicate4(const double* f)
 {
     return loadU1DualHsimd(f);
 }
-static inline SimdDouble gmx_simdcall
+static inline Simd4NDouble gmx_simdcall
 load4DuplicateN(const double* f)
 {
     return loadDuplicateHsimd(f);
 }
 #endif
-#else //GMX_SIMD_HAVE_DOUBLE
+#endif //GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
+#else  //GMX_SIMD_HAVE_DOUBLE
 #define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
 #endif
 
@@ -664,6 +731,16 @@ load4DuplicateN(const double* f)
 #define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
 #endif
 
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
+#if GMX_DOUBLE
+using Simd4NReal = Simd4NDouble;
+#define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_DOUBLE_WIDTH
+#else
+using Simd4NReal = Simd4NFloat;
+#define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_FLOAT_WIDTH
+#endif
+#endif
+
 //! \}  end of name-group proxy objects
 
 }      // namespace gmx
diff --git a/src/gromacs/simd/tests/simd_floatingpoint_util.cpp b/src/gromacs/simd/tests/simd_floatingpoint_util.cpp
index 78338938a1..a43dfc5d7f 100644
--- a/src/gromacs/simd/tests/simd_floatingpoint_util.cpp
+++ b/src/gromacs/simd/tests/simd_floatingpoint_util.cpp
@@ -919,11 +919,12 @@ TEST_F(SimdFloatingpointUtilTest, reduceIncr4SumHsimd)
 
 #endif      // GMX_SIMD_HAVE_HSIMD_UTIL_REAL
 
-#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
+//Test Currently doesn't work for GMX_SIMD_REAL_WIDTH<4. Should be fixed by having GMX_EXPECT_SIMD_REAL_EQ which works for both Simd and Simd4
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL && GMX_SIMD_REAL_WIDTH >= 4
 
 TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
 {
-    SimdReal        v0, v1;
+    Simd4NReal      v0, v1;
     int             i;
     real            data[GMX_SIMD_REAL_WIDTH/4];
     std::iota(data, data+GMX_SIMD_REAL_WIDTH/4, 1);
@@ -936,7 +937,7 @@ TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
         val0_[i*4] = val0_[i*4+1] = val0_[i*4+2] = val0_[i*4+3] = data[i];
     }
 
-    v0 = load<SimdReal>(val0_);
+    v0 = load<Simd4NReal>(val0_);
     v1 = loadUNDuplicate4(data);
 
     GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
@@ -944,9 +945,9 @@ TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
 
 TEST_F(SimdFloatingpointUtilTest, load4DuplicateN)
 {
-    SimdReal        v0, v1;
-    int             i;
-    real            data[4] = { 1, 2, 3, 4};
+    Simd4NReal        v0, v1;
+    int               i;
+    real              data[4] = { 1, 2, 3, 4};
 
     for (i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
     {
@@ -956,7 +957,7 @@ TEST_F(SimdFloatingpointUtilTest, load4DuplicateN)
         val0_[i*4+3] = data[3];
     }
 
-    v0 = load<SimdReal>(val0_);
+    v0 = load<Simd4NReal>(val0_);
     v1 = load4DuplicateN(val0_);
 
     GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
@@ -977,8 +978,8 @@ TEST_F(SimdFloatingpointUtilTest, loadU4NOffset)
         val0_[i*4+3] = data[3+offset*i];
     }
 
-    const SimdReal v0 = load<SimdReal>(val0_);
-    const SimdReal v1 = loadU4NOffset(data, offset);
+    const Simd4NReal v0 = load<Simd4NReal>(val0_);
+    const Simd4NReal v1 = loadU4NOffset(data, offset);
 
     GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
 }
diff --git a/src/gromacs/simd/tests/simd_memory.cpp b/src/gromacs/simd/tests/simd_memory.cpp
index 55b1893adf..24aab65dbc 100644
--- a/src/gromacs/simd/tests/simd_memory.cpp
+++ b/src/gromacs/simd/tests/simd_memory.cpp
@@ -79,8 +79,8 @@ class ArrayRefTest : public test::SimdTest
         using ArrayRefType = TypeParam;
         using PointerType  = typename ArrayRefType::pointer;
         using ValueType    = typename ArrayRefType::value_type;
-        using ElementType  = typename std::remove_const<typename SimdTraits<ValueType>::type>::type;
-        static constexpr int width = SimdTraits<ValueType>::width;
+        using ElementType  = typename std::remove_const<typename gmx::internal::SimdTraits<ValueType>::type>::type;
+        static constexpr int width = gmx::internal::SimdTraits<ValueType>::width;
 
         /*! \brief Run the same tests all the time
          *
-- 
2.11.4.GIT