From 3a895ab6d5c02e5f28bcdeed7ede260bea62b677 Mon Sep 17 00:00:00 2001
From: Erik Lindahl <erik@kth.se>
Date: Mon, 6 Jul 2015 22:32:41 +0200
Subject: [PATCH] Extended SIMD, impl for Arm Neon and 64-bit Neon Asimd

Tested with gcc-4.9 and 5.3, using Neon on Jetson
TK1 and TX1 (both in 32-bit mode) and Neon asimd on
APM X-Gene (64-bit mode).

Change-Id: I4b9f0da49b1dda3b199eeec8e45688d49a43783e
---
 cmake/gmxDetectSimd.cmake                          |   4 +-
 cmake/gmxManageSimd.cmake                          |  24 +-
 docs/doxygen/suppressions.txt                      |   2 -
 src/gromacs/hardware/cpuinfo.cpp                   |   8 +-
 src/gromacs/simd/impl_arm_neon/impl_arm_neon.h     |   8 +-
 .../simd/impl_arm_neon/impl_arm_neon_definitions.h |  81 ++
 .../impl_arm_neon_general.h}                       |  35 +-
 .../simd/impl_arm_neon/impl_arm_neon_simd4_float.h | 434 +++++++--
 .../simd/impl_arm_neon/impl_arm_neon_simd_float.h  | 982 +++++++++++++++++----
 .../simd/impl_arm_neon/impl_arm_neon_util_float.h  | 361 ++++++++
 .../simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h |   8 +-
 .../impl_arm_neon_asimd_definitions.h              |  83 ++
 .../impl_arm_neon_asimd_general.h}                 |   9 +-
 .../impl_arm_neon_asimd_simd4_float.h              | 124 +++
 .../impl_arm_neon_asimd_simd_double.h              | 911 +++++++++++++++----
 .../impl_arm_neon_asimd_simd_float.h               | 136 +--
 .../impl_arm_neon_asimd_util_double.h              | 306 +++++++
 .../impl_arm_neon_asimd_util_float.h}              |  35 +-
 src/gromacs/simd/simd.h                            |   4 +
 19 files changed, 2990 insertions(+), 565 deletions(-)
 create mode 100644 src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h
 rename src/gromacs/simd/{impl_arm_neon_asimd/impl_arm_neon_asimd_common.h => impl_arm_neon/impl_arm_neon_general.h} (64%)
 rewrite src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h (76%)
 create mode 100644 src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h
 create mode 100644 src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h
 copy src/gromacs/simd/{impl_arm_neon/impl_arm_neon.h => impl_arm_neon_asimd/impl_arm_neon_asimd_general.h} (89%)
 create mode 100644 src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h
 rewrite src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h (74%)
 create mode 100644 src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h
 rename src/gromacs/simd/{impl_arm_neon/impl_arm_neon_common.h => impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h} (55%)

diff --git a/cmake/gmxDetectSimd.cmake b/cmake/gmxDetectSimd.cmake
index f9e6a29c27..eb4c8679eb 100644
--- a/cmake/gmxDetectSimd.cmake
+++ b/cmake/gmxDetectSimd.cmake
@@ -118,10 +118,10 @@ function(gmx_suggest_simd _suggested_simd)
                 set(OUTPUT_SIMD "IBM_VMX")
             elseif(OUTPUT_TMP MATCHES " qpx ")
                 set(OUTPUT_SIMD "IBM_QPX")
-            elseif(OUTPUT_TMP MATCHES " neon ")
-                set(OUTPUT_SIMD "ARM_NEON")
             elseif(OUTPUT_TMP MATCHES " neon_asimd ")
                 set(OUTPUT_SIMD "ARM_NEON_ASIMD")
+            elseif(OUTPUT_TMP MATCHES " neon ")
+                set(OUTPUT_SIMD "ARM_NEON")
             endif()
         endif()
 
diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake
index c9c39e9999..76e337970f 100644
--- a/cmake/gmxManageSimd.cmake
+++ b/cmake/gmxManageSimd.cmake
@@ -328,45 +328,39 @@ elseif(GMX_SIMD STREQUAL "AVX_512ER")
 
 elseif(GMX_SIMD STREQUAL "ARM_NEON")
 
-    gmx_find_cflag_for_source(CFLAGS_ARM_NEON "C compiler 32-bit ARM NEON flag"
+    gmx_find_cflag_for_source(CFLAGS_ARM_NEON "C compiler ARM NEON flag"
                               "#include<arm_neon.h>
                               int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
                               SIMD_C_FLAGS
-                              "-mfpu=neon" "")
-    gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON "C++ compiler 32-bit ARM NEON flag"
+                              "-mfpu=neon-vfpv4" "-mfpu=neon" "")
+    gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON "C++ compiler ARM NEON flag"
                                 "#include<arm_neon.h>
                                 int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
                                 SIMD_CXX_FLAGS
-                                "-mfpu=neon" "-D__STDC_CONSTANT_MACROS" "")
+                                "-mfpu=neon-vfpv4" "-mfpu=neon" "-D__STDC_CONSTANT_MACROS" "")
 
     if(NOT CFLAGS_ARM_NEON OR NOT CXXFLAGS_ARM_NEON)
-        message(FATAL_ERROR "Cannot find ARM 32-bit NEON compiler flag. Use a newer compiler, or disable NEON SIMD.")
+        message(FATAL_ERROR "Cannot find ARM NEON compiler flag. Use a newer compiler, or disable NEON SIMD.")
     endif()
 
     set(GMX_SIMD_ARM_NEON 1)
     set(SIMD_STATUS_MESSAGE "Enabling 32-bit ARM NEON SIMD instructions")
 
 elseif(GMX_SIMD STREQUAL "ARM_NEON_ASIMD")
-    # Gcc-4.8.1 appears to have a bug where the c++ compiler requires
-    # -D__STDC_CONSTANT_MACROS if we include arm_neon.h
 
     gmx_find_cflag_for_source(CFLAGS_ARM_NEON_ASIMD "C compiler ARM NEON Advanced SIMD flag"
                               "#include<arm_neon.h>
-                              int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);return vgetq_lane_f64(x,0)>0;}"
+                               int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);x=vrndnq_f64(x);return vgetq_lane_f64(x,0)>0;}"
                               SIMD_C_FLAGS
                               "")
     gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON_ASIMD "C++ compiler ARM NEON Advanced SIMD flag"
                                 "#include<arm_neon.h>
-                                int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);return vgetq_lane_f64(x,0)>0;}"
+                                int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);x=vrndnq_f64(x);return vgetq_lane_f64(x,0)>0;}"
                                 SIMD_CXX_FLAGS
-                                "-D__STDC_CONSTANT_MACROS" "")
+                                "")
 
     if(NOT CFLAGS_ARM_NEON_ASIMD OR NOT CXXFLAGS_ARM_NEON_ASIMD)
-        message(FATAL_ERROR "Cannot find ARM (AArch64) NEON Advanced SIMD compiler flag. Use a newer compiler, or disable SIMD.")
-    endif()
-
-    if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "4.9")
-        message(WARNING "At least gcc-4.8.1 has many bugs for ARM (AArch64) NEON Advanced SIMD compilation. You might need gcc version 4.9 or later.")
+        message(FATAL_ERROR "Compiler does not fully support ARM (AArch64) NEON Advanced SIMD. Use a newer compiler (gcc version 4.9 or later), or disable SIMD.")
     endif()
 
     if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.4")
diff --git a/docs/doxygen/suppressions.txt b/docs/doxygen/suppressions.txt
index 3b00cd0e5e..782db00290 100644
--- a/docs/doxygen/suppressions.txt
+++ b/docs/doxygen/suppressions.txt
@@ -46,8 +46,6 @@ src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.cpp: warning: inc
 src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h: warning: should include "nbnxn_simd.h"
 
 # Temporary while we change the SIMD implementation
-src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h: warning: should include "simd.h"
-src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h: warning: should include "simd.h"
 src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx_common.h: warning: should include "simd.h"
 src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_common.h: warning: should include "simd.h"
 src/gromacs/simd/impl_intel_mic/impl_intel_mic_common.h: warning: should include "simd.h"
diff --git a/src/gromacs/hardware/cpuinfo.cpp b/src/gromacs/hardware/cpuinfo.cpp
index 29fc25520e..afe09bbdf1 100644
--- a/src/gromacs/hardware/cpuinfo.cpp
+++ b/src/gromacs/hardware/cpuinfo.cpp
@@ -799,7 +799,13 @@ detectProcCpuInfoArm(const std::map<std::string, std::string>   &cpuInfo,
         }
         if (s.find("asimd") != std::string::npos)
         {
-            features->insert(CpuInfo::Feature::Arm_NeonAsimd);
+            // At least Jetson TX1 runs a 32-bit environment by default, although
+            // the kernel is 64-bits, and reports asimd feature flags. We cannot
+            // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
+            if (sizeof(void *) == 8)
+            {
+                features->insert(CpuInfo::Feature::Arm_NeonAsimd);
+            }
         }
     }
 }
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h
index 5357dfc978..2c4718e200 100644
--- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h
+++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h
@@ -36,7 +36,13 @@
 #ifndef GMX_SIMD_IMPL_ARM_NEON_H
 #define GMX_SIMD_IMPL_ARM_NEON_H
 
+#include "impl_arm_neon_definitions.h"
+#include "impl_arm_neon_general.h"
+// Arm/Neon cannot do double precision SIMD4
 #include "impl_arm_neon_simd4_float.h"
+// Arm/Neon cannot do double precision SIMD
 #include "impl_arm_neon_simd_float.h"
+// Arm/Neon cannot do double precision SIMD utilities
+#include "impl_arm_neon_util_float.h"
 
-#endif /* GMX_SIMD_IMPL_ARM_NEON_H */
+#endif // GMX_SIMD_IMPL_ARM_NEON_H
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h
new file mode 100644
index 0000000000..adae1cc7da
--- /dev/null
+++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h
@@ -0,0 +1,81 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_ARM_NEON_DEFINITIONS_H
+#define GMX_SIMD_IMPL_ARM_NEON_DEFINITIONS_H
+
+#define GMX_SIMD                                1
+#define GMX_SIMD_HAVE_FLOAT                     1
+#define GMX_SIMD_HAVE_DOUBLE                    0
+#define GMX_SIMD_HAVE_LOADU                     1
+#define GMX_SIMD_HAVE_STOREU                    1
+#define GMX_SIMD_HAVE_LOGICAL                   1
+#define GMX_SIMD_HAVE_FMA                       1
+#define GMX_SIMD_HAVE_FINT32_EXTRACT            1
+#define GMX_SIMD_HAVE_FINT32_LOGICAL            1
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS        1
+#define GMX_SIMD_HAVE_DINT32_EXTRACT            0
+#define GMX_SIMD_HAVE_DINT32_LOGICAL            0
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS        0
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT     0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT   1
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT     1
+#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT          0
+#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT         0
+#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT          0
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE    0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE  0
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE    0
+#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE         0
+#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE        0
+#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE         0
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT   1
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE  0
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT          0  // No need for half-simd, width is 4
+#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE         0
+
+#define GMX_SIMD4_HAVE_FLOAT                    1
+#define GMX_SIMD4_HAVE_DOUBLE                   0
+
+// Implementation details
+#define GMX_SIMD_FLOAT_WIDTH                    4
+#undef  GMX_SIMD_DOUBLE_WIDTH
+#define GMX_SIMD_FINT32_WIDTH                   4
+#undef  GMX_SIMD_DINT32_WIDTH
+#define GMX_SIMD4_WIDTH                         4
+#define GMX_SIMD_RSQRT_BITS                     8
+#define GMX_SIMD_RCP_BITS                       8
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_DEFINITIONS_H
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_general.h
similarity index 64%
rename from src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h
rename to src/gromacs/simd/impl_arm_neon/impl_arm_neon_general.h
index 580016e0a9..696a53d902 100644
--- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h
+++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_general.h
@@ -32,29 +32,20 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
+#ifndef GMX_SIMD_IMPL_ARM_NEON_GENERAL_H
+#define GMX_SIMD_IMPL_ARM_NEON_GENERAL_H
 
-#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_COMMON_H
-#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_COMMON_H
+namespace gmx
+{
 
-/* ARM (AArch64) NEON Advanced SIMD */
+static inline void
+simdPrefetch(void * m)
+{
+#ifdef __GNUC__
+    __builtin_prefetch(m);
+#endif
+}
 
-/* Inherit single-precision and integer part from 32-bit arm */
-#include "gromacs/simd/impl_arm_neon/impl_arm_neon.h"
+}      // namespace gmx
 
-/* Override some capability definitions from ARM 32-bit NEON - we now have double */
-#undef  GMX_SIMD_HAVE_DOUBLE
-#define GMX_SIMD_HAVE_DOUBLE                  1
-#undef  GMX_SIMD_HAVE_DINT32
-#define GMX_SIMD_HAVE_DINT32                  1
-#undef  GMX_SIMD_HAVE_DINT32_EXTRACT
-#define GMX_SIMD_HAVE_DINT32_EXTRACT          1
-#undef  GMX_SIMD_HAVE_DINT32_LOGICAL
-#define GMX_SIMD_HAVE_DINT32_LOGICAL          1
-#undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
-#define GMX_SIMD_HAVE_DINT32_ARITHMETICS      1
-
-/* Implementation details */
-#define GMX_SIMD_DOUBLE_WIDTH                 2
-#define GMX_SIMD_DINT32_WIDTH                 2
-
-#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_COMMON_H */
+#endif // GMX_SIMD_IMPL_ARM_NEON_GENERAL_H
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h
index f2feb6d6ea..c05635ee66 100644
--- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h
+++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h
@@ -32,66 +32,392 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-
 #ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H
 #define GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H
 
-#include <math.h>
+#include "config.h"
+
+#include <cassert>
+#include <cstddef>
 
 #include <arm_neon.h>
 
-#include "impl_arm_neon_common.h"
-#include "impl_arm_neon_simd_float.h"
-
-/* ARM 32-bit Neon is already 4-wide in single, so just reuse float type for SIMD4 */
-#define Simd4Float                 SimdFloat
-#define simd4LoadF                 simdLoadF
-#define simd4Load1F                simdLoad1F
-#define simd4Set1F                 simdSet1F
-#define simd4StoreF                simdStoreF
-#define simd4LoadUF                simdLoadUF
-#define simd4StoreUF               simdStoreUF
-#define simd4SetZeroF              simdSetZeroF
-#define simd4AddF                  simdAddF
-#define simd4SubF                  simdSubF
-#define simd4MulF                  simdMulF
-#define simd4FmaddF                simdFmaddF
-#define simd4FmsubF                simdFmsubF
-#define simd4FnmaddF               simdFnmaddF
-#define simd4FnmsubF               simdFnmsubF
-#define simd4AndF                  simdAndF
-#define simd4AndNotF               simdAndNotF
-#define simd4OrF                   simdOrF
-#define simd4XorF                  simdXorF
-#define simd4RsqrtF                simdRsqrtF
-#define simd4AbsF                  simdAbsF
-#define simd4NegF                  simdNegF
-#define simd4MaxF                  simdMaxF
-#define simd4MinF                  simdMinF
-#define simd4RoundF                simdRoundF
-#define simd4TruncF                simdTruncF
-#define simd4DotProductF           simd4DotProductF_arm_neon
-#define Simd4FBool                 SimdFBool
-#define simd4CmpEqF                simdCmpEqF
-#define simd4CmpLtF                simdCmpLtF
-#define simd4CmpLeF                simdCmpLeF
-#define simd4AndFB                 simdAndFB
-#define simd4OrFB                  simdOrFB
-#define simd4AnyTrueFB             simdAnyTrueFB
-#define simd4MaskF                 simdMaskF
-#define simd4MaskNotF              simdMaskNotF
-#define simd4BlendF                simdBlendF
-#define simd4ReduceF               simdReduceF
-
-/* SIMD4 Dot product helper function */
-static inline float
-simd4DotProductF_arm_neon(SimdFloat a, SimdFloat b)
-{
-    SimdFloat  c;
-    c = simdMulF(a, b);
+namespace gmx
+{
+
+class Simd4Float
+{
+    public:
+        Simd4Float() {}
+
+        Simd4Float(float f) : simdInternal_(vdupq_n_f32(f)) {}
+
+        // Internal utility constructor to simplify return statements
+        Simd4Float(float32x4_t simd) : simdInternal_(simd) {}
+
+        float32x4_t  simdInternal_;
+};
+
+class Simd4FBool
+{
+    public:
+        Simd4FBool() {}
+
+        //! \brief Construct from scalar bool
+        Simd4FBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {}
+
+        // Internal utility constructor to simplify return statements
+        Simd4FBool(uint32x4_t simd) : simdInternal_(simd) {}
+
+        uint32x4_t  simdInternal_;
+};
+
+static inline Simd4Float gmx_simdcall
+load4(const float *m)
+{
+    assert(size_t(m) % 16 == 0);
+    return {
+               vld1q_f32(m)
+    };
+}
+
+static inline void gmx_simdcall
+store4(float *m, Simd4Float a)
+{
+    assert(size_t(m) % 16 == 0);
+    vst1q_f32(m, a.simdInternal_);
+}
+
+static inline Simd4Float gmx_simdcall
+load4U(const float *m)
+{
+    return {
+               vld1q_f32(m)
+    };
+}
+
+static inline void gmx_simdcall
+store4U(float *m, Simd4Float a)
+{
+    vst1q_f32(m, a.simdInternal_);
+}
+
+static inline Simd4Float gmx_simdcall
+simd4SetZeroF()
+{
+    return {
+               vdupq_n_f32(0.0f)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator&(Simd4Float a, Simd4Float b)
+{
+    return {
+               vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a.simdInternal_),
+                                               vreinterpretq_s32_f32(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+andNot(Simd4Float a, Simd4Float b)
+{
+    return {
+               vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b.simdInternal_),
+                                               vreinterpretq_s32_f32(a.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator|(Simd4Float a, Simd4Float b)
+{
+    return {
+               vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a.simdInternal_),
+                                               vreinterpretq_s32_f32(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator^(Simd4Float a, Simd4Float b)
+{
+    return {
+               vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a.simdInternal_),
+                                               vreinterpretq_s32_f32(b.simdInternal_)))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator+(Simd4Float a, Simd4Float b)
+{
+    return {
+               vaddq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator-(Simd4Float a, Simd4Float b)
+{
+    return {
+               vsubq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator-(Simd4Float x)
+{
+    return {
+               vnegq_f32(x.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+operator*(Simd4Float a, Simd4Float b)
+{
+    return {
+               vmulq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline Simd4Float gmx_simdcall
+fma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#else
+               vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#endif
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#else
+               vnegq_f32(vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#endif
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fnma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#else
+               vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#endif
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fnms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#else
+               vnegq_f32(vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#endif
+    };
+}
+#endif
+
+static inline Simd4Float gmx_simdcall
+rsqrt(Simd4Float x)
+{
+    return {
+               vrsqrteq_f32(x.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+abs(Simd4Float x)
+{
+    return {
+               vabsq_f32( x.simdInternal_ )
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+max(Simd4Float a, Simd4Float b)
+{
+    return {
+               vmaxq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+min(Simd4Float a, Simd4Float b)
+{
+    return {
+               vminq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline Simd4Float gmx_simdcall
+round(Simd4Float x)
+{
+    // Convert x to nearest integer
+    float32x4_t signBitOfX = vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(x.simdInternal_)));
+    float32x4_t half       = vdupq_n_f32(0.5f);
+    float32x4_t corr       = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half), vreinterpretq_u32_f32(signBitOfX)));
+
+    int32x4_t   integerX   = vcvtq_s32_f32(vaddq_f32(x.simdInternal_, corr));
+
+    // Convert back to float
+
+    return {
+               vcvtq_f32_s32(integerX)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+trunc(Simd4Float x)
+{
+    return {
+               vcvtq_f32_s32( vcvtq_s32_f32(x.simdInternal_) )
+    };
+}
+#endif
+
+static inline void gmx_simdcall
+transpose(Simd4Float * v0, Simd4Float * v1,
+          Simd4Float * v2, Simd4Float * v3)
+{
+    float32x4x2_t  t0 = vuzpq_f32(v0->simdInternal_, v2->simdInternal_);
+    float32x4x2_t  t1 = vuzpq_f32(v1->simdInternal_, v3->simdInternal_);
+    float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+    float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+    v0->simdInternal_ = t2.val[0];
+    v1->simdInternal_ = t3.val[0];
+    v2->simdInternal_ = t2.val[1];
+    v3->simdInternal_ = t3.val[1];
+}
+
+static inline Simd4FBool gmx_simdcall
+operator==(Simd4Float a, Simd4Float b)
+{
+    return {
+               vceqq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator!=(Simd4Float a, Simd4Float b)
+{
+    return {
+               vmvnq_u32(vceqq_f32(a.simdInternal_, b.simdInternal_))
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator<(Simd4Float a, Simd4Float b)
+{
+    return {
+               vcltq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator<=(Simd4Float a, Simd4Float b)
+{
+    return {
+               vcleq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator&&(Simd4FBool a, Simd4FBool b)
+{
+    return {
+               vandq_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline Simd4FBool gmx_simdcall
+operator||(Simd4FBool a, Simd4FBool b)
+{
+    return {
+               vorrq_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline bool gmx_simdcall
+anyTrue(Simd4FBool a)
+{
+    uint32x4_t x = a.simdInternal_;
+    uint32x4_t y = vextq_u32(x, x, 2);
+
+    x = vorrq_u32(x, y);
+    y = vextq_u32(x, x, 1);
+    x = vorrq_u32(x, y);
+    return (vgetq_lane_u32(x, 0) != 0);
+}
+#endif
+
+static inline Simd4Float gmx_simdcall
+selectByMask(Simd4Float a, Simd4FBool m)
+{
+    return {
+               vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.simdInternal_),
+                                               m.simdInternal_))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+selectByNotMask(Simd4Float a, Simd4FBool m)
+{
+    return {
+               vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.simdInternal_),
+                                               m.simdInternal_))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
+{
+    return {
+               vbslq_f32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline float gmx_simdcall
+reduce(Simd4Float a)
+{
+    float32x4_t x = a.simdInternal_;
+    float32x4_t y = vextq_f32(x, x, 2);
+
+    x = vaddq_f32(x, y);
+    y = vextq_f32(x, x, 1);
+    x = vaddq_f32(x, y);
+    return vgetq_lane_f32(x, 0);
+}
+
+static inline float gmx_simdcall
+dotProduct(Simd4Float a, Simd4Float b)
+{
+    Simd4Float c;
+
+    c = a * b;
     /* set 4th element to 0, then add all of them */
-    c = vsetq_lane_f32(0.0f, c, 3);
-    return simdReduceF_arm_neon(c);
+    c.simdInternal_ = vsetq_lane_f32(0.0f, c.simdInternal_, 3);
+    return reduce(c);
 }
+#endif
+
+}      // namespace gmx
 
-#endif /* GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H */
+#endif // GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h
dissimilarity index 76%
index 287371e005..a347619a80 100644
--- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h
+++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h
@@ -1,196 +1,786 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
-#define GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
-
-#include <math.h>
-
-#include <arm_neon.h>
-
-#include "impl_arm_neon_common.h"
-
-/****************************************************
- *      SINGLE PRECISION SIMD IMPLEMENTATION        *
- ****************************************************/
-#define SimdFloat            float32x4_t
-#define simdLoadF            vld1q_f32
-#define simdLoad1F           vld1q_dup_f32
-#define simdSet1F            vdupq_n_f32
-#define simdStoreF           vst1q_f32
-#define simdLoadUF           vld1q_f32
-#define simdStoreUF          vst1q_f32
-#define simdSetZeroF()       vdupq_n_f32(0.0f)
-#define simdAddF             vaddq_f32
-#define simdSubF             vsubq_f32
-#define simdMulF             vmulq_f32
-#ifdef __ARM_FEATURE_FMA
-#    define simdFmaddF(a, b, c)  vfmaq_f32(c, b, a)
-#    define simdFmsubF(a, b, c)  vnegq_f32(vfmsq_f32(c, b, a))
-#    define simdFnmaddF(a, b, c) vfmaq_f32(c, b, a)
-#    define simdFnmsubF(a, b, c) vnegq_f32(vfmaq_f32(c, b, a))
-#else
-#    define simdFmaddF(a, b, c)  vmlaq_f32(c, b, a)
-#    define simdFmsubF(a, b, c)  vnegq_f32(vmlsq_f32(c, b, a))
-#    define simdFnmaddF(a, b, c) vmlsq_f32(c, b, a)
-#    define simdFnmsubF(a, b, c) vnegq_f32(vmlaq_f32(c, b, a))
-#endif
-#define simdAndF(a, b)        vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)))
-#define simdAndNotF(a, b)     vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a)))
-#define simdOrF(a, b)         vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)))
-#define simdXorF(a, b)        vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)))
-#define simdRsqrtF            vrsqrteq_f32
-#define simdRsqrtIterF(lu, x) vmulq_f32(lu, vrsqrtsq_f32(vmulq_f32(lu, lu), x))
-#define simdRcpF              vrecpeq_f32
-#define simdRcpIterF(lu, x)   vmulq_f32(lu, vrecpsq_f32(lu, x))
-#define simdAbsF(x)         vabsq_f32(x)
-#define simdNegF(x)         vnegq_f32(x)
-#define simdMaxF             vmaxq_f32
-#define simdMinF             vminq_f32
-#define simdRoundF(x)        simdCvtI2F(simdCvtF2I(x))
-#define simdTruncF(x)        simdCvtI2F(simdCvttF2I(x))
-#define simdFractionF(x)     vsubq_f32(x, simdTruncF(x))
-#define simdGetExponentF    simdGetExponentF_arm_neon
-#define simdGetMantissaF    simdGetMantissaF_arm_neon
-#define simdSetExponentF    simdSetExponentF_arm_neon
-/* integer datatype corresponding to float: SimdFInt32 */
-#define SimdFInt32         int32x4_t
-#define simdLoadFI(m)        vld1q_s32(m)
-#define simdSet1FI           vdupq_n_s32
-#define simdStoreFI(m, x)    vst1q_s32(m, x)
-#define simdLoadUFI(m)       vld1q_s32(m)
-#define simdStoreUFI(m, x)   vst1q_s32(m, x)
-#define simdSetZeroFI()      vdupq_n_s32(0)
-#define simdCvttF2I          vcvtq_s32_f32
-#define simdCvtF2I(x)        vcvtq_s32_f32(simdAddF(simdOrF(simdAndF(vdupq_n_f32(-0.0f), x), vdupq_n_f32(0.5f)), x))
-#define simdCvtI2F           vcvtq_f32_s32
-#define simdExtractFI(x, i)  vgetq_lane_s32(x, i)
-/* Integer logical ops on SimdFInt32 */
-#define simdSlliFI           vshlq_n_s32
-#define simdSrliFI           vshrq_n_s32
-#define simdAndFI            vandq_s32
-#define simdAndNotFI(a, b)   vbicq_s32(b, a)
-#define simdOrFI             vorrq_s32
-#define simdXorFI            veorq_s32
-/* Integer arithmetic ops on SimdFInt32 */
-#define simdAddFI            vaddq_s32
-#define simdSubFI            vsubq_s32
-#define simdMulFI            vmulq_s32
-/* Boolean & comparison operations on SimdFloat */
-#define SimdFBool           uint32x4_t
-#define simdCmpEqF           vceqq_f32
-#define simdCmpLtF           vcltq_f32
-#define simdCmpLeF           vcleq_f32
-#define simdAndFB            vandq_u32
-#define simdOrFB             vorrq_u32
-#define simdAnyTrueFB        simdAnyTrueFB_arm_neon
-#define simdMaskF(a, sel)     vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), sel))
-#define simdMaskNotF(a, sel)  vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), sel))
-#define simdBlendF(a, b, sel)     vbslq_f32(sel, b, a)
-#define simdReduceF(a)       simdReduceF_arm_neon(a)
-/* Boolean & comparison operations on SimdFInt32 */
-#define SimdFIBool          uint32x4_t
-#define simdCmpEqFI          vceqq_s32
-#define simdCmpLtFI          vcltq_s32
-#define simdAndFIB           vandq_u32
-#define simdOrFIB            vorrq_u32
-#define simdAnyTrueFIB       simdAnyTrueFB
-#define simdMaskFI(a, sel)     vandq_s32(a, vreinterpretq_s32_u32(sel))
-#define simdMaskNotFI(a, sel)  vbicq_s32(a, vreinterpretq_s32_u32(sel))
-#define simdBlendFI(a, b, sel)     vbslq_s32(sel, b, a)
-/* Conversions between different booleans */
-#define simdCvtFB2FIB(x)     (x)
-#define simdCvtFIB2FB(x)     (x)
-
-/****************************************************
- * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
- ****************************************************/
-static inline SimdFloat
-simdGetExponentF_arm_neon(SimdFloat x)
-{
-    const float32x4_t expmask    = vreinterpretq_f32_s32( vdupq_n_s32(0x7F800000) );
-    int32x4_t         iexp;
-
-    iexp = vreinterpretq_s32_f32(simdAndF(x, expmask));
-    iexp = vsubq_s32(vshrq_n_s32(iexp, 23), vdupq_n_s32(127));
-    return vcvtq_f32_s32(iexp);
-}
-
-
-static inline SimdFloat
-simdGetMantissaF_arm_neon(SimdFloat x)
-{
-    const float32x4_t mantmask   = vreinterpretq_f32_s32( vdupq_n_s32(0x007FFFFF) );
-    const float32x4_t one        = vdupq_n_f32(1.0f);
-
-    /* Get mantissa */
-    x = simdAndF(mantmask, x);
-    /* Reset zero (but correctly biased) exponent */
-    return simdOrF(x, one);
-}
-
-
-static inline SimdFloat
-simdSetExponentF_arm_neon(SimdFloat x)
-{
-    int32x4_t  iexp = simdCvtF2I(x);
-
-    iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23);
-    return vreinterpretq_f32_s32(iexp);
-}
-
-static inline float
-simdReduceF_arm_neon(SimdFloat a)
-{
-    float32x4_t b = vextq_f32(a, a, 2);
-
-    a = vaddq_f32(a, b);
-    b = vextq_f32(a, a, 1);
-    a = vaddq_f32(a, b);
-    return vgetq_lane_f32(a, 0);
-}
-
-static inline int
-simdAnyTrueFB_arm_neon(SimdFBool a)
-{
-    uint32x4_t b = vextq_u32(a, a, 2);
-
-    a = simdOrFB(a, b);
-    b = vextq_u32(a, a, 1);
-    a = simdOrFB(a, b);
-    return (vgetq_lane_u32(a, 0) != 0);
-}
-
-#endif /* GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H */
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
+#define GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include <arm_neon.h>
+
+namespace gmx
+{
+
+class SimdFloat
+{
+    public:
+        SimdFloat() {}
+
+        SimdFloat(float f) : simdInternal_(vdupq_n_f32(f)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFloat(float32x4_t simd) : simdInternal_(simd) {}
+
+        float32x4_t  simdInternal_;
+};
+
+class SimdFInt32
+{
+    public:
+        SimdFInt32() {}
+
+        SimdFInt32(std::int32_t i) : simdInternal_(vdupq_n_s32(i)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFInt32(int32x4_t simd) : simdInternal_(simd) {}
+
+        int32x4_t  simdInternal_;
+};
+
+class SimdFBool
+{
+    public:
+        SimdFBool() {}
+
+        SimdFBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFBool(uint32x4_t simd) : simdInternal_(simd) {}
+
+        uint32x4_t  simdInternal_;
+};
+
+class SimdFIBool
+{
+    public:
+        SimdFIBool() {}
+
+        SimdFIBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdFIBool(uint32x4_t simd) : simdInternal_(simd) {}
+
+        uint32x4_t  simdInternal_;
+};
+
+static inline SimdFloat gmx_simdcall
+load(const float *m)
+{
+    assert(std::size_t(m) % 16 == 0);
+    return {
+               vld1q_f32(m)
+    };
+}
+
+static inline void gmx_simdcall
+store(float *m, SimdFloat a)
+{
+    assert(std::size_t(m) % 16 == 0);
+    vst1q_f32(m, a.simdInternal_);
+}
+
+static inline SimdFloat gmx_simdcall
+loadU(const float *m)
+{
+    return {
+               vld1q_f32(m)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(float *m, SimdFloat a)
+{
+    vst1q_f32(m, a.simdInternal_);
+}
+
+static inline SimdFloat gmx_simdcall
+setZeroF()
+{
+    return {
+               vdupq_n_f32(0.0f)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+loadFI(const std::int32_t * m)
+{
+    assert(std::size_t(m) % 16 == 0);
+    return {
+               vld1q_s32(m)
+    };
+}
+
+static inline void gmx_simdcall
+store(std::int32_t * m, SimdFInt32 a)
+{
+    assert(std::size_t(m) % 16 == 0);
+    vst1q_s32(m, a.simdInternal_);
+}
+
+static inline SimdFInt32 gmx_simdcall
+loadUFI(const std::int32_t *m)
+{
+    return {
+               vld1q_s32(m)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(std::int32_t * m, SimdFInt32 a)
+{
+    vst1q_s32(m, a.simdInternal_);
+}
+
+static inline SimdFInt32 gmx_simdcall
+setZeroFI()
+{
+    return {
+               vdupq_n_s32(0)
+    };
+}
+
+template<int index> gmx_simdcall
+static inline std::int32_t
+extract(SimdFInt32 a)
+{
+    return vgetq_lane_s32(a.simdInternal_, index);
+}
+
+static inline SimdFloat gmx_simdcall
+operator&(SimdFloat a, SimdFloat b)
+{
+    return {
+               vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a.simdInternal_),
+                                               vreinterpretq_s32_f32(b.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+andNot(SimdFloat a, SimdFloat b)
+{
+    return {
+               vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b.simdInternal_),
+                                               vreinterpretq_s32_f32(a.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator|(SimdFloat a, SimdFloat b)
+{
+    return {
+               vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a.simdInternal_),
+                                               vreinterpretq_s32_f32(b.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator^(SimdFloat a, SimdFloat b)
+{
+    return {
+               vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a.simdInternal_),
+                                               vreinterpretq_s32_f32(b.simdInternal_)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator+(SimdFloat a, SimdFloat b)
+{
+    return {
+               vaddq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator-(SimdFloat a, SimdFloat b)
+{
+    return {
+               vsubq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator-(SimdFloat x)
+{
+    return {
+               vnegq_f32(x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+operator*(SimdFloat a, SimdFloat b)
+{
+    return {
+               vmulq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline SimdFloat gmx_simdcall
+fma(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#else
+               vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#endif
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+fms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#else
+               vnegq_f32(vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#endif
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+fnma(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#else
+               vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+#endif
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+fnms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+#ifdef __ARM_FEATURE_FMA
+               vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#else
+               vnegq_f32(vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+#endif
+    };
+}
+#endif
+
+static inline SimdFloat gmx_simdcall
+rsqrt(SimdFloat x)
+{
+    return {
+               vrsqrteq_f32(x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+rsqrtIter(SimdFloat lu, SimdFloat x)
+{
+    return {
+               vmulq_f32(lu.simdInternal_, vrsqrtsq_f32(vmulq_f32(lu.simdInternal_, lu.simdInternal_), x.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+rcp(SimdFloat x)
+{
+    return {
+               vrecpeq_f32(x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+rcpIter(SimdFloat lu, SimdFloat x)
+{
+    return {
+               vmulq_f32(lu.simdInternal_, vrecpsq_f32(lu.simdInternal_, x.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
+{
+    b.simdInternal_ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(b.simdInternal_),
+                                                      m.simdInternal_));
+
+    return {
+               vaddq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
+{
+    SimdFloat tmp = a * b;
+
+    return {
+               vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp.simdInternal_),
+                                               m.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
+{
+#ifdef __ARM_FEATURE_FMA
+    float32x4_t tmp = vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_);
+#else
+    float32x4_t tmp = vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_);
+#endif
+
+    return {
+               vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp),
+                                               m.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzRsqrt(SimdFloat x, SimdFBool m)
+{
+#ifndef NDEBUG
+    x.simdInternal_ = vbslq_f32(m, vdupq_n_f32(1.0f), x.simdInternal_);
+#endif
+    return {
+               vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrsqrteq_f32(x.simdInternal_)),
+                                               m.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+maskzRcp(SimdFloat x, SimdFBool m)
+{
+#ifndef NDEBUG
+    x.simdInternal_ = vbslq_f32(m, vdupq_n_f32(1.0f), x.simdInternal_);
+#endif
+    return {
+               vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrecpeq_f32(x.simdInternal_)),
+                                               m.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+abs(SimdFloat x)
+{
+    return {
+               vabsq_f32( x.simdInternal_ )
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+max(SimdFloat a, SimdFloat b)
+{
+    return {
+               vmaxq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+min(SimdFloat a, SimdFloat b)
+{
+    return {
+               vminq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+// Round and trunc operations are defined at the end of this file, since they
+// need to use float-to-integer and integer-to-float conversions.
+
+static inline SimdFloat gmx_simdcall
+frexp(SimdFloat value, SimdFInt32 * exponent)
+{
+    const int32x4_t    exponentMask   = vdupq_n_s32(0x7F800000);
+    const int32x4_t    mantissaMask   = vdupq_n_s32(0x807FFFFF);
+    const int32x4_t    exponentBias   = vdupq_n_s32(126); // add 1 to make our definition identical to frexp()
+    const float32x4_t  half           = vdupq_n_f32(0.5f);
+    int32x4_t          iExponent;
+
+    iExponent               = vandq_s32(vreinterpretq_s32_f32(value.simdInternal_), exponentMask);
+    iExponent               = vsubq_s32(vshrq_n_s32(iExponent, 23), exponentBias);
+    exponent->simdInternal_ = iExponent;
+
+    return {
+               vreinterpretq_f32_s32(vorrq_s32(vandq_s32(vreinterpretq_s32_f32(value.simdInternal_),
+                                                         mantissaMask),
+                                               vreinterpretq_s32_f32(half)))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+ldexp(SimdFloat value, SimdFInt32 exponent)
+{
+    const int32x4_t exponentBias = vdupq_n_s32(127);
+    int32x4_t       iExponent;
+
+    iExponent = vshlq_n_s32( vaddq_s32(exponent.simdInternal_, exponentBias), 23);
+
+    return {
+               vmulq_f32(value.simdInternal_, vreinterpretq_f32_s32(iExponent))
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline float gmx_simdcall
+reduce(SimdFloat a)
+{
+    float32x4_t x = a.simdInternal_;
+    float32x4_t y = vextq_f32(x, x, 2);
+
+    x = vaddq_f32(x, y);
+    y = vextq_f32(x, x, 1);
+    x = vaddq_f32(x, y);
+    return vgetq_lane_f32(x, 0);
+}
+#endif
+
+static inline SimdFBool gmx_simdcall
+operator==(SimdFloat a, SimdFloat b)
+{
+    return {
+               vceqq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator!=(SimdFloat a, SimdFloat b)
+{
+    return {
+               vmvnq_u32(vceqq_f32(a.simdInternal_, b.simdInternal_))
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator<(SimdFloat a, SimdFloat b)
+{
+    return {
+               vcltq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator<=(SimdFloat a, SimdFloat b)
+{
+    return {
+               vcleq_f32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+testBits(SimdFloat a)
+{
+    uint32x4_t tmp = vreinterpretq_u32_f32(a.simdInternal_);
+
+    return {
+               vtstq_u32(tmp, tmp)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator&&(SimdFBool a, SimdFBool b)
+{
+
+    return {
+               vandq_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+operator||(SimdFBool a, SimdFBool b)
+{
+    return {
+               vorrq_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline bool gmx_simdcall
+anyTrue(SimdFBool a)
+{
+    uint32x4_t x = a.simdInternal_;
+    uint32x4_t y = vextq_u32(x, x, 2);
+
+    x = vorrq_u32(x, y);
+    y = vextq_u32(x, x, 1);
+    x = vorrq_u32(x, y);
+    return (vgetq_lane_u32(x, 0) != 0);
+}
+#endif
+
+static inline SimdFloat gmx_simdcall
+selectByMask(SimdFloat a, SimdFBool m)
+{
+    return {
+               vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.simdInternal_),
+                                               m.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+selectByNotMask(SimdFloat a, SimdFBool m)
+{
+    return {
+               vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.simdInternal_),
+                                               m.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+blend(SimdFloat a, SimdFloat b, SimdFBool sel)
+{
+    return {
+               vbslq_f32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator<<(SimdFInt32 a, int n)
+{
+    return {
+               vshlq_n_s32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator>>(SimdFInt32 a, int n)
+{
+    return {
+               vshrq_n_s32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator&(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vandq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+andNot(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vbicq_s32(b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator|(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vorrq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator^(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               veorq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator+(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vaddq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator-(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vsubq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+operator*(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vmulq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator==(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vceqq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+testBits(SimdFInt32 a)
+{
+    return {
+               vtstq_s32(a.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator<(SimdFInt32 a, SimdFInt32 b)
+{
+    return {
+               vcltq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator&&(SimdFIBool a, SimdFIBool b)
+{
+    return {
+               vandq_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+operator||(SimdFIBool a, SimdFIBool b)
+{
+    return {
+               vorrq_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline bool gmx_simdcall
+anyTrue(SimdFIBool a)
+{
+    uint32x4_t x = a.simdInternal_;
+    uint32x4_t y = vextq_u32(x, x, 2);
+
+    x = vorrq_u32(x, y);
+    y = vextq_u32(x, x, 1);
+    x = vorrq_u32(x, y);
+    return (vgetq_lane_u32(x, 0) != 0);
+}
+#endif
+
+static inline SimdFInt32 gmx_simdcall
+selectByMask(SimdFInt32 a, SimdFIBool m)
+{
+    return {
+               vandq_s32(a.simdInternal_, vreinterpretq_s32_u32(m.simdInternal_))
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+selectByNotMask(SimdFInt32 a, SimdFIBool m)
+{
+    return {
+               vbicq_s32(a.simdInternal_, vreinterpretq_s32_u32(m.simdInternal_))
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
+{
+    return {
+               vbslq_s32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline SimdFInt32 gmx_simdcall
+cvtR2I(SimdFloat a)
+{
+    float32x4_t signBitOfA = vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(a.simdInternal_)));
+    float32x4_t half       = vdupq_n_f32(0.5f);
+    float32x4_t corr       = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half), vreinterpretq_u32_f32(signBitOfA)));
+
+    return {
+               vcvtq_s32_f32(vaddq_f32(a.simdInternal_, corr))
+    };
+}
+#endif
+
+static inline SimdFInt32 gmx_simdcall
+cvttR2I(SimdFloat a)
+{
+    return {
+               vcvtq_s32_f32(a.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+cvtI2R(SimdFInt32 a)
+{
+    return {
+               vcvtq_f32_s32(a.simdInternal_)
+    };
+}
+
+static inline SimdFIBool gmx_simdcall
+cvtB2IB(SimdFBool a)
+{
+    return {
+               a.simdInternal_
+    };
+}
+
+static inline SimdFBool gmx_simdcall
+cvtIB2B(SimdFIBool a)
+{
+    return {
+               a.simdInternal_
+    };
+}
+
+// Override for Neon-Asimd
+#if GMX_SIMD_ARM_NEON
+static inline SimdFloat gmx_simdcall
+round(SimdFloat x)
+{
+    return cvtI2R(cvtR2I(x));
+}
+
+static inline SimdFloat gmx_simdcall
+trunc(SimdFloat x)
+{
+    return cvtI2R(cvttR2I(x));
+}
+#endif
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h
new file mode 100644
index 0000000000..b9d91598be
--- /dev/null
+++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h
@@ -0,0 +1,361 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
+#define GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include <arm_neon.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_arm_neon_simd_float.h"
+
+
+namespace gmx
+{
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const float *        base,
+                    const std::int32_t   offset[],
+                    SimdFloat *          v0,
+                    SimdFloat *          v1,
+                    SimdFloat *          v2,
+                    SimdFloat *          v3)
+{
+    assert(std::size_t(offset) % 16 == 0);
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 4 == 0);
+
+    // Unfortunately we cannot use the beautiful Neon structured load
+    // instructions since the data comes from four different memory locations.
+    float32x4x2_t  t0 = vuzpq_f32(vld1q_f32( base + align * offset[0] ), vld1q_f32( base + align * offset[2] ));
+    float32x4x2_t  t1 = vuzpq_f32(vld1q_f32( base + align * offset[1] ), vld1q_f32( base + align * offset[3] ));
+    float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+    float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+    v0->simdInternal_ = t2.val[0];
+    v1->simdInternal_ = t3.val[0];
+    v2->simdInternal_ = t2.val[1];
+    v3->simdInternal_ = t3.val[1];
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadTranspose(const float *        base,
+                    const std::int32_t   offset[],
+                    SimdFloat *          v0,
+                    SimdFloat *          v1)
+{
+    assert(std::size_t(offset) % 16 == 0);
+    assert(std::size_t(base) % 8 == 0);
+    assert(align % 2 == 0);
+
+    v0->simdInternal_  = vcombine_f32(vld1_f32( base + align * offset[0] ),
+                                      vld1_f32( base + align * offset[2] ));
+    v1->simdInternal_  = vcombine_f32(vld1_f32( base + align * offset[1] ),
+                                      vld1_f32( base + align * offset[3] ));
+
+    float32x4x2_t tmp  = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
+
+    v0->simdInternal_  = tmp.val[0];
+    v1->simdInternal_  = tmp.val[1];
+}
+
+static const int c_simdBestPairAlignmentFloat = 2;
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTranspose(const float *        base,
+                     const std::int32_t   offset[],
+                     SimdFloat *          v0,
+                     SimdFloat *          v1,
+                     SimdFloat *          v2)
+{
+    assert(std::size_t(offset) % 16 == 0);
+
+    float32x4x2_t  t0 = vuzpq_f32(vld1q_f32( base + align * offset[0] ), vld1q_f32( base + align * offset[2] ));
+    float32x4x2_t  t1 = vuzpq_f32(vld1q_f32( base + align * offset[1] ), vld1q_f32( base + align * offset[3] ));
+    float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+    float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+    v0->simdInternal_ = t2.val[0];
+    v1->simdInternal_ = t3.val[0];
+    v2->simdInternal_ = t2.val[1];
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterStoreU(float *              base,
+                       const std::int32_t   offset[],
+                       SimdFloat            v0,
+                       SimdFloat            v1,
+                       SimdFloat            v2)
+{
+    assert(std::size_t(offset) % 16 == 0);
+
+    float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
+
+    vst1_f32( base + align * offset[0], vget_low_f32(tmp.val[0]) );
+    vst1_f32( base + align * offset[1], vget_low_f32(tmp.val[1]) );
+    vst1_f32( base + align * offset[2], vget_high_f32(tmp.val[0]) );
+    vst1_f32( base + align * offset[3], vget_high_f32(tmp.val[1]) );
+
+    vst1q_lane_f32( base + align * offset[0] + 2, v2.simdInternal_, 0);
+    vst1q_lane_f32( base + align * offset[1] + 2, v2.simdInternal_, 1);
+    vst1q_lane_f32( base + align * offset[2] + 2, v2.simdInternal_, 2);
+    vst1q_lane_f32( base + align * offset[3] + 2, v2.simdInternal_, 3);
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterIncrU(float *              base,
+                      const std::int32_t   offset[],
+                      SimdFloat            v0,
+                      SimdFloat            v1,
+                      SimdFloat            v2)
+{
+    assert(std::size_t(offset) % 16 == 0);
+
+    if (align < 4)
+    {
+        float32x2_t   t0, t1, t2, t3;
+        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
+
+        t0 = vget_low_f32(tmp.val[0]);
+        t1 = vget_low_f32(tmp.val[1]);
+        t2 = vget_high_f32(tmp.val[0]);
+        t3 = vget_high_f32(tmp.val[1]);
+
+        t0 = vadd_f32(t0, vld1_f32(base + align * offset[0]));
+        vst1_f32(base + align * offset[0], t0);
+        base[ align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0);
+
+        t1 = vadd_f32(t1, vld1_f32(base + align * offset[1]));
+        vst1_f32(base + align * offset[1], t1);
+        base[ align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1);
+
+        t2 = vadd_f32(t2, vld1_f32(base + align * offset[2]));
+        vst1_f32(base + align * offset[2], t2);
+        base[ align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2);
+
+        t3 = vadd_f32(t3, vld1_f32(base + align * offset[3]));
+        vst1_f32(base + align * offset[3], t3);
+        base[ align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3);
+    }
+    else
+    {
+        // Extra elements means we can use full width-4 load/store operations
+        float32x4x2_t  t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
+        float32x4x2_t  t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0f));
+        float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+        float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+        float32x4_t    t4 = t2.val[0];
+        float32x4_t    t5 = t3.val[0];
+        float32x4_t    t6 = t2.val[1];
+        float32x4_t    t7 = t3.val[1];
+
+        vst1q_f32(base + align * offset[0], vaddq_f32(t4, vld1q_f32(base + align * offset[0])));
+        vst1q_f32(base + align * offset[1], vaddq_f32(t5, vld1q_f32(base + align * offset[1])));
+        vst1q_f32(base + align * offset[2], vaddq_f32(t6, vld1q_f32(base + align * offset[2])));
+        vst1q_f32(base + align * offset[3], vaddq_f32(t7, vld1q_f32(base + align * offset[3])));
+    }
+}
+
+template <int align>
+static inline void gmx_simdcall
+transposeScatterDecrU(float *              base,
+                      const std::int32_t   offset[],
+                      SimdFloat            v0,
+                      SimdFloat            v1,
+                      SimdFloat            v2)
+{
+    assert(std::size_t(offset) % 16 == 0);
+
+    if (align < 4)
+    {
+        float32x2_t   t0, t1, t2, t3;
+        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
+
+        t0 = vget_low_f32(tmp.val[0]);
+        t1 = vget_low_f32(tmp.val[1]);
+        t2 = vget_high_f32(tmp.val[0]);
+        t3 = vget_high_f32(tmp.val[1]);
+
+        t0 = vsub_f32(vld1_f32(base + align * offset[0]), t0);
+        vst1_f32(base + align * offset[0], t0);
+        base[ align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0);
+
+        t1 = vsub_f32(vld1_f32(base + align * offset[1]), t1);
+        vst1_f32(base + align * offset[1], t1);
+        base[ align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1);
+
+        t2 = vsub_f32(vld1_f32(base + align * offset[2]), t2);
+        vst1_f32(base + align * offset[2], t2);
+        base[ align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2);
+
+        t3 = vsub_f32(vld1_f32(base + align * offset[3]), t3);
+        vst1_f32(base + align * offset[3], t3);
+        base[ align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3);
+    }
+    else
+    {
+        // Extra elements means we can use full width-4 load/store operations
+        float32x4x2_t  t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
+        float32x4x2_t  t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0f));
+        float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+        float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+        float32x4_t    t4 = t2.val[0];
+        float32x4_t    t5 = t3.val[0];
+        float32x4_t    t6 = t2.val[1];
+        float32x4_t    t7 = t3.val[1];
+
+        vst1q_f32(base + align * offset[0], vsubq_f32(vld1q_f32(base + align * offset[0]), t4));
+        vst1q_f32(base + align * offset[1], vsubq_f32(vld1q_f32(base + align * offset[1]), t5));
+        vst1q_f32(base + align * offset[2], vsubq_f32(vld1q_f32(base + align * offset[2]), t6));
+        vst1q_f32(base + align * offset[3], vsubq_f32(vld1q_f32(base + align * offset[3]), t7));
+    }
+}
+
+static inline void gmx_simdcall
+expandScalarsToTriplets(SimdFloat    scalar,
+                        SimdFloat *  triplets0,
+                        SimdFloat *  triplets1,
+                        SimdFloat *  triplets2)
+{
+    float32x2_t lo, hi;
+    float32x4_t t0, t1, t2, t3;
+
+    lo = vget_low_f32(scalar.simdInternal_);
+    hi = vget_high_f32(scalar.simdInternal_);
+
+    t0 = vdupq_lane_f32(lo, 0);
+    t1 = vdupq_lane_f32(lo, 1);
+    t2 = vdupq_lane_f32(hi, 0);
+    t3 = vdupq_lane_f32(hi, 1);
+
+    triplets0->simdInternal_ = vextq_f32(t0, t1, 1);
+    triplets1->simdInternal_ = vextq_f32(t1, t2, 2);
+    triplets2->simdInternal_ = vextq_f32(t2, t3, 3);
+}
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const float *  base,
+                             SimdFInt32     offset,
+                             SimdFloat *    v0,
+                             SimdFloat *    v1,
+                             SimdFloat *    v2,
+                             SimdFloat *    v3)
+{
+    GMX_ALIGNED(int, GMX_SIMD_FINT32_WIDTH)  ioffset[GMX_SIMD_FINT32_WIDTH];
+
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 4 == 0);
+
+    store(ioffset, offset);
+    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
+}
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const float *   base,
+                             SimdFInt32      offset,
+                             SimdFloat *     v0,
+                             SimdFloat *     v1)
+{
+    GMX_ALIGNED(int, GMX_SIMD_FINT32_WIDTH)  ioffset[GMX_SIMD_FINT32_WIDTH];
+
+    store(ioffset, offset);
+    gatherLoadTranspose<align>(base, ioffset, v0, v1);
+}
+
+
+
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUBySimdIntTranspose(const float *  base,
+                              SimdFInt32     offset,
+                              SimdFloat *    v0,
+                              SimdFloat *    v1)
+{
+    GMX_ALIGNED(int, GMX_SIMD_FINT32_WIDTH)  ioffset[GMX_SIMD_FINT32_WIDTH];
+
+    store(ioffset, offset);
+    v0->simdInternal_ = vcombine_f32(vld1_f32( base + align * ioffset[0] ),
+                                     vld1_f32( base + align * ioffset[2] ));
+    v1->simdInternal_ = vcombine_f32(vld1_f32( base + align * ioffset[1] ),
+                                     vld1_f32( base + align * ioffset[3] ));
+    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_ );
+    v0->simdInternal_ = tmp.val[0];
+    v1->simdInternal_ = tmp.val[1];
+}
+
+static inline float gmx_simdcall
+reduceIncr4ReturnSum(float *    m,
+                     SimdFloat  v0,
+                     SimdFloat  v1,
+                     SimdFloat  v2,
+                     SimdFloat  v3)
+{
+    assert(std::size_t(m) % 16 == 0);
+
+    float32x4x2_t  t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
+    float32x4x2_t  t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_);
+    float32x4x2_t  t2 = vtrnq_f32(t0.val[0], t1.val[0]);
+    float32x4x2_t  t3 = vtrnq_f32(t0.val[1], t1.val[1]);
+    v0.simdInternal_ = t2.val[0];
+    v1.simdInternal_ = t3.val[0];
+    v2.simdInternal_ = t2.val[1];
+    v3.simdInternal_ = t3.val[1];
+
+    v0 = v0 + v1;
+    v2 = v2 + v3;
+    v0 = v0 + v2;
+    v2 = v0 + load(m);
+    store(m, v2);
+
+    return reduce(v0);
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h
index d8d58e84f7..72798b1fb8 100644
--- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h
@@ -36,7 +36,13 @@
 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
 
+#include "impl_arm_neon_asimd_definitions.h"
+#include "impl_arm_neon_asimd_general.h"
+// No double precision SIMD4 on neon Asimd
+#include "impl_arm_neon_asimd_simd4_float.h"
 #include "impl_arm_neon_asimd_simd_double.h"
 #include "impl_arm_neon_asimd_simd_float.h"
+#include "impl_arm_neon_asimd_util_double.h"
+#include "impl_arm_neon_asimd_util_float.h"
 
-#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_H */
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h
new file mode 100644
index 0000000000..ab1a7783af
--- /dev/null
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h
@@ -0,0 +1,83 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_DEFINITIONS_H
+#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_DEFINITIONS_H
+
+// ARM (AArch64) NEON Advanced SIMD
+
+#define GMX_SIMD                                1
+#define GMX_SIMD_HAVE_FLOAT                     1
+#define GMX_SIMD_HAVE_DOUBLE                    1
+#define GMX_SIMD_HAVE_LOADU                     1
+#define GMX_SIMD_HAVE_STOREU                    1
+#define GMX_SIMD_HAVE_LOGICAL                   1
+#define GMX_SIMD_HAVE_FMA                       1
+#define GMX_SIMD_HAVE_FINT32_EXTRACT            1
+#define GMX_SIMD_HAVE_FINT32_LOGICAL            1
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS        1
+#define GMX_SIMD_HAVE_DINT32_EXTRACT            1
+#define GMX_SIMD_HAVE_DINT32_LOGICAL            1
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS        1
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT     0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT   1
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT     1
+#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT          0
+#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT         0
+#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT          0
+#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE    0
+#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE  1
+#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE    1
+#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE         0
+#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE        0
+#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE         0
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT   1
+#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE  1
+#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT          0  // No need for half-simd, width is 4
+#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE         0
+
+#define GMX_SIMD4_HAVE_FLOAT                    1
+#define GMX_SIMD4_HAVE_DOUBLE                   0
+
+// Implementation details
+#define GMX_SIMD_FLOAT_WIDTH                    4
+#define GMX_SIMD_DOUBLE_WIDTH                   2
+#define GMX_SIMD_FINT32_WIDTH                   4
+#define GMX_SIMD_DINT32_WIDTH                   2
+#define GMX_SIMD4_WIDTH                         4
+#define GMX_SIMD_RSQRT_BITS                     8
+#define GMX_SIMD_RCP_BITS                       8
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_DEFINITIONS_H
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_general.h
similarity index 89%
copy from src/gromacs/simd/impl_arm_neon/impl_arm_neon.h
copy to src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_general.h
index 5357dfc978..2b7162367d 100644
--- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_general.h
@@ -33,10 +33,9 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-#ifndef GMX_SIMD_IMPL_ARM_NEON_H
-#define GMX_SIMD_IMPL_ARM_NEON_H
+#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_GENERAL_H
+#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_GENERAL_H
 
-#include "impl_arm_neon_simd4_float.h"
-#include "impl_arm_neon_simd_float.h"
+#include "gromacs/simd/impl_arm_neon/impl_arm_neon_general.h"
 
-#endif /* GMX_SIMD_IMPL_ARM_NEON_H */
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_GENERAL_H
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h
new file mode 100644
index 0000000000..47d691f18d
--- /dev/null
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD4_FLOAT_H
+#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD4_FLOAT_H
+
+#include "config.h"
+
+#include <arm_neon.h>
+
+#include "gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h"
+
+namespace gmx
+{
+
+static inline Simd4Float gmx_simdcall
+fma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fnma(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+fnms(Simd4Float a, Simd4Float b, Simd4Float c)
+{
+    return {
+               vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+round(Simd4Float x)
+{
+    return {
+               vrndnq_f32(x.simdInternal_)
+    };
+}
+
+static inline Simd4Float gmx_simdcall
+trunc(Simd4Float x)
+{
+    return {
+               vrndq_f32(x.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(Simd4FBool a)
+{
+    return (vmaxvq_u32(a.simdInternal_) != 0);
+}
+
+static inline float gmx_simdcall
+reduce(Simd4Float a)
+{
+    float32x4_t b = a.simdInternal_;
+    b = vpaddq_f32(b, b);
+    b = vpaddq_f32(b, b);
+    return vgetq_lane_f32(b, 0);
+}
+
+static inline float gmx_simdcall
+dotProduct(Simd4Float a, Simd4Float b)
+{
+    Simd4Float c;
+
+    c = a * b;
+    /* set 4th element to 0, then add all of them */
+    c.simdInternal_ = vsetq_lane_f32(0.0f, c.simdInternal_, 3);
+    return reduce(c);
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD4_FLOAT_H
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h
dissimilarity index 74%
index c984185dfe..c33a7f6e65 100644
--- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h
@@ -1,178 +1,733 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
-#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
-
-#include <math.h>
-
-#include <arm_neon.h>
-
-#include "impl_arm_neon_asimd_common.h"
-
-/****************************************************
- *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
- ****************************************************/
-#define SimdDouble          float64x2_t
-#define simdLoadD            vld1q_f64
-#define simdLoad1D           vld1q_dup_f64
-#define simdSet1D            vdupq_n_f64
-#define simdStoreD           vst1q_f64
-#define simdLoadUD           vld1q_f64
-#define simdStoreUD          vst1q_f64
-#define simdSetZeroD()       vdupq_n_f64(0.0)
-#define simdAddD             vaddq_f64
-#define simdSubD             vsubq_f64
-#define simdMulD             vmulq_f64
-#define simdFmaddD(a, b, c)  vfmaq_f64(c, b, a)
-#define simdFmsubD(a, b, c)  vnegq_f64(vfmsq_f64(c, b, a))
-#define simdFnmaddD(a, b, c) vfmsq_f64(c, b, a)
-#define simdFnmsubD(a, b, c) vnegq_f64(vfmaq_f64(c, b, a))
-#define simdAndD(a, b)        (float64x2_t)(vandq_s64((int64x2_t)(a), (int64x2_t)(b)))
-#define simdAndNotD(a, b)     (float64x2_t)(vbicq_s64((int64x2_t)(b), (int64x2_t)(a)))
-#define simdOrD(a, b)         (float64x2_t)(vorrq_s64((int64x2_t)(a), (int64x2_t)(b)))
-#define simdXorD(a, b)        (float64x2_t)(veorq_s64((int64x2_t)(a), (int64x2_t)(b)))
-#define simdRsqrtD            vrsqrteq_f64
-#define simdRsqrtIterD(lu, x) vmulq_f64(lu, vrsqrtsq_f64(vmulq_f64(lu, lu), x))
-#define simdRcpD              vrecpeq_f64
-#define simdRcpIterD(lu, x)   vmulq_f64(lu, vrecpsq_f64(lu, x))
-#define simdAbsD(x)         vabsq_f64(x)
-#define simdNegD(x)         vnegq_f64(x)
-#define simdMaxD             vmaxq_f64
-#define simdMinD             vminq_f64
-#define simdRoundD(x)        vrndnq_f64(x)
-#define simdTruncD(x)        vrndq_f64(x)
-#define simdFractionD(x)     vsubq_f64(x, simdTruncD(x))
-#define simdGetExponentD    simdGetExponentD_arm_neon_asimd
-#define simdGetMantissaD    simdGetMantissaD_arm_neon_asimd
-#define simdSetExponentD    simdSetExponentD_arm_neon_asimd
-/* integer datatype corresponding to double: SimdDInt32 */
-#define SimdDInt32          int32x2_t
-#define simdLoadDI(m)        vld1_s32(m)
-#define simdSet1DI           vdup_n_s32
-#define simdStoreDI(m, x)    vst1_s32(m, x)
-#define simdLoadUDI(m)       vld1_s32(m)
-#define simdStoreUDI(m, x)   vst1_s32(m, x)
-#define simdSetZeroDI()      vdup_n_s32(0)
-#define simdCvttD2I(x)       vmovn_s64(vcvtq_s64_f64(x))
-#define simdCvtD2I(x)        vmovn_s64(vcvtnq_s64_f64(x))
-#define simdCvtI2D(x)        vcvtq_f64_s64(vmovl_s32(x))
-#define simdExtractDI(x, i)  vget_lane_s32(x, i)
-/* Integer logical ops on SimdDInt32 */
-#define simdSlliDI           vshl_n_s32
-#define simdSrliDI           vshr_n_s32
-#define simdAndDI            vand_s32
-#define simdAndNotDI(a, b)    vbic_s32(b, a)
-#define simdOrDI             vorr_s32
-#define simdXorDI            veor_s32
-/* Integer arithmetic ops on SimdDInt32 */
-#define simdAddDI            vadd_s32
-#define simdSubDI            vsub_s32
-#define simdMulDI            vmul_s32
-/* Boolean & comparison operations on SimdDouble */
-#define SimdDBool           uint64x2_t
-#define simdCmpEqD           vceqq_f64
-#define simdCmpLtD           vcltq_f64
-#define simdCmpLeD           vcleq_f64
-#define simdAndDB            vandq_u64
-#define simdOrDB             vorrq_u64
-#define simdAnyTrueDB(x)     (vmaxvq_u32((uint32x4_t)(x)) != 0)
-#define simdMaskD(a, sel)     (float64x2_t)(vandq_u64((uint64x2_t)(a), sel))
-#define simdMaskNotD(a, sel)  (float64x2_t)(vbicq_u64((uint64x2_t)(a), sel))
-#define simdBlendD(a, b, sel)     vbslq_f64(sel, b, a)
-#define simdReduceD(a)       simdReduceD_arm_neon_asimd(a)
-/* Boolean & comparison operations on SimdDInt32 */
-#define SimdDIBool          uint32x2_t
-#define simdCmpEqDI          vceq_s32
-#define simdCmpLtDI          vclt_s32
-#define simdAndDIB           vand_u32
-#define simdOrDIB            vorr_u32
-#define simdAnyTrueDIB(x)    (vmaxv_u32(x) != 0)
-#define simdMaskDI(a, sel)      vand_s32(a, vreinterpret_s32_u32(sel))
-#define simdMaskNotDI(a, sel)  vbic_s32(a, vreinterpret_s32_u32(sel))
-#define simdBlendDI(a, b, sel)     vbsl_s32(sel, b, a)
-/* Conversions between different booleans */
-#define simdCvtDB2DIB(x)     vqmovn_u64(x)
-#define simdCvtDIB2DB(x)     vorrq_u64(vmovl_u32(x), vshlq_n_u64(vmovl_u32(x), 32))
-
-/* Float/double conversion */
-#define simdCvtF2DD(f, d0, d1)  { *d0 = vcvt_f64_f32(vget_low_f32(f)); *d1 = vcvt_high_f64_f32(f); }
-#define simdCvtDD2F(d0, d1)     vcvt_high_f32_f64(vcvt_f32_f64(d0), d1)
-
-/****************************************************
- * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
- ****************************************************/
-static inline SimdDouble
-simdGetExponentD_arm_neon_asimd(SimdDouble x)
-{
-    const float64x2_t expmask    = (float64x2_t)( vdupq_n_s64(0x7FF0000000000000LL) );
-    int64x2_t         iexp;
-
-    iexp = (int64x2_t)(simdAndD(x, expmask));
-    iexp = vsubq_s64(vshrq_n_s64(iexp, 52), vdupq_n_s64(1023));
-    return vcvtq_f64_s64(iexp);
-}
-
-
-static inline SimdDouble
-simdGetMantissaD_arm_neon_asimd(SimdDouble x)
-{
-    const float64x2_t mantmask   = (float64x2_t)( vdupq_n_s64(0x000FFFFFFFFFFFFFLL) );
-    const float64x2_t one        = vdupq_n_f64(1.0);
-
-    /* Get mantissa */
-    x = simdAndD(mantmask, x);
-    /* Reset zero (but correctly biased) exponent */
-    return simdOrD(x, one);
-}
-
-
-static inline SimdDouble
-simdSetExponentD_arm_neon_asimd(SimdDouble x)
-{
-    int64x2_t  iexp = vcvtnq_s64_f64(x);
-
-    iexp = vshlq_n_s64(vaddq_s64(iexp, vdupq_n_s64(1023)), 52);
-    return (float64x2_t)(iexp);
-}
-
-static inline double
-simdReduceD_arm_neon_asimd(SimdDouble a)
-{
-    a = vpaddq_f64(a, a);
-    return vgetq_lane_f64(a, 0);
-}
-
-#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H */
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
+#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+
+#include <arm_neon.h>
+
+#include "impl_arm_neon_asimd_simd_float.h"
+
+namespace gmx
+{
+
+class SimdDouble
+{
+    public:
+        SimdDouble() {}
+
+        SimdDouble(double d) : simdInternal_(vdupq_n_f64(d)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDouble(float64x2_t simd) : simdInternal_(simd) {}
+
+        float64x2_t  simdInternal_;
+};
+
+class SimdDInt32
+{
+    public:
+        SimdDInt32() {}
+
+        SimdDInt32(std::int32_t i) : simdInternal_(vdup_n_s32(i)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDInt32(int32x2_t simd) : simdInternal_(simd) {}
+
+        int32x2_t  simdInternal_;
+};
+
+class SimdDBool
+{
+    public:
+        SimdDBool() {}
+
+        SimdDBool(bool b) : simdInternal_(vdupq_n_u64( b ? 0xFFFFFFFFFFFFFFFF : 0)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDBool(uint64x2_t simd) : simdInternal_(simd) {}
+
+        uint64x2_t  simdInternal_;
+};
+
+class SimdDIBool
+{
+    public:
+        SimdDIBool() {}
+
+        SimdDIBool(bool b) : simdInternal_(vdup_n_u32( b ? 0xFFFFFFFF : 0)) {}
+
+        // Internal utility constructor to simplify return statements
+        SimdDIBool(uint32x2_t simd) : simdInternal_(simd) {}
+
+        uint32x2_t  simdInternal_;
+};
+
+static inline SimdDouble gmx_simdcall
+load(const double *m)
+{
+    assert(std::size_t(m) % 16 == 0);
+    return {
+               vld1q_f64(m)
+    };
+}
+
+static inline void gmx_simdcall
+store(double *m, SimdDouble a)
+{
+    assert(std::size_t(m) % 16 == 0);
+    vst1q_f64(m, a.simdInternal_);
+}
+
+static inline SimdDouble gmx_simdcall
+loadU(const double *m)
+{
+    return {
+               vld1q_f64(m)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(double *m, SimdDouble a)
+{
+    vst1q_f64(m, a.simdInternal_);
+}
+
+static inline SimdDouble gmx_simdcall
+setZeroD()
+{
+    return {
+               vdupq_n_f64(0.0)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+loadDI(const std::int32_t * m)
+{
+    assert(std::size_t(m) % 8 == 0);
+    return {
+               vld1_s32(m)
+    };
+}
+
+static inline void gmx_simdcall
+store(std::int32_t * m, SimdDInt32 a)
+{
+    assert(std::size_t(m) % 8 == 0);
+    vst1_s32(m, a.simdInternal_);
+}
+
+static inline SimdDInt32 gmx_simdcall
+loadUDI(const std::int32_t *m)
+{
+    return {
+               vld1_s32(m)
+    };
+}
+
+static inline void gmx_simdcall
+storeU(std::int32_t * m, SimdDInt32 a)
+{
+    vst1_s32(m, a.simdInternal_);
+}
+
+static inline SimdDInt32 gmx_simdcall
+setZeroDI()
+{
+    return {
+               vdup_n_s32(0)
+    };
+}
+
+template<int index> gmx_simdcall
+static inline std::int32_t
+extract(SimdDInt32 a)
+{
+    return vget_lane_s32(a.simdInternal_, index);
+}
+
+static inline SimdDouble gmx_simdcall
+operator&(SimdDouble a, SimdDouble b)
+{
+    return {
+               float64x2_t(vandq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+andNot(SimdDouble a, SimdDouble b)
+{
+    return {
+               float64x2_t(vbicq_s64(int64x2_t(b.simdInternal_), int64x2_t(a.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator|(SimdDouble a, SimdDouble b)
+{
+    return {
+               float64x2_t(vorrq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator^(SimdDouble a, SimdDouble b)
+{
+    return {
+               float64x2_t(veorq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_)))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator+(SimdDouble a, SimdDouble b)
+{
+    return {
+               vaddq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator-(SimdDouble a, SimdDouble b)
+{
+    return {
+               vsubq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator-(SimdDouble x)
+{
+    return {
+               vnegq_f64(x.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+operator*(SimdDouble a, SimdDouble b)
+{
+    return {
+               vmulq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fma(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fms(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               vnegq_f64(vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fnma(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+fnms(SimdDouble a, SimdDouble b, SimdDouble c)
+{
+    return {
+               vnegq_f64(vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+rsqrt(SimdDouble x)
+{
+    return {
+               vrsqrteq_f64(x.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+rsqrtIter(SimdDouble lu, SimdDouble x)
+{
+    return {
+               vmulq_f64(lu.simdInternal_, vrsqrtsq_f64(vmulq_f64(lu.simdInternal_, lu.simdInternal_), x.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+rcp(SimdDouble x)
+{
+    return {
+               vrecpeq_f64(x.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+rcpIter(SimdDouble lu, SimdDouble x)
+{
+    return {
+               vmulq_f64(lu.simdInternal_, vrecpsq_f64(lu.simdInternal_, x.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
+{
+    float64x2_t addend = float64x2_t(vandq_u64(uint64x2_t(b.simdInternal_), m.simdInternal_));
+
+    return {
+               vaddq_f64(a.simdInternal_, addend)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
+{
+    float64x2_t prod = vmulq_f64(a.simdInternal_, b.simdInternal_);
+    return {
+               float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
+{
+    float64x2_t prod = vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_);
+
+    return {
+               float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzRsqrt(SimdDouble x, SimdDBool m)
+{
+    // The result will always be correct since we mask the result with m, but
+    // for debug builds we also want to make sure not to generate FP exceptions
+#ifndef NDEBUG
+    x.simdInternal_ = vbslq_f64(m.simdInternal_, vdupq_n_f64(1.0, x.simdInternal_);
+#endif
+    return {
+               float64x2_t(vandq_u64(uint64x2_t(vrsqrteq_f64(x.simdInternal_)), m.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+maskzRcp(SimdDouble x, SimdDBool m)
+{
+    // The result will always be correct since we mask the result with m, but
+    // for debug builds we also want to make sure not to generate FP exceptions
+#ifndef NDEBUG
+    x.simdInternal_ = vbslq_f64(m.simdInternal_, vdupq_n_f64(1.0, x.simdInternal_);
+#endif
+    return {
+               float64x2_t(vandq_u64(uint64x2_t(vrecpeq_f64(x.simdInternal_)), m.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+abs(SimdDouble x)
+{
+    return {
+               vabsq_f64( x.simdInternal_ )
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+max(SimdDouble a, SimdDouble b)
+{
+    return {
+               vmaxq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+min(SimdDouble a, SimdDouble b)
+{
+    return {
+               vminq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+round(SimdDouble x)
+{
+    return {
+               vrndnq_f64(x.simdInternal_)
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+trunc(SimdDouble x)
+{
+    return {
+               vrndq_f64( x.simdInternal_ )
+    };
+}
+
+static inline SimdDouble
+frexp(SimdDouble value, SimdDInt32 * exponent)
+{
+    const float64x2_t exponentMask = float64x2_t( vdupq_n_s64(0x7FF0000000000000LL) );
+    const float64x2_t mantissaMask = float64x2_t( vdupq_n_s64(0x800FFFFFFFFFFFFFLL) );
+
+    const int64x2_t   exponentBias = vdupq_n_s64(1022); // add 1 to make our definition identical to frexp()
+    const float64x2_t half         = vdupq_n_f64(0.5);
+    int64x2_t         iExponent;
+
+    iExponent               = vandq_s64( int64x2_t(value.simdInternal_), int64x2_t(exponentMask) );
+    iExponent               = vsubq_s64(vshrq_n_s64(iExponent, 52), exponentBias);
+    exponent->simdInternal_ = vmovn_s64(iExponent);
+
+    return {
+               float64x2_t(vorrq_s64(vandq_s64(int64x2_t(value.simdInternal_), int64x2_t(mantissaMask)), int64x2_t(half)))
+    };
+}
+
+static inline SimdDouble
+ldexp(SimdDouble value, SimdDInt32 exponent)
+{
+    const int64x2_t  exponentBias = vdupq_n_s64(1023);
+    int64x2_t        iExponent;
+
+    iExponent = vmovl_s32(exponent.simdInternal_);
+    iExponent = vshlq_n_s64(vaddq_s64(iExponent, exponentBias), 52);
+
+    return {
+               vmulq_f64(value.simdInternal_, float64x2_t(iExponent))
+    };
+}
+
+static inline double gmx_simdcall
+reduce(SimdDouble a)
+{
+    float64x2_t b = vpaddq_f64(a.simdInternal_, a.simdInternal_);
+    return vgetq_lane_f64(b, 0);
+}
+
+static inline SimdDBool gmx_simdcall
+operator==(SimdDouble a, SimdDouble b)
+{
+    return {
+               vceqq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator!=(SimdDouble a, SimdDouble b)
+{
+    return {
+               vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.simdInternal_, b.simdInternal_))))
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator<(SimdDouble a, SimdDouble b)
+{
+    return {
+               vcltq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator<=(SimdDouble a, SimdDouble b)
+{
+    return {
+               vcleq_f64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+testBits(SimdDouble a)
+{
+    return {
+               vtstq_s64( int64x2_t(a.simdInternal_), int64x2_t(a.simdInternal_) )
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator&&(SimdDBool a, SimdDBool b)
+{
+    return {
+               vandq_u64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+operator||(SimdDBool a, SimdDBool b)
+{
+    return {
+               vorrq_u64(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdDBool a)
+{
+    return (vmaxvq_u32((uint32x4_t)(a.simdInternal_)) != 0);
+}
+
+static inline SimdDouble gmx_simdcall
+selectByMask(SimdDouble a, SimdDBool m)
+{
+    return {
+        float64x2_t(vandq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+selectByNotMask(SimdDouble a, SimdDBool m)
+{
+    return {
+        float64x2_t(vbicq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+blend(SimdDouble a, SimdDouble b, SimdDBool sel)
+{
+    return {
+        vbslq_f64(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator<<(SimdDInt32 a, int n)
+{
+    return {
+        vshl_n_s32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator>>(SimdDInt32 a, int n)
+{
+    return {
+        vshr_n_s32(a.simdInternal_, n)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator&(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vand_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+andNot(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vbic_s32(b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator|(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vorr_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator^(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        veor_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator+(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vadd_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator-(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vsub_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+operator*(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vmul_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator==(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vceq_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+testBits(SimdDInt32 a)
+{
+    return {
+        vtst_s32( a.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator<(SimdDInt32 a, SimdDInt32 b)
+{
+    return {
+        vclt_s32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator&&(SimdDIBool a, SimdDIBool b)
+{
+    return {
+        vand_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+operator||(SimdDIBool a, SimdDIBool b)
+{
+    return {
+        vorr_u32(a.simdInternal_, b.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdDIBool a)
+{
+    return (vmaxv_u32(a.simdInternal_) != 0);
+}
+
+static inline SimdDInt32 gmx_simdcall
+selectByMask(SimdDInt32 a, SimdDIBool m)
+{
+    return {
+        vand_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_))
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+selectByNotMask(SimdDInt32 a, SimdDIBool m)
+{
+    return {
+        vbic_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_))
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
+{
+    return {
+        vbsl_s32(sel.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+cvtR2I(SimdDouble a)
+{
+    return {
+        vmovn_s64(vcvtnq_s64_f64(a.simdInternal_))
+    };
+}
+
+static inline SimdDInt32 gmx_simdcall
+cvttR2I(SimdDouble a)
+{
+    return {
+        vmovn_s64(vcvtq_s64_f64(a.simdInternal_))
+    };
+}
+
+static inline SimdDouble gmx_simdcall
+cvtI2R(SimdDInt32 a)
+{
+    return {
+        vcvtq_f64_s64(vmovl_s32(a.simdInternal_))
+    };
+}
+
+static inline SimdDIBool gmx_simdcall
+cvtB2IB(SimdDBool a)
+{
+    return {
+        vqmovn_u64(a.simdInternal_)
+    };
+}
+
+static inline SimdDBool gmx_simdcall
+cvtIB2B(SimdDIBool a)
+{
+    return {
+        vorrq_u64(vmovl_u32(a.simdInternal_), vshlq_n_u64(vmovl_u32(a.simdInternal_), 32))
+    };
+}
+
+static inline void gmx_simdcall
+cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
+{
+    d0->simdInternal_ = vcvt_f64_f32(vget_low_f32(f.simdInternal_));
+    d1->simdInternal_ = vcvt_high_f64_f32(f.simdInternal_);
+}
+
+static inline SimdFloat gmx_simdcall
+cvtDD2F(SimdDouble d0, SimdDouble d1)
+{
+    return {
+        vcvt_high_f32_f64(vcvt_f32_f64(d0.simdInternal_), d1.simdInternal_)
+    };
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h
index ee8c3f1a7b..340f9298d9 100644
--- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h
@@ -36,70 +36,92 @@
 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H
 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H
 
-#include <math.h>
+#include "config.h"
 
 #include <arm_neon.h>
 
-#include "impl_arm_neon_asimd_common.h"
-
-/* NEON ASIMD always has FMA support, so make sure we use that for single too. */
-#undef  simdFmaddF
-#define simdFmaddF(a, b, c)  vfmaq_f32(c, b, a)
-#undef  simdFmsubF
-#define simdFmsubF(a, b, c)  vnegq_f32(vfmsq_f32(c, b, a))
-#undef  simdFnmaddF
-#define simdFnmaddF(a, b, c) vfmsq_f32(c, b, a)
-#undef  simdFnmsubF
-#define simdFnmsubF(a, b, c) vnegq_f32(vfmaq_f32(c, b, a))
-
-/* The rounding instructions were actually added already in ARMv8, but most
- * compilers did not add intrinsics for them. Make sure we use them for single
- * precision too when enabling NEON Advanced SIMD.
- */
-#undef  simdRoundF
-#define simdRoundF(x)        vrndnq_f32(x)
-#undef  simdTruncF
-#define simdTruncF(x)        vrndq_f32(x)
-
-/* NEON Advanced SIMD has a real rounding conversion instruction */
-#undef  simdCvtF2I
-#define simdCvtF2I(x)        vcvtnq_s32_f32(x)
-
-/* Since we redefine rounding/conversion-with-rounding, make
- * sure we use the new operations by redefining the routine
- * to set the exponent too.
- */
-#undef  simdSetExponentF
-#define simdSetExponentF    simdSetExponentF_arm_neon_asimd
-
-/* We can do more efficient reduce with vector pairwise arithmetic */
-#undef  simdReduceF
-#define simdReduceF(a)       simdReduceF_arm_neon_asimd(a)
-
-/* Pick the largest unsigned integer as a shortcut for any-true */
-#undef  simdAnyTrueFB
-#define simdAnyTrueFB(x)     (vmaxvq_u32(x) != 0)
-#undef  simdAnyTrueFIB
-#define simdAnyTrueFIB(x)    (vmaxvq_u32(x) != 0)
-
-/****************************************************
- * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
- ****************************************************/
-static inline SimdFloat
-simdSetExponentF_arm_neon_asimd(SimdFloat x)
+#include "gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h"
+
+namespace gmx
+{
+
+static inline SimdFloat gmx_simdcall
+fma(SimdFloat a, SimdFloat b, SimdFloat c)
 {
-    int32x4_t  iexp = vcvtnq_s32_f32(x);
+    return {
+               vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
+}
 
-    iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23);
-    return vreinterpretq_f32_s32(iexp);
+static inline SimdFloat gmx_simdcall
+fms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+               vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+    };
 }
 
-static inline float
-simdReduceF_arm_neon_asimd(SimdFloat a)
+static inline SimdFloat gmx_simdcall
+fnma(SimdFloat a, SimdFloat b, SimdFloat c)
 {
-    a = vpaddq_f32(a, a);
-    a = vpaddq_f32(a, a);
-    return vgetq_lane_f32(a, 0);
+    return {
+               vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)
+    };
 }
 
-#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H */
+static inline SimdFloat gmx_simdcall
+fnms(SimdFloat a, SimdFloat b, SimdFloat c)
+{
+    return {
+               vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_))
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+round(SimdFloat x)
+{
+    return {
+               vrndnq_f32(x.simdInternal_)
+    };
+}
+
+static inline SimdFloat gmx_simdcall
+trunc(SimdFloat x)
+{
+    return {
+               vrndq_f32(x.simdInternal_)
+    };
+}
+
+static inline SimdFInt32 gmx_simdcall
+cvtR2I(SimdFloat a)
+{
+    return {
+               vcvtnq_s32_f32(a.simdInternal_)
+    };
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdFBool a)
+{
+    return (vmaxvq_u32(a.simdInternal_) != 0);
+}
+
+static inline bool gmx_simdcall
+anyTrue(SimdFIBool a)
+{
+    return (vmaxvq_u32(a.simdInternal_) != 0);
+}
+
+static inline float gmx_simdcall
+reduce(SimdFloat a)
+{
+    float32x4_t b = a.simdInternal_;
+    b = vpaddq_f32(b, b);
+    b = vpaddq_f32(b, b);
+    return vgetq_lane_f32(b, 0);
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h
new file mode 100644
index 0000000000..45abcbbe16
--- /dev/null
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h
@@ -0,0 +1,306 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
+#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
+
+#include "config.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include <arm_neon.h>
+
+#include "gromacs/utility/basedefinitions.h"
+
+#include "impl_arm_neon_asimd_simd_double.h"
+
+namespace gmx
+{
+
+template <int align>
+static gmx_inline void gmx_simdcall
+gatherLoadTranspose(const double *        base,
+                    const std::int32_t    offset[],
+                    SimdDouble *          v0,
+                    SimdDouble *          v1,
+                    SimdDouble *          v2,
+                    SimdDouble *          v3)
+{
+    float64x2_t t1, t2, t3, t4;
+
+    assert(std::size_t(offset) % 8 == 0);
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 2 == 0);
+
+    t1                = vld1q_f64(base + align * offset[0]);
+    t2                = vld1q_f64(base + align * offset[1]);
+    t3                = vld1q_f64(base + align * offset[0] + 2);
+    t4                = vld1q_f64(base + align * offset[1] + 2);
+    v0->simdInternal_ = vuzp1q_f64(t1, t2);
+    v1->simdInternal_ = vuzp2q_f64(t1, t2);
+    v2->simdInternal_ = vuzp1q_f64(t3, t4);
+    v3->simdInternal_ = vuzp2q_f64(t3, t4);
+}
+
+template <int align>
+static gmx_inline void gmx_simdcall
+gatherLoadTranspose(const double *        base,
+                    const std::int32_t    offset[],
+                    SimdDouble *          v0,
+                    SimdDouble *          v1)
+{
+    float64x2_t t1, t2;
+
+    assert(std::size_t(offset) % 8 == 0);
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 2 == 0);
+
+    t1                = vld1q_f64(base + align * offset[0]);
+    t2                = vld1q_f64(base + align * offset[1]);
+    v0->simdInternal_ = vuzp1q_f64(t1, t2);
+    v1->simdInternal_ = vuzp2q_f64(t1, t2);
+}
+
+static const int c_simdBestPairAlignmentDouble = 2;
+
+template <int align>
+static gmx_inline void gmx_simdcall
+gatherLoadUTranspose(const double *        base,
+                     const std::int32_t    offset[],
+                     SimdDouble *          v0,
+                     SimdDouble *          v1,
+                     SimdDouble *          v2)
+{
+    float64x2_t t1, t2;
+    float64x1_t t3, t4;
+
+    assert(std::size_t(offset) % 8 == 0);
+
+    t1                = vld1q_f64(base + align * offset[0]);
+    t2                = vld1q_f64(base + align * offset[1]);
+    t3                = vld1_f64(base + align * offset[0] + 2);
+    t4                = vld1_f64(base + align * offset[1] + 2);
+    v0->simdInternal_ = vuzp1q_f64(t1, t2);
+    v1->simdInternal_ = vuzp2q_f64(t1, t2);
+    v2->simdInternal_ = vcombine_f64(t3, t4);
+}
+
+template <int align>
+static gmx_inline void gmx_simdcall
+transposeScatterStoreU(double *             base,
+                       const std::int32_t   offset[],
+                       SimdDouble           v0,
+                       SimdDouble           v1,
+                       SimdDouble           v2)
+{
+    float64x2_t t0, t1;
+
+    assert(std::size_t(offset) % 8 == 0);
+
+    t0  = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
+    t1  = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
+    vst1q_f64(base + align * offset[0], t0);
+    vst1q_f64(base + align * offset[1], t1);
+    vst1_f64(base + align * offset[0] + 2, vget_low_f64(v2.simdInternal_));
+    vst1_f64(base + align * offset[1] + 2, vget_high_f64(v2.simdInternal_));
+}
+
+template <int align>
+static gmx_inline void gmx_simdcall
+transposeScatterIncrU(double *             base,
+                      const std::int32_t   offset[],
+                      SimdDouble           v0,
+                      SimdDouble           v1,
+                      SimdDouble           v2)
+{
+    float64x2_t t0, t1, t2;
+    float64x1_t t3;
+
+    assert(std::size_t(offset) % 8 == 0);
+
+    t0  = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
+    t1  = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1
+
+    t2 = vld1q_f64(base + align * offset[0]);
+    t2 = vaddq_f64(t2, t0);
+    vst1q_f64(base + align * offset[0], t2);
+
+    t3 = vld1_f64(base + align * offset[0] + 2);
+    t3 = vadd_f64(t3, vget_low_f64(v2.simdInternal_));
+    vst1_f64(base + align * offset[0] + 2, t3);
+
+    t2 = vld1q_f64(base + align * offset[1]);
+    t2 = vaddq_f64(t2, t1);
+    vst1q_f64(base + align * offset[1], t2);
+
+    t3 = vld1_f64(base + align * offset[1] + 2);
+    t3 = vadd_f64(t3, vget_high_f64(v2.simdInternal_));
+    vst1_f64(base + align * offset[1] + 2, t3);
+}
+
+template <int align>
+static gmx_inline void gmx_simdcall
+transposeScatterDecrU(double *             base,
+                      const std::int32_t   offset[],
+                      SimdDouble           v0,
+                      SimdDouble           v1,
+                      SimdDouble           v2)
+{
+    float64x2_t t0, t1, t2;
+    float64x1_t t3;
+
+    assert(std::size_t(offset) % 8 == 0);
+
+    t0  = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0
+    t1  = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1
+
+    t2 = vld1q_f64(base + align * offset[0]);
+    t2 = vsubq_f64(t2, t0);
+    vst1q_f64(base + align * offset[0], t2);
+
+    t3 = vld1_f64(base + align * offset[0] + 2);
+    t3 = vsub_f64(t3, vget_low_f64(v2.simdInternal_));
+    vst1_f64(base + align * offset[0] + 2, t3);
+
+    t2 = vld1q_f64(base + align * offset[1]);
+    t2 = vsubq_f64(t2, t1);
+    vst1q_f64(base + align * offset[1], t2);
+
+    t3 = vld1_f64(base + align * offset[1] + 2);
+    t3 = vsub_f64(t3, vget_high_f64(v2.simdInternal_));
+    vst1_f64(base + align * offset[1] + 2, t3);
+}
+
+static gmx_inline void gmx_simdcall
+expandScalarsToTriplets(SimdDouble    scalar,
+                        SimdDouble *  triplets0,
+                        SimdDouble *  triplets1,
+                        SimdDouble *  triplets2)
+{
+    triplets0->simdInternal_ = vuzp1q_f64(scalar.simdInternal_, scalar.simdInternal_);
+    triplets1->simdInternal_ = scalar.simdInternal_;
+    triplets2->simdInternal_ = vuzp2q_f64(scalar.simdInternal_, scalar.simdInternal_);
+}
+
+template <int align>
+static gmx_inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const double *  base,
+                             SimdDInt32      offset,
+                             SimdDouble *    v0,
+                             SimdDouble *    v1,
+                             SimdDouble *    v2,
+                             SimdDouble *    v3)
+{
+    GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH)  ioffset[GMX_SIMD_DINT32_WIDTH];
+
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 2 == 0);
+
+    vst1_s32(ioffset, offset.simdInternal_);
+    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
+}
+
+
+template <int align>
+static gmx_inline void gmx_simdcall
+gatherLoadBySimdIntTranspose(const double *  base,
+                             SimdDInt32      offset,
+                             SimdDouble *    v0,
+                             SimdDouble *    v1)
+{
+    GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH)  ioffset[GMX_SIMD_DINT32_WIDTH];
+
+    assert(std::size_t(base) % 16 == 0);
+    assert(align % 2 == 0);
+
+    vst1_s32(ioffset, offset.simdInternal_);
+    gatherLoadTranspose<align>(base, ioffset, v0, v1);
+}
+
+template <int align>
+static gmx_inline void gmx_simdcall
+gatherLoadUBySimdIntTranspose(const double *  base,
+                              SimdDInt32      offset,
+                              SimdDouble *    v0,
+                              SimdDouble *    v1)
+{
+    GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH)  ioffset[GMX_SIMD_DINT32_WIDTH];
+
+    vst1_s32(ioffset, offset.simdInternal_);
+
+    float64x2_t t1, t2;
+
+    t1                = vld1q_f64(base + align * ioffset[0]);
+    t2                = vld1q_f64(base + align * ioffset[1]);
+    v0->simdInternal_ = vuzp1q_f64(t1, t2);
+    v1->simdInternal_ = vuzp2q_f64(t1, t2);
+}
+
+
+static gmx_inline double gmx_simdcall
+reduceIncr4ReturnSum(double *    m,
+                     SimdDouble  v0,
+                     SimdDouble  v1,
+                     SimdDouble  v2,
+                     SimdDouble  v3)
+{
+    float64x2_t t1, t2, t3, t4;
+
+    assert(std::size_t(m) % 8 == 0);
+
+    t1 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_);
+    t2 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_);
+    t3 = vuzp1q_f64(v2.simdInternal_, v3.simdInternal_);
+    t4 = vuzp2q_f64(v2.simdInternal_, v3.simdInternal_);
+
+    t1 = vaddq_f64(t1, t2);
+    t3 = vaddq_f64(t3, t4);
+
+    t2 = vaddq_f64(t1, vld1q_f64(m));
+    t4 = vaddq_f64(t3, vld1q_f64(m + 2));
+    vst1q_f64(m, t2);
+    vst1q_f64(m + 2, t4);
+
+    t1 = vaddq_f64(t1, t3);
+    t2 = vpaddq_f64(t1, t1);
+
+    return vgetq_lane_f64(t2, 0);
+}
+
+}      // namespace gmx
+
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h
similarity index 55%
rename from src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h
rename to src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h
index 536b67c97a..177cc7ddda 100644
--- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h
@@ -33,36 +33,9 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-#ifndef GMX_SIMD_IMPL_ARM_NEON_COMMON_H
-#define GMX_SIMD_IMPL_ARM_NEON_COMMON_H
+#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_FLOAT_H
+#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_FLOAT_H
 
-/* Capability definitions for ARM 32-bit NEON */
-#define GMX_SIMD                             1
-#define GMX_SIMD_HAVE_FLOAT                  1
-#define GMX_SIMD_HAVE_DOUBLE                 0
-#define GMX_SIMD_HAVE_LOADU                  1
-#define GMX_SIMD_HAVE_STOREU                 1
-#define GMX_SIMD_HAVE_LOGICAL                1
-#define GMX_SIMD_HAVE_FMA                    1
-#define GMX_SIMD_HAVE_FRACTION               0
-#define GMX_SIMD_HAVE_FINT32                 1
-#define GMX_SIMD_HAVE_FINT32_EXTRACT         1
-#define GMX_SIMD_HAVE_FINT32_LOGICAL         1
-#define GMX_SIMD_HAVE_FINT32_ARITHMETICS     1
-#define GMX_SIMD_HAVE_DINT32                 0
-#define GMX_SIMD_HAVE_DINT32_EXTRACT         0
-#define GMX_SIMD_HAVE_DINT32_LOGICAL         0
-#define GMX_SIMD_HAVE_DINT32_ARITHMETICS     0
-#define GMX_SIMD4_HAVE_FLOAT                 1
-#define GMX_SIMD4_HAVE_DOUBLE                0
+#include "gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h"
 
-/* Implementation details */
-#define GMX_SIMD_FLOAT_WIDTH                 4
-#undef  GMX_SIMD_DOUBLE_WIDTH
-#define GMX_SIMD_FINT32_WIDTH                4
-#undef  GMX_SIMD_DINT32_WIDTH
-#define GMX_SIMD4_WIDTH                      4
-#define GMX_SIMD_RSQRT_BITS                  8
-#define GMX_SIMD_RCP_BITS                    8
-
-#endif /* GMX_SIMD_IMPL_ARM_NEON_COMMON_H */
+#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_FLOAT_H
diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h
index 0d969933bd..e345aff722 100644
--- a/src/gromacs/simd/simd.h
+++ b/src/gromacs/simd/simd.h
@@ -108,6 +108,10 @@
 #    include "impl_x86_sse2/impl_x86_sse2.h"
 #elif GMX_SIMD_IBM_QPX
 #    include "impl_ibm_qpx/impl_ibm_qpx.h"
+#elif GMX_SIMD_ARM_NEON_ASIMD
+#    include "impl_arm_neon_asimd/impl_arm_neon_asimd.h"
+#elif GMX_SIMD_ARM_NEON
+#    include "impl_arm_neon/impl_arm_neon.h"
 #elif (GMX_SIMD_REFERENCE || defined DOXYGEN)
 #    include "impl_reference/impl_reference.h" // Includes doxygen documentation
 #else
-- 
2.11.4.GIT