From 3a895ab6d5c02e5f28bcdeed7ede260bea62b677 Mon Sep 17 00:00:00 2001 From: Erik Lindahl Date: Mon, 6 Jul 2015 22:32:41 +0200 Subject: [PATCH] Extended SIMD, impl for Arm Neon and 64-bit Neon Asimd Tested with gcc-4.9 and 5.3, using Neon on Jetson TK1 and TX1 (both in 32-bit mode) and Neon asimd on APM X-Gene (64-bit mode). Change-Id: I4b9f0da49b1dda3b199eeec8e45688d49a43783e --- cmake/gmxDetectSimd.cmake | 4 +- cmake/gmxManageSimd.cmake | 24 +- docs/doxygen/suppressions.txt | 2 - src/gromacs/hardware/cpuinfo.cpp | 8 +- src/gromacs/simd/impl_arm_neon/impl_arm_neon.h | 8 +- .../simd/impl_arm_neon/impl_arm_neon_definitions.h | 81 ++ .../impl_arm_neon_general.h} | 35 +- .../simd/impl_arm_neon/impl_arm_neon_simd4_float.h | 434 +++++++-- .../simd/impl_arm_neon/impl_arm_neon_simd_float.h | 982 +++++++++++++++++---- .../simd/impl_arm_neon/impl_arm_neon_util_float.h | 361 ++++++++ .../simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h | 8 +- .../impl_arm_neon_asimd_definitions.h | 83 ++ .../impl_arm_neon_asimd_general.h} | 9 +- .../impl_arm_neon_asimd_simd4_float.h | 124 +++ .../impl_arm_neon_asimd_simd_double.h | 911 +++++++++++++++---- .../impl_arm_neon_asimd_simd_float.h | 136 +-- .../impl_arm_neon_asimd_util_double.h | 306 +++++++ .../impl_arm_neon_asimd_util_float.h} | 35 +- src/gromacs/simd/simd.h | 4 + 19 files changed, 2990 insertions(+), 565 deletions(-) create mode 100644 src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h rename src/gromacs/simd/{impl_arm_neon_asimd/impl_arm_neon_asimd_common.h => impl_arm_neon/impl_arm_neon_general.h} (64%) rewrite src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h (76%) create mode 100644 src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h create mode 100644 src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h copy src/gromacs/simd/{impl_arm_neon/impl_arm_neon.h => impl_arm_neon_asimd/impl_arm_neon_asimd_general.h} (89%) create mode 100644 src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h rewrite src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h (74%) create mode 100644 src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h rename src/gromacs/simd/{impl_arm_neon/impl_arm_neon_common.h => impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h} (55%) diff --git a/cmake/gmxDetectSimd.cmake b/cmake/gmxDetectSimd.cmake index f9e6a29c27..eb4c8679eb 100644 --- a/cmake/gmxDetectSimd.cmake +++ b/cmake/gmxDetectSimd.cmake @@ -118,10 +118,10 @@ function(gmx_suggest_simd _suggested_simd) set(OUTPUT_SIMD "IBM_VMX") elseif(OUTPUT_TMP MATCHES " qpx ") set(OUTPUT_SIMD "IBM_QPX") - elseif(OUTPUT_TMP MATCHES " neon ") - set(OUTPUT_SIMD "ARM_NEON") elseif(OUTPUT_TMP MATCHES " neon_asimd ") set(OUTPUT_SIMD "ARM_NEON_ASIMD") + elseif(OUTPUT_TMP MATCHES " neon ") + set(OUTPUT_SIMD "ARM_NEON") endif() endif() diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake index c9c39e9999..76e337970f 100644 --- a/cmake/gmxManageSimd.cmake +++ b/cmake/gmxManageSimd.cmake @@ -328,45 +328,39 @@ elseif(GMX_SIMD STREQUAL "AVX_512ER") elseif(GMX_SIMD STREQUAL "ARM_NEON") - gmx_find_cflag_for_source(CFLAGS_ARM_NEON "C compiler 32-bit ARM NEON flag" + gmx_find_cflag_for_source(CFLAGS_ARM_NEON "C compiler ARM NEON flag" "#include int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}" SIMD_C_FLAGS - "-mfpu=neon" "") - gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON "C++ compiler 32-bit ARM NEON flag" + "-mfpu=neon-vfpv4" "-mfpu=neon" "") + gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON "C++ compiler ARM NEON flag" "#include int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}" SIMD_CXX_FLAGS - "-mfpu=neon" "-D__STDC_CONSTANT_MACROS" "") + "-mfpu=neon-vfpv4" "-mfpu=neon" "-D__STDC_CONSTANT_MACROS" "") if(NOT CFLAGS_ARM_NEON OR NOT CXXFLAGS_ARM_NEON) - message(FATAL_ERROR "Cannot find ARM 32-bit NEON compiler flag. Use a newer compiler, or disable NEON SIMD.") + message(FATAL_ERROR "Cannot find ARM NEON compiler flag. Use a newer compiler, or disable NEON SIMD.") endif() set(GMX_SIMD_ARM_NEON 1) set(SIMD_STATUS_MESSAGE "Enabling 32-bit ARM NEON SIMD instructions") elseif(GMX_SIMD STREQUAL "ARM_NEON_ASIMD") - # Gcc-4.8.1 appears to have a bug where the c++ compiler requires - # -D__STDC_CONSTANT_MACROS if we include arm_neon.h gmx_find_cflag_for_source(CFLAGS_ARM_NEON_ASIMD "C compiler ARM NEON Advanced SIMD flag" "#include - int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);return vgetq_lane_f64(x,0)>0;}" + int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);x=vrndnq_f64(x);return vgetq_lane_f64(x,0)>0;}" SIMD_C_FLAGS "") gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON_ASIMD "C++ compiler ARM NEON Advanced SIMD flag" "#include - int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);return vgetq_lane_f64(x,0)>0;}" + int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);x=vrndnq_f64(x);return vgetq_lane_f64(x,0)>0;}" SIMD_CXX_FLAGS - "-D__STDC_CONSTANT_MACROS" "") + "") if(NOT CFLAGS_ARM_NEON_ASIMD OR NOT CXXFLAGS_ARM_NEON_ASIMD) - message(FATAL_ERROR "Cannot find ARM (AArch64) NEON Advanced SIMD compiler flag. Use a newer compiler, or disable SIMD.") - endif() - - if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "4.9") - message(WARNING "At least gcc-4.8.1 has many bugs for ARM (AArch64) NEON Advanced SIMD compilation. You might need gcc version 4.9 or later.") + message(FATAL_ERROR "Compiler does not fully support ARM (AArch64) NEON Advanced SIMD. Use a newer compiler (gcc version 4.9 or later), or disable SIMD.") endif() if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.4") diff --git a/docs/doxygen/suppressions.txt b/docs/doxygen/suppressions.txt index 3b00cd0e5e..782db00290 100644 --- a/docs/doxygen/suppressions.txt +++ b/docs/doxygen/suppressions.txt @@ -46,8 +46,6 @@ src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.cpp: warning: inc src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h: warning: should include "nbnxn_simd.h" # Temporary while we change the SIMD implementation -src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h: warning: should include "simd.h" -src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h: warning: should include "simd.h" src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx_common.h: warning: should include "simd.h" src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_common.h: warning: should include "simd.h" src/gromacs/simd/impl_intel_mic/impl_intel_mic_common.h: warning: should include "simd.h" diff --git a/src/gromacs/hardware/cpuinfo.cpp b/src/gromacs/hardware/cpuinfo.cpp index 29fc25520e..afe09bbdf1 100644 --- a/src/gromacs/hardware/cpuinfo.cpp +++ b/src/gromacs/hardware/cpuinfo.cpp @@ -799,7 +799,13 @@ detectProcCpuInfoArm(const std::map &cpuInfo, } if (s.find("asimd") != std::string::npos) { - features->insert(CpuInfo::Feature::Arm_NeonAsimd); + // At least Jetson TX1 runs a 32-bit environment by default, although + // the kernel is 64-bits, and reports asimd feature flags. We cannot + // use Neon-asimd in this case, so make sure we are on a 64-bit platform. + if (sizeof(void *) == 8) + { + features->insert(CpuInfo::Feature::Arm_NeonAsimd); + } } } } diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h index 5357dfc978..2c4718e200 100644 --- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h +++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h @@ -36,7 +36,13 @@ #ifndef GMX_SIMD_IMPL_ARM_NEON_H #define GMX_SIMD_IMPL_ARM_NEON_H +#include "impl_arm_neon_definitions.h" +#include "impl_arm_neon_general.h" +// Arm/Neon cannot do double precision SIMD4 #include "impl_arm_neon_simd4_float.h" +// Arm/Neon cannot do double precision SIMD #include "impl_arm_neon_simd_float.h" +// Arm/Neon cannot do double precision SIMD utilities +#include "impl_arm_neon_util_float.h" -#endif /* GMX_SIMD_IMPL_ARM_NEON_H */ +#endif // GMX_SIMD_IMPL_ARM_NEON_H diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h new file mode 100644 index 0000000000..adae1cc7da --- /dev/null +++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_definitions.h @@ -0,0 +1,81 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014,2015, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_SIMD_IMPL_ARM_NEON_DEFINITIONS_H +#define GMX_SIMD_IMPL_ARM_NEON_DEFINITIONS_H + +#define GMX_SIMD 1 +#define GMX_SIMD_HAVE_FLOAT 1 +#define GMX_SIMD_HAVE_DOUBLE 0 +#define GMX_SIMD_HAVE_LOADU 1 +#define GMX_SIMD_HAVE_STOREU 1 +#define GMX_SIMD_HAVE_LOGICAL 1 +#define GMX_SIMD_HAVE_FMA 1 +#define GMX_SIMD_HAVE_FINT32_EXTRACT 1 +#define GMX_SIMD_HAVE_FINT32_LOGICAL 1 +#define GMX_SIMD_HAVE_FINT32_ARITHMETICS 1 +#define GMX_SIMD_HAVE_DINT32_EXTRACT 0 +#define GMX_SIMD_HAVE_DINT32_LOGICAL 0 +#define GMX_SIMD_HAVE_DINT32_ARITHMETICS 0 +#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT 1 +#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT 1 +#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0 +#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1 +#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 0 +#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4 +#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 + +#define GMX_SIMD4_HAVE_FLOAT 1 +#define GMX_SIMD4_HAVE_DOUBLE 0 + +// Implementation details +#define GMX_SIMD_FLOAT_WIDTH 4 +#undef GMX_SIMD_DOUBLE_WIDTH +#define GMX_SIMD_FINT32_WIDTH 4 +#undef GMX_SIMD_DINT32_WIDTH +#define GMX_SIMD4_WIDTH 4 +#define GMX_SIMD_RSQRT_BITS 8 +#define GMX_SIMD_RCP_BITS 8 + +#endif // GMX_SIMD_IMPL_ARM_NEON_DEFINITIONS_H diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_general.h similarity index 64% rename from src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h rename to src/gromacs/simd/impl_arm_neon/impl_arm_neon_general.h index 580016e0a9..696a53d902 100644 --- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_common.h +++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_general.h @@ -32,29 +32,20 @@ * To help us fund GROMACS development, we humbly ask that you cite * the research papers on the package. Check out http://www.gromacs.org. */ +#ifndef GMX_SIMD_IMPL_ARM_NEON_GENERAL_H +#define GMX_SIMD_IMPL_ARM_NEON_GENERAL_H -#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_COMMON_H -#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_COMMON_H +namespace gmx +{ -/* ARM (AArch64) NEON Advanced SIMD */ +static inline void +simdPrefetch(void * m) +{ +#ifdef __GNUC__ + __builtin_prefetch(m); +#endif +} -/* Inherit single-precision and integer part from 32-bit arm */ -#include "gromacs/simd/impl_arm_neon/impl_arm_neon.h" +} // namespace gmx -/* Override some capability definitions from ARM 32-bit NEON - we now have double */ -#undef GMX_SIMD_HAVE_DOUBLE -#define GMX_SIMD_HAVE_DOUBLE 1 -#undef GMX_SIMD_HAVE_DINT32 -#define GMX_SIMD_HAVE_DINT32 1 -#undef GMX_SIMD_HAVE_DINT32_EXTRACT -#define GMX_SIMD_HAVE_DINT32_EXTRACT 1 -#undef GMX_SIMD_HAVE_DINT32_LOGICAL -#define GMX_SIMD_HAVE_DINT32_LOGICAL 1 -#undef GMX_SIMD_HAVE_DINT32_ARITHMETICS -#define GMX_SIMD_HAVE_DINT32_ARITHMETICS 1 - -/* Implementation details */ -#define GMX_SIMD_DOUBLE_WIDTH 2 -#define GMX_SIMD_DINT32_WIDTH 2 - -#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_COMMON_H */ +#endif // GMX_SIMD_IMPL_ARM_NEON_GENERAL_H diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h index f2feb6d6ea..c05635ee66 100644 --- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h +++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h @@ -32,66 +32,392 @@ * To help us fund GROMACS development, we humbly ask that you cite * the research papers on the package. Check out http://www.gromacs.org. */ - #ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H #define GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H -#include +#include "config.h" + +#include +#include #include -#include "impl_arm_neon_common.h" -#include "impl_arm_neon_simd_float.h" - -/* ARM 32-bit Neon is already 4-wide in single, so just reuse float type for SIMD4 */ -#define Simd4Float SimdFloat -#define simd4LoadF simdLoadF -#define simd4Load1F simdLoad1F -#define simd4Set1F simdSet1F -#define simd4StoreF simdStoreF -#define simd4LoadUF simdLoadUF -#define simd4StoreUF simdStoreUF -#define simd4SetZeroF simdSetZeroF -#define simd4AddF simdAddF -#define simd4SubF simdSubF -#define simd4MulF simdMulF -#define simd4FmaddF simdFmaddF -#define simd4FmsubF simdFmsubF -#define simd4FnmaddF simdFnmaddF -#define simd4FnmsubF simdFnmsubF -#define simd4AndF simdAndF -#define simd4AndNotF simdAndNotF -#define simd4OrF simdOrF -#define simd4XorF simdXorF -#define simd4RsqrtF simdRsqrtF -#define simd4AbsF simdAbsF -#define simd4NegF simdNegF -#define simd4MaxF simdMaxF -#define simd4MinF simdMinF -#define simd4RoundF simdRoundF -#define simd4TruncF simdTruncF -#define simd4DotProductF simd4DotProductF_arm_neon -#define Simd4FBool SimdFBool -#define simd4CmpEqF simdCmpEqF -#define simd4CmpLtF simdCmpLtF -#define simd4CmpLeF simdCmpLeF -#define simd4AndFB simdAndFB -#define simd4OrFB simdOrFB -#define simd4AnyTrueFB simdAnyTrueFB -#define simd4MaskF simdMaskF -#define simd4MaskNotF simdMaskNotF -#define simd4BlendF simdBlendF -#define simd4ReduceF simdReduceF - -/* SIMD4 Dot product helper function */ -static inline float -simd4DotProductF_arm_neon(SimdFloat a, SimdFloat b) -{ - SimdFloat c; - c = simdMulF(a, b); +namespace gmx +{ + +class Simd4Float +{ + public: + Simd4Float() {} + + Simd4Float(float f) : simdInternal_(vdupq_n_f32(f)) {} + + // Internal utility constructor to simplify return statements + Simd4Float(float32x4_t simd) : simdInternal_(simd) {} + + float32x4_t simdInternal_; +}; + +class Simd4FBool +{ + public: + Simd4FBool() {} + + //! \brief Construct from scalar bool + Simd4FBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {} + + // Internal utility constructor to simplify return statements + Simd4FBool(uint32x4_t simd) : simdInternal_(simd) {} + + uint32x4_t simdInternal_; +}; + +static inline Simd4Float gmx_simdcall +load4(const float *m) +{ + assert(size_t(m) % 16 == 0); + return { + vld1q_f32(m) + }; +} + +static inline void gmx_simdcall +store4(float *m, Simd4Float a) +{ + assert(size_t(m) % 16 == 0); + vst1q_f32(m, a.simdInternal_); +} + +static inline Simd4Float gmx_simdcall +load4U(const float *m) +{ + return { + vld1q_f32(m) + }; +} + +static inline void gmx_simdcall +store4U(float *m, Simd4Float a) +{ + vst1q_f32(m, a.simdInternal_); +} + +static inline Simd4Float gmx_simdcall +simd4SetZeroF() +{ + return { + vdupq_n_f32(0.0f) + }; +} + +static inline Simd4Float gmx_simdcall +operator&(Simd4Float a, Simd4Float b) +{ + return { + vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a.simdInternal_), + vreinterpretq_s32_f32(b.simdInternal_))) + }; +} + +static inline Simd4Float gmx_simdcall +andNot(Simd4Float a, Simd4Float b) +{ + return { + vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b.simdInternal_), + vreinterpretq_s32_f32(a.simdInternal_))) + }; +} + +static inline Simd4Float gmx_simdcall +operator|(Simd4Float a, Simd4Float b) +{ + return { + vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a.simdInternal_), + vreinterpretq_s32_f32(b.simdInternal_))) + }; +} + +static inline Simd4Float gmx_simdcall +operator^(Simd4Float a, Simd4Float b) +{ + return { + vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a.simdInternal_), + vreinterpretq_s32_f32(b.simdInternal_))) + }; +} + +static inline Simd4Float gmx_simdcall +operator+(Simd4Float a, Simd4Float b) +{ + return { + vaddq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +operator-(Simd4Float a, Simd4Float b) +{ + return { + vsubq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +operator-(Simd4Float x) +{ + return { + vnegq_f32(x.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +operator*(Simd4Float a, Simd4Float b) +{ + return { + vmulq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline Simd4Float gmx_simdcall +fma(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#else + vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#endif + }; +} + +static inline Simd4Float gmx_simdcall +fms(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#else + vnegq_f32(vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#endif + }; +} + +static inline Simd4Float gmx_simdcall +fnma(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#else + vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#endif + }; +} + +static inline Simd4Float gmx_simdcall +fnms(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#else + vnegq_f32(vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#endif + }; +} +#endif + +static inline Simd4Float gmx_simdcall +rsqrt(Simd4Float x) +{ + return { + vrsqrteq_f32(x.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +abs(Simd4Float x) +{ + return { + vabsq_f32( x.simdInternal_ ) + }; +} + +static inline Simd4Float gmx_simdcall +max(Simd4Float a, Simd4Float b) +{ + return { + vmaxq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +min(Simd4Float a, Simd4Float b) +{ + return { + vminq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline Simd4Float gmx_simdcall +round(Simd4Float x) +{ + // Convert x to nearest integer + float32x4_t signBitOfX = vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(x.simdInternal_))); + float32x4_t half = vdupq_n_f32(0.5f); + float32x4_t corr = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half), vreinterpretq_u32_f32(signBitOfX))); + + int32x4_t integerX = vcvtq_s32_f32(vaddq_f32(x.simdInternal_, corr)); + + // Convert back to float + + return { + vcvtq_f32_s32(integerX) + }; +} + +static inline Simd4Float gmx_simdcall +trunc(Simd4Float x) +{ + return { + vcvtq_f32_s32( vcvtq_s32_f32(x.simdInternal_) ) + }; +} +#endif + +static inline void gmx_simdcall +transpose(Simd4Float * v0, Simd4Float * v1, + Simd4Float * v2, Simd4Float * v3) +{ + float32x4x2_t t0 = vuzpq_f32(v0->simdInternal_, v2->simdInternal_); + float32x4x2_t t1 = vuzpq_f32(v1->simdInternal_, v3->simdInternal_); + float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]); + float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]); + v0->simdInternal_ = t2.val[0]; + v1->simdInternal_ = t3.val[0]; + v2->simdInternal_ = t2.val[1]; + v3->simdInternal_ = t3.val[1]; +} + +static inline Simd4FBool gmx_simdcall +operator==(Simd4Float a, Simd4Float b) +{ + return { + vceqq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline Simd4FBool gmx_simdcall +operator!=(Simd4Float a, Simd4Float b) +{ + return { + vmvnq_u32(vceqq_f32(a.simdInternal_, b.simdInternal_)) + }; +} + +static inline Simd4FBool gmx_simdcall +operator<(Simd4Float a, Simd4Float b) +{ + return { + vcltq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline Simd4FBool gmx_simdcall +operator<=(Simd4Float a, Simd4Float b) +{ + return { + vcleq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline Simd4FBool gmx_simdcall +operator&&(Simd4FBool a, Simd4FBool b) +{ + return { + vandq_u32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline Simd4FBool gmx_simdcall +operator||(Simd4FBool a, Simd4FBool b) +{ + return { + vorrq_u32(a.simdInternal_, b.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline bool gmx_simdcall +anyTrue(Simd4FBool a) +{ + uint32x4_t x = a.simdInternal_; + uint32x4_t y = vextq_u32(x, x, 2); + + x = vorrq_u32(x, y); + y = vextq_u32(x, x, 1); + x = vorrq_u32(x, y); + return (vgetq_lane_u32(x, 0) != 0); +} +#endif + +static inline Simd4Float gmx_simdcall +selectByMask(Simd4Float a, Simd4FBool m) +{ + return { + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.simdInternal_), + m.simdInternal_)) + }; +} + +static inline Simd4Float gmx_simdcall +selectByNotMask(Simd4Float a, Simd4FBool m) +{ + return { + vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.simdInternal_), + m.simdInternal_)) + }; +} + +static inline Simd4Float gmx_simdcall +blend(Simd4Float a, Simd4Float b, Simd4FBool sel) +{ + return { + vbslq_f32(sel.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline float gmx_simdcall +reduce(Simd4Float a) +{ + float32x4_t x = a.simdInternal_; + float32x4_t y = vextq_f32(x, x, 2); + + x = vaddq_f32(x, y); + y = vextq_f32(x, x, 1); + x = vaddq_f32(x, y); + return vgetq_lane_f32(x, 0); +} + +static inline float gmx_simdcall +dotProduct(Simd4Float a, Simd4Float b) +{ + Simd4Float c; + + c = a * b; /* set 4th element to 0, then add all of them */ - c = vsetq_lane_f32(0.0f, c, 3); - return simdReduceF_arm_neon(c); + c.simdInternal_ = vsetq_lane_f32(0.0f, c.simdInternal_, 3); + return reduce(c); } +#endif + +} // namespace gmx -#endif /* GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H */ +#endif // GMX_SIMD_IMPL_ARM_NEON_SIMD4_FLOAT_H diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h dissimilarity index 76% index 287371e005..a347619a80 100644 --- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h +++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h @@ -1,196 +1,786 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 2014,2015, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ - -#ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H -#define GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H - -#include - -#include - -#include "impl_arm_neon_common.h" - -/**************************************************** - * SINGLE PRECISION SIMD IMPLEMENTATION * - ****************************************************/ -#define SimdFloat float32x4_t -#define simdLoadF vld1q_f32 -#define simdLoad1F vld1q_dup_f32 -#define simdSet1F vdupq_n_f32 -#define simdStoreF vst1q_f32 -#define simdLoadUF vld1q_f32 -#define simdStoreUF vst1q_f32 -#define simdSetZeroF() vdupq_n_f32(0.0f) -#define simdAddF vaddq_f32 -#define simdSubF vsubq_f32 -#define simdMulF vmulq_f32 -#ifdef __ARM_FEATURE_FMA -# define simdFmaddF(a, b, c) vfmaq_f32(c, b, a) -# define simdFmsubF(a, b, c) vnegq_f32(vfmsq_f32(c, b, a)) -# define simdFnmaddF(a, b, c) vfmaq_f32(c, b, a) -# define simdFnmsubF(a, b, c) vnegq_f32(vfmaq_f32(c, b, a)) -#else -# define simdFmaddF(a, b, c) vmlaq_f32(c, b, a) -# define simdFmsubF(a, b, c) vnegq_f32(vmlsq_f32(c, b, a)) -# define simdFnmaddF(a, b, c) vmlsq_f32(c, b, a) -# define simdFnmsubF(a, b, c) vnegq_f32(vmlaq_f32(c, b, a)) -#endif -#define simdAndF(a, b) vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) -#define simdAndNotF(a, b) vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a))) -#define simdOrF(a, b) vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) -#define simdXorF(a, b) vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) -#define simdRsqrtF vrsqrteq_f32 -#define simdRsqrtIterF(lu, x) vmulq_f32(lu, vrsqrtsq_f32(vmulq_f32(lu, lu), x)) -#define simdRcpF vrecpeq_f32 -#define simdRcpIterF(lu, x) vmulq_f32(lu, vrecpsq_f32(lu, x)) -#define simdAbsF(x) vabsq_f32(x) -#define simdNegF(x) vnegq_f32(x) -#define simdMaxF vmaxq_f32 -#define simdMinF vminq_f32 -#define simdRoundF(x) simdCvtI2F(simdCvtF2I(x)) -#define simdTruncF(x) simdCvtI2F(simdCvttF2I(x)) -#define simdFractionF(x) vsubq_f32(x, simdTruncF(x)) -#define simdGetExponentF simdGetExponentF_arm_neon -#define simdGetMantissaF simdGetMantissaF_arm_neon -#define simdSetExponentF simdSetExponentF_arm_neon -/* integer datatype corresponding to float: SimdFInt32 */ -#define SimdFInt32 int32x4_t -#define simdLoadFI(m) vld1q_s32(m) -#define simdSet1FI vdupq_n_s32 -#define simdStoreFI(m, x) vst1q_s32(m, x) -#define simdLoadUFI(m) vld1q_s32(m) -#define simdStoreUFI(m, x) vst1q_s32(m, x) -#define simdSetZeroFI() vdupq_n_s32(0) -#define simdCvttF2I vcvtq_s32_f32 -#define simdCvtF2I(x) vcvtq_s32_f32(simdAddF(simdOrF(simdAndF(vdupq_n_f32(-0.0f), x), vdupq_n_f32(0.5f)), x)) -#define simdCvtI2F vcvtq_f32_s32 -#define simdExtractFI(x, i) vgetq_lane_s32(x, i) -/* Integer logical ops on SimdFInt32 */ -#define simdSlliFI vshlq_n_s32 -#define simdSrliFI vshrq_n_s32 -#define simdAndFI vandq_s32 -#define simdAndNotFI(a, b) vbicq_s32(b, a) -#define simdOrFI vorrq_s32 -#define simdXorFI veorq_s32 -/* Integer arithmetic ops on SimdFInt32 */ -#define simdAddFI vaddq_s32 -#define simdSubFI vsubq_s32 -#define simdMulFI vmulq_s32 -/* Boolean & comparison operations on SimdFloat */ -#define SimdFBool uint32x4_t -#define simdCmpEqF vceqq_f32 -#define simdCmpLtF vcltq_f32 -#define simdCmpLeF vcleq_f32 -#define simdAndFB vandq_u32 -#define simdOrFB vorrq_u32 -#define simdAnyTrueFB simdAnyTrueFB_arm_neon -#define simdMaskF(a, sel) vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), sel)) -#define simdMaskNotF(a, sel) vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), sel)) -#define simdBlendF(a, b, sel) vbslq_f32(sel, b, a) -#define simdReduceF(a) simdReduceF_arm_neon(a) -/* Boolean & comparison operations on SimdFInt32 */ -#define SimdFIBool uint32x4_t -#define simdCmpEqFI vceqq_s32 -#define simdCmpLtFI vcltq_s32 -#define simdAndFIB vandq_u32 -#define simdOrFIB vorrq_u32 -#define simdAnyTrueFIB simdAnyTrueFB -#define simdMaskFI(a, sel) vandq_s32(a, vreinterpretq_s32_u32(sel)) -#define simdMaskNotFI(a, sel) vbicq_s32(a, vreinterpretq_s32_u32(sel)) -#define simdBlendFI(a, b, sel) vbslq_s32(sel, b, a) -/* Conversions between different booleans */ -#define simdCvtFB2FIB(x) (x) -#define simdCvtFIB2FB(x) (x) - -/**************************************************** - * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS * - ****************************************************/ -static inline SimdFloat -simdGetExponentF_arm_neon(SimdFloat x) -{ - const float32x4_t expmask = vreinterpretq_f32_s32( vdupq_n_s32(0x7F800000) ); - int32x4_t iexp; - - iexp = vreinterpretq_s32_f32(simdAndF(x, expmask)); - iexp = vsubq_s32(vshrq_n_s32(iexp, 23), vdupq_n_s32(127)); - return vcvtq_f32_s32(iexp); -} - - -static inline SimdFloat -simdGetMantissaF_arm_neon(SimdFloat x) -{ - const float32x4_t mantmask = vreinterpretq_f32_s32( vdupq_n_s32(0x007FFFFF) ); - const float32x4_t one = vdupq_n_f32(1.0f); - - /* Get mantissa */ - x = simdAndF(mantmask, x); - /* Reset zero (but correctly biased) exponent */ - return simdOrF(x, one); -} - - -static inline SimdFloat -simdSetExponentF_arm_neon(SimdFloat x) -{ - int32x4_t iexp = simdCvtF2I(x); - - iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23); - return vreinterpretq_f32_s32(iexp); -} - -static inline float -simdReduceF_arm_neon(SimdFloat a) -{ - float32x4_t b = vextq_f32(a, a, 2); - - a = vaddq_f32(a, b); - b = vextq_f32(a, a, 1); - a = vaddq_f32(a, b); - return vgetq_lane_f32(a, 0); -} - -static inline int -simdAnyTrueFB_arm_neon(SimdFBool a) -{ - uint32x4_t b = vextq_u32(a, a, 2); - - a = simdOrFB(a, b); - b = vextq_u32(a, a, 1); - a = simdOrFB(a, b); - return (vgetq_lane_u32(a, 0) != 0); -} - -#endif /* GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H */ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014,2015, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H +#define GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H + +#include "config.h" + +#include +#include +#include + +#include + +namespace gmx +{ + +class SimdFloat +{ + public: + SimdFloat() {} + + SimdFloat(float f) : simdInternal_(vdupq_n_f32(f)) {} + + // Internal utility constructor to simplify return statements + SimdFloat(float32x4_t simd) : simdInternal_(simd) {} + + float32x4_t simdInternal_; +}; + +class SimdFInt32 +{ + public: + SimdFInt32() {} + + SimdFInt32(std::int32_t i) : simdInternal_(vdupq_n_s32(i)) {} + + // Internal utility constructor to simplify return statements + SimdFInt32(int32x4_t simd) : simdInternal_(simd) {} + + int32x4_t simdInternal_; +}; + +class SimdFBool +{ + public: + SimdFBool() {} + + SimdFBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {} + + // Internal utility constructor to simplify return statements + SimdFBool(uint32x4_t simd) : simdInternal_(simd) {} + + uint32x4_t simdInternal_; +}; + +class SimdFIBool +{ + public: + SimdFIBool() {} + + SimdFIBool(bool b) : simdInternal_(vdupq_n_u32( b ? 0xFFFFFFFF : 0)) {} + + // Internal utility constructor to simplify return statements + SimdFIBool(uint32x4_t simd) : simdInternal_(simd) {} + + uint32x4_t simdInternal_; +}; + +static inline SimdFloat gmx_simdcall +load(const float *m) +{ + assert(std::size_t(m) % 16 == 0); + return { + vld1q_f32(m) + }; +} + +static inline void gmx_simdcall +store(float *m, SimdFloat a) +{ + assert(std::size_t(m) % 16 == 0); + vst1q_f32(m, a.simdInternal_); +} + +static inline SimdFloat gmx_simdcall +loadU(const float *m) +{ + return { + vld1q_f32(m) + }; +} + +static inline void gmx_simdcall +storeU(float *m, SimdFloat a) +{ + vst1q_f32(m, a.simdInternal_); +} + +static inline SimdFloat gmx_simdcall +setZeroF() +{ + return { + vdupq_n_f32(0.0f) + }; +} + +static inline SimdFInt32 gmx_simdcall +loadFI(const std::int32_t * m) +{ + assert(std::size_t(m) % 16 == 0); + return { + vld1q_s32(m) + }; +} + +static inline void gmx_simdcall +store(std::int32_t * m, SimdFInt32 a) +{ + assert(std::size_t(m) % 16 == 0); + vst1q_s32(m, a.simdInternal_); +} + +static inline SimdFInt32 gmx_simdcall +loadUFI(const std::int32_t *m) +{ + return { + vld1q_s32(m) + }; +} + +static inline void gmx_simdcall +storeU(std::int32_t * m, SimdFInt32 a) +{ + vst1q_s32(m, a.simdInternal_); +} + +static inline SimdFInt32 gmx_simdcall +setZeroFI() +{ + return { + vdupq_n_s32(0) + }; +} + +template gmx_simdcall +static inline std::int32_t +extract(SimdFInt32 a) +{ + return vgetq_lane_s32(a.simdInternal_, index); +} + +static inline SimdFloat gmx_simdcall +operator&(SimdFloat a, SimdFloat b) +{ + return { + vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a.simdInternal_), + vreinterpretq_s32_f32(b.simdInternal_))) + }; +} + +static inline SimdFloat gmx_simdcall +andNot(SimdFloat a, SimdFloat b) +{ + return { + vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b.simdInternal_), + vreinterpretq_s32_f32(a.simdInternal_))) + }; +} + +static inline SimdFloat gmx_simdcall +operator|(SimdFloat a, SimdFloat b) +{ + return { + vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a.simdInternal_), + vreinterpretq_s32_f32(b.simdInternal_))) + }; +} + +static inline SimdFloat gmx_simdcall +operator^(SimdFloat a, SimdFloat b) +{ + return { + vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a.simdInternal_), + vreinterpretq_s32_f32(b.simdInternal_))) + }; +} + +static inline SimdFloat gmx_simdcall +operator+(SimdFloat a, SimdFloat b) +{ + return { + vaddq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +operator-(SimdFloat a, SimdFloat b) +{ + return { + vsubq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +operator-(SimdFloat x) +{ + return { + vnegq_f32(x.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +operator*(SimdFloat a, SimdFloat b) +{ + return { + vmulq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline SimdFloat gmx_simdcall +fma(SimdFloat a, SimdFloat b, SimdFloat c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#else + vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#endif + }; +} + +static inline SimdFloat gmx_simdcall +fms(SimdFloat a, SimdFloat b, SimdFloat c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#else + vnegq_f32(vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#endif + }; +} + +static inline SimdFloat gmx_simdcall +fnma(SimdFloat a, SimdFloat b, SimdFloat c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#else + vmlsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) +#endif + }; +} + +static inline SimdFloat gmx_simdcall +fnms(SimdFloat a, SimdFloat b, SimdFloat c) +{ + return { +#ifdef __ARM_FEATURE_FMA + vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#else + vnegq_f32(vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) +#endif + }; +} +#endif + +static inline SimdFloat gmx_simdcall +rsqrt(SimdFloat x) +{ + return { + vrsqrteq_f32(x.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +rsqrtIter(SimdFloat lu, SimdFloat x) +{ + return { + vmulq_f32(lu.simdInternal_, vrsqrtsq_f32(vmulq_f32(lu.simdInternal_, lu.simdInternal_), x.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +rcp(SimdFloat x) +{ + return { + vrecpeq_f32(x.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +rcpIter(SimdFloat lu, SimdFloat x) +{ + return { + vmulq_f32(lu.simdInternal_, vrecpsq_f32(lu.simdInternal_, x.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +maskAdd(SimdFloat a, SimdFloat b, SimdFBool m) +{ + b.simdInternal_ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(b.simdInternal_), + m.simdInternal_)); + + return { + vaddq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +maskzMul(SimdFloat a, SimdFloat b, SimdFBool m) +{ + SimdFloat tmp = a * b; + + return { + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp.simdInternal_), + m.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m) +{ +#ifdef __ARM_FEATURE_FMA + float32x4_t tmp = vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_); +#else + float32x4_t tmp = vmlaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_); +#endif + + return { + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp), + m.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +maskzRsqrt(SimdFloat x, SimdFBool m) +{ +#ifndef NDEBUG + x.simdInternal_ = vbslq_f32(m, vdupq_n_f32(1.0f), x.simdInternal_); +#endif + return { + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrsqrteq_f32(x.simdInternal_)), + m.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +maskzRcp(SimdFloat x, SimdFBool m) +{ +#ifndef NDEBUG + x.simdInternal_ = vbslq_f32(m, vdupq_n_f32(1.0f), x.simdInternal_); +#endif + return { + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrecpeq_f32(x.simdInternal_)), + m.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +abs(SimdFloat x) +{ + return { + vabsq_f32( x.simdInternal_ ) + }; +} + +static inline SimdFloat gmx_simdcall +max(SimdFloat a, SimdFloat b) +{ + return { + vmaxq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +min(SimdFloat a, SimdFloat b) +{ + return { + vminq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +// Round and trunc operations are defined at the end of this file, since they +// need to use float-to-integer and integer-to-float conversions. + +static inline SimdFloat gmx_simdcall +frexp(SimdFloat value, SimdFInt32 * exponent) +{ + const int32x4_t exponentMask = vdupq_n_s32(0x7F800000); + const int32x4_t mantissaMask = vdupq_n_s32(0x807FFFFF); + const int32x4_t exponentBias = vdupq_n_s32(126); // add 1 to make our definition identical to frexp() + const float32x4_t half = vdupq_n_f32(0.5f); + int32x4_t iExponent; + + iExponent = vandq_s32(vreinterpretq_s32_f32(value.simdInternal_), exponentMask); + iExponent = vsubq_s32(vshrq_n_s32(iExponent, 23), exponentBias); + exponent->simdInternal_ = iExponent; + + return { + vreinterpretq_f32_s32(vorrq_s32(vandq_s32(vreinterpretq_s32_f32(value.simdInternal_), + mantissaMask), + vreinterpretq_s32_f32(half))) + }; +} + +static inline SimdFloat gmx_simdcall +ldexp(SimdFloat value, SimdFInt32 exponent) +{ + const int32x4_t exponentBias = vdupq_n_s32(127); + int32x4_t iExponent; + + iExponent = vshlq_n_s32( vaddq_s32(exponent.simdInternal_, exponentBias), 23); + + return { + vmulq_f32(value.simdInternal_, vreinterpretq_f32_s32(iExponent)) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline float gmx_simdcall +reduce(SimdFloat a) +{ + float32x4_t x = a.simdInternal_; + float32x4_t y = vextq_f32(x, x, 2); + + x = vaddq_f32(x, y); + y = vextq_f32(x, x, 1); + x = vaddq_f32(x, y); + return vgetq_lane_f32(x, 0); +} +#endif + +static inline SimdFBool gmx_simdcall +operator==(SimdFloat a, SimdFloat b) +{ + return { + vceqq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFBool gmx_simdcall +operator!=(SimdFloat a, SimdFloat b) +{ + return { + vmvnq_u32(vceqq_f32(a.simdInternal_, b.simdInternal_)) + }; +} + +static inline SimdFBool gmx_simdcall +operator<(SimdFloat a, SimdFloat b) +{ + return { + vcltq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFBool gmx_simdcall +operator<=(SimdFloat a, SimdFloat b) +{ + return { + vcleq_f32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFBool gmx_simdcall +testBits(SimdFloat a) +{ + uint32x4_t tmp = vreinterpretq_u32_f32(a.simdInternal_); + + return { + vtstq_u32(tmp, tmp) + }; +} + +static inline SimdFBool gmx_simdcall +operator&&(SimdFBool a, SimdFBool b) +{ + + return { + vandq_u32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFBool gmx_simdcall +operator||(SimdFBool a, SimdFBool b) +{ + return { + vorrq_u32(a.simdInternal_, b.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline bool gmx_simdcall +anyTrue(SimdFBool a) +{ + uint32x4_t x = a.simdInternal_; + uint32x4_t y = vextq_u32(x, x, 2); + + x = vorrq_u32(x, y); + y = vextq_u32(x, x, 1); + x = vorrq_u32(x, y); + return (vgetq_lane_u32(x, 0) != 0); +} +#endif + +static inline SimdFloat gmx_simdcall +selectByMask(SimdFloat a, SimdFBool m) +{ + return { + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.simdInternal_), + m.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +selectByNotMask(SimdFloat a, SimdFBool m) +{ + return { + vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.simdInternal_), + m.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +blend(SimdFloat a, SimdFloat b, SimdFBool sel) +{ + return { + vbslq_f32(sel.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator<<(SimdFInt32 a, int n) +{ + return { + vshlq_n_s32(a.simdInternal_, n) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator>>(SimdFInt32 a, int n) +{ + return { + vshrq_n_s32(a.simdInternal_, n) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator&(SimdFInt32 a, SimdFInt32 b) +{ + return { + vandq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +andNot(SimdFInt32 a, SimdFInt32 b) +{ + return { + vbicq_s32(b.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator|(SimdFInt32 a, SimdFInt32 b) +{ + return { + vorrq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator^(SimdFInt32 a, SimdFInt32 b) +{ + return { + veorq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator+(SimdFInt32 a, SimdFInt32 b) +{ + return { + vaddq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator-(SimdFInt32 a, SimdFInt32 b) +{ + return { + vsubq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +operator*(SimdFInt32 a, SimdFInt32 b) +{ + return { + vmulq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFIBool gmx_simdcall +operator==(SimdFInt32 a, SimdFInt32 b) +{ + return { + vceqq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFIBool gmx_simdcall +testBits(SimdFInt32 a) +{ + return { + vtstq_s32(a.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdFIBool gmx_simdcall +operator<(SimdFInt32 a, SimdFInt32 b) +{ + return { + vcltq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFIBool gmx_simdcall +operator&&(SimdFIBool a, SimdFIBool b) +{ + return { + vandq_u32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdFIBool gmx_simdcall +operator||(SimdFIBool a, SimdFIBool b) +{ + return { + vorrq_u32(a.simdInternal_, b.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline bool gmx_simdcall +anyTrue(SimdFIBool a) +{ + uint32x4_t x = a.simdInternal_; + uint32x4_t y = vextq_u32(x, x, 2); + + x = vorrq_u32(x, y); + y = vextq_u32(x, x, 1); + x = vorrq_u32(x, y); + return (vgetq_lane_u32(x, 0) != 0); +} +#endif + +static inline SimdFInt32 gmx_simdcall +selectByMask(SimdFInt32 a, SimdFIBool m) +{ + return { + vandq_s32(a.simdInternal_, vreinterpretq_s32_u32(m.simdInternal_)) + }; +} + +static inline SimdFInt32 gmx_simdcall +selectByNotMask(SimdFInt32 a, SimdFIBool m) +{ + return { + vbicq_s32(a.simdInternal_, vreinterpretq_s32_u32(m.simdInternal_)) + }; +} + +static inline SimdFInt32 gmx_simdcall +blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel) +{ + return { + vbslq_s32(sel.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline SimdFInt32 gmx_simdcall +cvtR2I(SimdFloat a) +{ + float32x4_t signBitOfA = vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(a.simdInternal_))); + float32x4_t half = vdupq_n_f32(0.5f); + float32x4_t corr = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half), vreinterpretq_u32_f32(signBitOfA))); + + return { + vcvtq_s32_f32(vaddq_f32(a.simdInternal_, corr)) + }; +} +#endif + +static inline SimdFInt32 gmx_simdcall +cvttR2I(SimdFloat a) +{ + return { + vcvtq_s32_f32(a.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +cvtI2R(SimdFInt32 a) +{ + return { + vcvtq_f32_s32(a.simdInternal_) + }; +} + +static inline SimdFIBool gmx_simdcall +cvtB2IB(SimdFBool a) +{ + return { + a.simdInternal_ + }; +} + +static inline SimdFBool gmx_simdcall +cvtIB2B(SimdFIBool a) +{ + return { + a.simdInternal_ + }; +} + +// Override for Neon-Asimd +#if GMX_SIMD_ARM_NEON +static inline SimdFloat gmx_simdcall +round(SimdFloat x) +{ + return cvtI2R(cvtR2I(x)); +} + +static inline SimdFloat gmx_simdcall +trunc(SimdFloat x) +{ + return cvtI2R(cvttR2I(x)); +} +#endif + +} // namespace gmx + +#endif // GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h new file mode 100644 index 0000000000..b9d91598be --- /dev/null +++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h @@ -0,0 +1,361 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014,2015, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H +#define GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H + +#include "config.h" + +#include +#include +#include + +#include + +#include "gromacs/utility/basedefinitions.h" + +#include "impl_arm_neon_simd_float.h" + + +namespace gmx +{ + +template +static inline void gmx_simdcall +gatherLoadTranspose(const float * base, + const std::int32_t offset[], + SimdFloat * v0, + SimdFloat * v1, + SimdFloat * v2, + SimdFloat * v3) +{ + assert(std::size_t(offset) % 16 == 0); + assert(std::size_t(base) % 16 == 0); + assert(align % 4 == 0); + + // Unfortunately we cannot use the beautiful Neon structured load + // instructions since the data comes from four different memory locations. + float32x4x2_t t0 = vuzpq_f32(vld1q_f32( base + align * offset[0] ), vld1q_f32( base + align * offset[2] )); + float32x4x2_t t1 = vuzpq_f32(vld1q_f32( base + align * offset[1] ), vld1q_f32( base + align * offset[3] )); + float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]); + float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]); + v0->simdInternal_ = t2.val[0]; + v1->simdInternal_ = t3.val[0]; + v2->simdInternal_ = t2.val[1]; + v3->simdInternal_ = t3.val[1]; +} + +template +static inline void gmx_simdcall +gatherLoadTranspose(const float * base, + const std::int32_t offset[], + SimdFloat * v0, + SimdFloat * v1) +{ + assert(std::size_t(offset) % 16 == 0); + assert(std::size_t(base) % 8 == 0); + assert(align % 2 == 0); + + v0->simdInternal_ = vcombine_f32(vld1_f32( base + align * offset[0] ), + vld1_f32( base + align * offset[2] )); + v1->simdInternal_ = vcombine_f32(vld1_f32( base + align * offset[1] ), + vld1_f32( base + align * offset[3] )); + + float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_); + + v0->simdInternal_ = tmp.val[0]; + v1->simdInternal_ = tmp.val[1]; +} + +static const int c_simdBestPairAlignmentFloat = 2; + +template +static inline void gmx_simdcall +gatherLoadUTranspose(const float * base, + const std::int32_t offset[], + SimdFloat * v0, + SimdFloat * v1, + SimdFloat * v2) +{ + assert(std::size_t(offset) % 16 == 0); + + float32x4x2_t t0 = vuzpq_f32(vld1q_f32( base + align * offset[0] ), vld1q_f32( base + align * offset[2] )); + float32x4x2_t t1 = vuzpq_f32(vld1q_f32( base + align * offset[1] ), vld1q_f32( base + align * offset[3] )); + float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]); + float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]); + v0->simdInternal_ = t2.val[0]; + v1->simdInternal_ = t3.val[0]; + v2->simdInternal_ = t2.val[1]; +} + + +template +static inline void gmx_simdcall +transposeScatterStoreU(float * base, + const std::int32_t offset[], + SimdFloat v0, + SimdFloat v1, + SimdFloat v2) +{ + assert(std::size_t(offset) % 16 == 0); + + float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_); + + vst1_f32( base + align * offset[0], vget_low_f32(tmp.val[0]) ); + vst1_f32( base + align * offset[1], vget_low_f32(tmp.val[1]) ); + vst1_f32( base + align * offset[2], vget_high_f32(tmp.val[0]) ); + vst1_f32( base + align * offset[3], vget_high_f32(tmp.val[1]) ); + + vst1q_lane_f32( base + align * offset[0] + 2, v2.simdInternal_, 0); + vst1q_lane_f32( base + align * offset[1] + 2, v2.simdInternal_, 1); + vst1q_lane_f32( base + align * offset[2] + 2, v2.simdInternal_, 2); + vst1q_lane_f32( base + align * offset[3] + 2, v2.simdInternal_, 3); +} + + +template +static inline void gmx_simdcall +transposeScatterIncrU(float * base, + const std::int32_t offset[], + SimdFloat v0, + SimdFloat v1, + SimdFloat v2) +{ + assert(std::size_t(offset) % 16 == 0); + + if (align < 4) + { + float32x2_t t0, t1, t2, t3; + float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_); + + t0 = vget_low_f32(tmp.val[0]); + t1 = vget_low_f32(tmp.val[1]); + t2 = vget_high_f32(tmp.val[0]); + t3 = vget_high_f32(tmp.val[1]); + + t0 = vadd_f32(t0, vld1_f32(base + align * offset[0])); + vst1_f32(base + align * offset[0], t0); + base[ align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0); + + t1 = vadd_f32(t1, vld1_f32(base + align * offset[1])); + vst1_f32(base + align * offset[1], t1); + base[ align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1); + + t2 = vadd_f32(t2, vld1_f32(base + align * offset[2])); + vst1_f32(base + align * offset[2], t2); + base[ align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2); + + t3 = vadd_f32(t3, vld1_f32(base + align * offset[3])); + vst1_f32(base + align * offset[3], t3); + base[ align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3); + } + else + { + // Extra elements means we can use full width-4 load/store operations + float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_); + float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0f)); + float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]); + float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]); + float32x4_t t4 = t2.val[0]; + float32x4_t t5 = t3.val[0]; + float32x4_t t6 = t2.val[1]; + float32x4_t t7 = t3.val[1]; + + vst1q_f32(base + align * offset[0], vaddq_f32(t4, vld1q_f32(base + align * offset[0]))); + vst1q_f32(base + align * offset[1], vaddq_f32(t5, vld1q_f32(base + align * offset[1]))); + vst1q_f32(base + align * offset[2], vaddq_f32(t6, vld1q_f32(base + align * offset[2]))); + vst1q_f32(base + align * offset[3], vaddq_f32(t7, vld1q_f32(base + align * offset[3]))); + } +} + +template +static inline void gmx_simdcall +transposeScatterDecrU(float * base, + const std::int32_t offset[], + SimdFloat v0, + SimdFloat v1, + SimdFloat v2) +{ + assert(std::size_t(offset) % 16 == 0); + + if (align < 4) + { + float32x2_t t0, t1, t2, t3; + float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_); + + t0 = vget_low_f32(tmp.val[0]); + t1 = vget_low_f32(tmp.val[1]); + t2 = vget_high_f32(tmp.val[0]); + t3 = vget_high_f32(tmp.val[1]); + + t0 = vsub_f32(vld1_f32(base + align * offset[0]), t0); + vst1_f32(base + align * offset[0], t0); + base[ align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0); + + t1 = vsub_f32(vld1_f32(base + align * offset[1]), t1); + vst1_f32(base + align * offset[1], t1); + base[ align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1); + + t2 = vsub_f32(vld1_f32(base + align * offset[2]), t2); + vst1_f32(base + align * offset[2], t2); + base[ align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2); + + t3 = vsub_f32(vld1_f32(base + align * offset[3]), t3); + vst1_f32(base + align * offset[3], t3); + base[ align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3); + } + else + { + // Extra elements means we can use full width-4 load/store operations + float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_); + float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0f)); + float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]); + float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]); + float32x4_t t4 = t2.val[0]; + float32x4_t t5 = t3.val[0]; + float32x4_t t6 = t2.val[1]; + float32x4_t t7 = t3.val[1]; + + vst1q_f32(base + align * offset[0], vsubq_f32(vld1q_f32(base + align * offset[0]), t4)); + vst1q_f32(base + align * offset[1], vsubq_f32(vld1q_f32(base + align * offset[1]), t5)); + vst1q_f32(base + align * offset[2], vsubq_f32(vld1q_f32(base + align * offset[2]), t6)); + vst1q_f32(base + align * offset[3], vsubq_f32(vld1q_f32(base + align * offset[3]), t7)); + } +} + +static inline void gmx_simdcall +expandScalarsToTriplets(SimdFloat scalar, + SimdFloat * triplets0, + SimdFloat * triplets1, + SimdFloat * triplets2) +{ + float32x2_t lo, hi; + float32x4_t t0, t1, t2, t3; + + lo = vget_low_f32(scalar.simdInternal_); + hi = vget_high_f32(scalar.simdInternal_); + + t0 = vdupq_lane_f32(lo, 0); + t1 = vdupq_lane_f32(lo, 1); + t2 = vdupq_lane_f32(hi, 0); + t3 = vdupq_lane_f32(hi, 1); + + triplets0->simdInternal_ = vextq_f32(t0, t1, 1); + triplets1->simdInternal_ = vextq_f32(t1, t2, 2); + triplets2->simdInternal_ = vextq_f32(t2, t3, 3); +} + + +template +static inline void gmx_simdcall +gatherLoadBySimdIntTranspose(const float * base, + SimdFInt32 offset, + SimdFloat * v0, + SimdFloat * v1, + SimdFloat * v2, + SimdFloat * v3) +{ + GMX_ALIGNED(int, GMX_SIMD_FINT32_WIDTH) ioffset[GMX_SIMD_FINT32_WIDTH]; + + assert(std::size_t(base) % 16 == 0); + assert(align % 4 == 0); + + store(ioffset, offset); + gatherLoadTranspose(base, ioffset, v0, v1, v2, v3); +} + +template +static inline void gmx_simdcall +gatherLoadBySimdIntTranspose(const float * base, + SimdFInt32 offset, + SimdFloat * v0, + SimdFloat * v1) +{ + GMX_ALIGNED(int, GMX_SIMD_FINT32_WIDTH) ioffset[GMX_SIMD_FINT32_WIDTH]; + + store(ioffset, offset); + gatherLoadTranspose(base, ioffset, v0, v1); +} + + + +template +static inline void gmx_simdcall +gatherLoadUBySimdIntTranspose(const float * base, + SimdFInt32 offset, + SimdFloat * v0, + SimdFloat * v1) +{ + GMX_ALIGNED(int, GMX_SIMD_FINT32_WIDTH) ioffset[GMX_SIMD_FINT32_WIDTH]; + + store(ioffset, offset); + v0->simdInternal_ = vcombine_f32(vld1_f32( base + align * ioffset[0] ), + vld1_f32( base + align * ioffset[2] )); + v1->simdInternal_ = vcombine_f32(vld1_f32( base + align * ioffset[1] ), + vld1_f32( base + align * ioffset[3] )); + float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_ ); + v0->simdInternal_ = tmp.val[0]; + v1->simdInternal_ = tmp.val[1]; +} + +static inline float gmx_simdcall +reduceIncr4ReturnSum(float * m, + SimdFloat v0, + SimdFloat v1, + SimdFloat v2, + SimdFloat v3) +{ + assert(std::size_t(m) % 16 == 0); + + float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_); + float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_); + float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]); + float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]); + v0.simdInternal_ = t2.val[0]; + v1.simdInternal_ = t3.val[0]; + v2.simdInternal_ = t2.val[1]; + v3.simdInternal_ = t3.val[1]; + + v0 = v0 + v1; + v2 = v2 + v3; + v0 = v0 + v2; + v2 = v0 + load(m); + store(m, v2); + + return reduce(v0); +} + +} // namespace gmx + +#endif // GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h index d8d58e84f7..72798b1fb8 100644 --- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h @@ -36,7 +36,13 @@ #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_H #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_H +#include "impl_arm_neon_asimd_definitions.h" +#include "impl_arm_neon_asimd_general.h" +// No double precision SIMD4 on neon Asimd +#include "impl_arm_neon_asimd_simd4_float.h" #include "impl_arm_neon_asimd_simd_double.h" #include "impl_arm_neon_asimd_simd_float.h" +#include "impl_arm_neon_asimd_util_double.h" +#include "impl_arm_neon_asimd_util_float.h" -#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_H */ +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_H diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h new file mode 100644 index 0000000000..ab1a7783af --- /dev/null +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_definitions.h @@ -0,0 +1,83 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014,2015, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_DEFINITIONS_H +#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_DEFINITIONS_H + +// ARM (AArch64) NEON Advanced SIMD + +#define GMX_SIMD 1 +#define GMX_SIMD_HAVE_FLOAT 1 +#define GMX_SIMD_HAVE_DOUBLE 1 +#define GMX_SIMD_HAVE_LOADU 1 +#define GMX_SIMD_HAVE_STOREU 1 +#define GMX_SIMD_HAVE_LOGICAL 1 +#define GMX_SIMD_HAVE_FMA 1 +#define GMX_SIMD_HAVE_FINT32_EXTRACT 1 +#define GMX_SIMD_HAVE_FINT32_LOGICAL 1 +#define GMX_SIMD_HAVE_FINT32_ARITHMETICS 1 +#define GMX_SIMD_HAVE_DINT32_EXTRACT 1 +#define GMX_SIMD_HAVE_DINT32_LOGICAL 1 +#define GMX_SIMD_HAVE_DINT32_ARITHMETICS 1 +#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT 1 +#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT 1 +#define GMX_SIMD_HAVE_NATIVE_LOG_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_EXP_FLOAT 0 +#define GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE 1 +#define GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE 1 +#define GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE 0 +#define GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE 0 +#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT 1 +#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1 +#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 0 // No need for half-simd, width is 4 +#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 + +#define GMX_SIMD4_HAVE_FLOAT 1 +#define GMX_SIMD4_HAVE_DOUBLE 0 + +// Implementation details +#define GMX_SIMD_FLOAT_WIDTH 4 +#define GMX_SIMD_DOUBLE_WIDTH 2 +#define GMX_SIMD_FINT32_WIDTH 4 +#define GMX_SIMD_DINT32_WIDTH 2 +#define GMX_SIMD4_WIDTH 4 +#define GMX_SIMD_RSQRT_BITS 8 +#define GMX_SIMD_RCP_BITS 8 + +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_DEFINITIONS_H diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_general.h similarity index 89% copy from src/gromacs/simd/impl_arm_neon/impl_arm_neon.h copy to src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_general.h index 5357dfc978..2b7162367d 100644 --- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_general.h @@ -33,10 +33,9 @@ * the research papers on the package. Check out http://www.gromacs.org. */ -#ifndef GMX_SIMD_IMPL_ARM_NEON_H -#define GMX_SIMD_IMPL_ARM_NEON_H +#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_GENERAL_H +#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_GENERAL_H -#include "impl_arm_neon_simd4_float.h" -#include "impl_arm_neon_simd_float.h" +#include "gromacs/simd/impl_arm_neon/impl_arm_neon_general.h" -#endif /* GMX_SIMD_IMPL_ARM_NEON_H */ +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_GENERAL_H diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h new file mode 100644 index 0000000000..47d691f18d --- /dev/null +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd4_float.h @@ -0,0 +1,124 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014,2015, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD4_FLOAT_H +#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD4_FLOAT_H + +#include "config.h" + +#include + +#include "gromacs/simd/impl_arm_neon/impl_arm_neon_simd4_float.h" + +namespace gmx +{ + +static inline Simd4Float gmx_simdcall +fma(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { + vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +fms(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { + vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) + }; +} + +static inline Simd4Float gmx_simdcall +fnma(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { + vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +fnms(Simd4Float a, Simd4Float b, Simd4Float c) +{ + return { + vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) + }; +} + +static inline Simd4Float gmx_simdcall +round(Simd4Float x) +{ + return { + vrndnq_f32(x.simdInternal_) + }; +} + +static inline Simd4Float gmx_simdcall +trunc(Simd4Float x) +{ + return { + vrndq_f32(x.simdInternal_) + }; +} + +static inline bool gmx_simdcall +anyTrue(Simd4FBool a) +{ + return (vmaxvq_u32(a.simdInternal_) != 0); +} + +static inline float gmx_simdcall +reduce(Simd4Float a) +{ + float32x4_t b = a.simdInternal_; + b = vpaddq_f32(b, b); + b = vpaddq_f32(b, b); + return vgetq_lane_f32(b, 0); +} + +static inline float gmx_simdcall +dotProduct(Simd4Float a, Simd4Float b) +{ + Simd4Float c; + + c = a * b; + /* set 4th element to 0, then add all of them */ + c.simdInternal_ = vsetq_lane_f32(0.0f, c.simdInternal_, 3); + return reduce(c); +} + +} // namespace gmx + +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD4_FLOAT_H diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h dissimilarity index 74% index c984185dfe..c33a7f6e65 100644 --- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h @@ -1,178 +1,733 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 2014,2015, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ - -#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H -#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H - -#include - -#include - -#include "impl_arm_neon_asimd_common.h" - -/**************************************************** - * DOUBLE PRECISION SIMD IMPLEMENTATION * - ****************************************************/ -#define SimdDouble float64x2_t -#define simdLoadD vld1q_f64 -#define simdLoad1D vld1q_dup_f64 -#define simdSet1D vdupq_n_f64 -#define simdStoreD vst1q_f64 -#define simdLoadUD vld1q_f64 -#define simdStoreUD vst1q_f64 -#define simdSetZeroD() vdupq_n_f64(0.0) -#define simdAddD vaddq_f64 -#define simdSubD vsubq_f64 -#define simdMulD vmulq_f64 -#define simdFmaddD(a, b, c) vfmaq_f64(c, b, a) -#define simdFmsubD(a, b, c) vnegq_f64(vfmsq_f64(c, b, a)) -#define simdFnmaddD(a, b, c) vfmsq_f64(c, b, a) -#define simdFnmsubD(a, b, c) vnegq_f64(vfmaq_f64(c, b, a)) -#define simdAndD(a, b) (float64x2_t)(vandq_s64((int64x2_t)(a), (int64x2_t)(b))) -#define simdAndNotD(a, b) (float64x2_t)(vbicq_s64((int64x2_t)(b), (int64x2_t)(a))) -#define simdOrD(a, b) (float64x2_t)(vorrq_s64((int64x2_t)(a), (int64x2_t)(b))) -#define simdXorD(a, b) (float64x2_t)(veorq_s64((int64x2_t)(a), (int64x2_t)(b))) -#define simdRsqrtD vrsqrteq_f64 -#define simdRsqrtIterD(lu, x) vmulq_f64(lu, vrsqrtsq_f64(vmulq_f64(lu, lu), x)) -#define simdRcpD vrecpeq_f64 -#define simdRcpIterD(lu, x) vmulq_f64(lu, vrecpsq_f64(lu, x)) -#define simdAbsD(x) vabsq_f64(x) -#define simdNegD(x) vnegq_f64(x) -#define simdMaxD vmaxq_f64 -#define simdMinD vminq_f64 -#define simdRoundD(x) vrndnq_f64(x) -#define simdTruncD(x) vrndq_f64(x) -#define simdFractionD(x) vsubq_f64(x, simdTruncD(x)) -#define simdGetExponentD simdGetExponentD_arm_neon_asimd -#define simdGetMantissaD simdGetMantissaD_arm_neon_asimd -#define simdSetExponentD simdSetExponentD_arm_neon_asimd -/* integer datatype corresponding to double: SimdDInt32 */ -#define SimdDInt32 int32x2_t -#define simdLoadDI(m) vld1_s32(m) -#define simdSet1DI vdup_n_s32 -#define simdStoreDI(m, x) vst1_s32(m, x) -#define simdLoadUDI(m) vld1_s32(m) -#define simdStoreUDI(m, x) vst1_s32(m, x) -#define simdSetZeroDI() vdup_n_s32(0) -#define simdCvttD2I(x) vmovn_s64(vcvtq_s64_f64(x)) -#define simdCvtD2I(x) vmovn_s64(vcvtnq_s64_f64(x)) -#define simdCvtI2D(x) vcvtq_f64_s64(vmovl_s32(x)) -#define simdExtractDI(x, i) vget_lane_s32(x, i) -/* Integer logical ops on SimdDInt32 */ -#define simdSlliDI vshl_n_s32 -#define simdSrliDI vshr_n_s32 -#define simdAndDI vand_s32 -#define simdAndNotDI(a, b) vbic_s32(b, a) -#define simdOrDI vorr_s32 -#define simdXorDI veor_s32 -/* Integer arithmetic ops on SimdDInt32 */ -#define simdAddDI vadd_s32 -#define simdSubDI vsub_s32 -#define simdMulDI vmul_s32 -/* Boolean & comparison operations on SimdDouble */ -#define SimdDBool uint64x2_t -#define simdCmpEqD vceqq_f64 -#define simdCmpLtD vcltq_f64 -#define simdCmpLeD vcleq_f64 -#define simdAndDB vandq_u64 -#define simdOrDB vorrq_u64 -#define simdAnyTrueDB(x) (vmaxvq_u32((uint32x4_t)(x)) != 0) -#define simdMaskD(a, sel) (float64x2_t)(vandq_u64((uint64x2_t)(a), sel)) -#define simdMaskNotD(a, sel) (float64x2_t)(vbicq_u64((uint64x2_t)(a), sel)) -#define simdBlendD(a, b, sel) vbslq_f64(sel, b, a) -#define simdReduceD(a) simdReduceD_arm_neon_asimd(a) -/* Boolean & comparison operations on SimdDInt32 */ -#define SimdDIBool uint32x2_t -#define simdCmpEqDI vceq_s32 -#define simdCmpLtDI vclt_s32 -#define simdAndDIB vand_u32 -#define simdOrDIB vorr_u32 -#define simdAnyTrueDIB(x) (vmaxv_u32(x) != 0) -#define simdMaskDI(a, sel) vand_s32(a, vreinterpret_s32_u32(sel)) -#define simdMaskNotDI(a, sel) vbic_s32(a, vreinterpret_s32_u32(sel)) -#define simdBlendDI(a, b, sel) vbsl_s32(sel, b, a) -/* Conversions between different booleans */ -#define simdCvtDB2DIB(x) vqmovn_u64(x) -#define simdCvtDIB2DB(x) vorrq_u64(vmovl_u32(x), vshlq_n_u64(vmovl_u32(x), 32)) - -/* Float/double conversion */ -#define simdCvtF2DD(f, d0, d1) { *d0 = vcvt_f64_f32(vget_low_f32(f)); *d1 = vcvt_high_f64_f32(f); } -#define simdCvtDD2F(d0, d1) vcvt_high_f32_f64(vcvt_f32_f64(d0), d1) - -/**************************************************** - * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS * - ****************************************************/ -static inline SimdDouble -simdGetExponentD_arm_neon_asimd(SimdDouble x) -{ - const float64x2_t expmask = (float64x2_t)( vdupq_n_s64(0x7FF0000000000000LL) ); - int64x2_t iexp; - - iexp = (int64x2_t)(simdAndD(x, expmask)); - iexp = vsubq_s64(vshrq_n_s64(iexp, 52), vdupq_n_s64(1023)); - return vcvtq_f64_s64(iexp); -} - - -static inline SimdDouble -simdGetMantissaD_arm_neon_asimd(SimdDouble x) -{ - const float64x2_t mantmask = (float64x2_t)( vdupq_n_s64(0x000FFFFFFFFFFFFFLL) ); - const float64x2_t one = vdupq_n_f64(1.0); - - /* Get mantissa */ - x = simdAndD(mantmask, x); - /* Reset zero (but correctly biased) exponent */ - return simdOrD(x, one); -} - - -static inline SimdDouble -simdSetExponentD_arm_neon_asimd(SimdDouble x) -{ - int64x2_t iexp = vcvtnq_s64_f64(x); - - iexp = vshlq_n_s64(vaddq_s64(iexp, vdupq_n_s64(1023)), 52); - return (float64x2_t)(iexp); -} - -static inline double -simdReduceD_arm_neon_asimd(SimdDouble a) -{ - a = vpaddq_f64(a, a); - return vgetq_lane_f64(a, 0); -} - -#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H */ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014,2015, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H +#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H + +#include "config.h" + +#include + +#include + +#include "impl_arm_neon_asimd_simd_float.h" + +namespace gmx +{ + +class SimdDouble +{ + public: + SimdDouble() {} + + SimdDouble(double d) : simdInternal_(vdupq_n_f64(d)) {} + + // Internal utility constructor to simplify return statements + SimdDouble(float64x2_t simd) : simdInternal_(simd) {} + + float64x2_t simdInternal_; +}; + +class SimdDInt32 +{ + public: + SimdDInt32() {} + + SimdDInt32(std::int32_t i) : simdInternal_(vdup_n_s32(i)) {} + + // Internal utility constructor to simplify return statements + SimdDInt32(int32x2_t simd) : simdInternal_(simd) {} + + int32x2_t simdInternal_; +}; + +class SimdDBool +{ + public: + SimdDBool() {} + + SimdDBool(bool b) : simdInternal_(vdupq_n_u64( b ? 0xFFFFFFFFFFFFFFFF : 0)) {} + + // Internal utility constructor to simplify return statements + SimdDBool(uint64x2_t simd) : simdInternal_(simd) {} + + uint64x2_t simdInternal_; +}; + +class SimdDIBool +{ + public: + SimdDIBool() {} + + SimdDIBool(bool b) : simdInternal_(vdup_n_u32( b ? 0xFFFFFFFF : 0)) {} + + // Internal utility constructor to simplify return statements + SimdDIBool(uint32x2_t simd) : simdInternal_(simd) {} + + uint32x2_t simdInternal_; +}; + +static inline SimdDouble gmx_simdcall +load(const double *m) +{ + assert(std::size_t(m) % 16 == 0); + return { + vld1q_f64(m) + }; +} + +static inline void gmx_simdcall +store(double *m, SimdDouble a) +{ + assert(std::size_t(m) % 16 == 0); + vst1q_f64(m, a.simdInternal_); +} + +static inline SimdDouble gmx_simdcall +loadU(const double *m) +{ + return { + vld1q_f64(m) + }; +} + +static inline void gmx_simdcall +storeU(double *m, SimdDouble a) +{ + vst1q_f64(m, a.simdInternal_); +} + +static inline SimdDouble gmx_simdcall +setZeroD() +{ + return { + vdupq_n_f64(0.0) + }; +} + +static inline SimdDInt32 gmx_simdcall +loadDI(const std::int32_t * m) +{ + assert(std::size_t(m) % 8 == 0); + return { + vld1_s32(m) + }; +} + +static inline void gmx_simdcall +store(std::int32_t * m, SimdDInt32 a) +{ + assert(std::size_t(m) % 8 == 0); + vst1_s32(m, a.simdInternal_); +} + +static inline SimdDInt32 gmx_simdcall +loadUDI(const std::int32_t *m) +{ + return { + vld1_s32(m) + }; +} + +static inline void gmx_simdcall +storeU(std::int32_t * m, SimdDInt32 a) +{ + vst1_s32(m, a.simdInternal_); +} + +static inline SimdDInt32 gmx_simdcall +setZeroDI() +{ + return { + vdup_n_s32(0) + }; +} + +template gmx_simdcall +static inline std::int32_t +extract(SimdDInt32 a) +{ + return vget_lane_s32(a.simdInternal_, index); +} + +static inline SimdDouble gmx_simdcall +operator&(SimdDouble a, SimdDouble b) +{ + return { + float64x2_t(vandq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_))) + }; +} + +static inline SimdDouble gmx_simdcall +andNot(SimdDouble a, SimdDouble b) +{ + return { + float64x2_t(vbicq_s64(int64x2_t(b.simdInternal_), int64x2_t(a.simdInternal_))) + }; +} + +static inline SimdDouble gmx_simdcall +operator|(SimdDouble a, SimdDouble b) +{ + return { + float64x2_t(vorrq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_))) + }; +} + +static inline SimdDouble gmx_simdcall +operator^(SimdDouble a, SimdDouble b) +{ + return { + float64x2_t(veorq_s64(int64x2_t(a.simdInternal_), int64x2_t(b.simdInternal_))) + }; +} + +static inline SimdDouble gmx_simdcall +operator+(SimdDouble a, SimdDouble b) +{ + return { + vaddq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +operator-(SimdDouble a, SimdDouble b) +{ + return { + vsubq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +operator-(SimdDouble x) +{ + return { + vnegq_f64(x.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +operator*(SimdDouble a, SimdDouble b) +{ + return { + vmulq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +fma(SimdDouble a, SimdDouble b, SimdDouble c) +{ + return { + vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +fms(SimdDouble a, SimdDouble b, SimdDouble c) +{ + return { + vnegq_f64(vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +fnma(SimdDouble a, SimdDouble b, SimdDouble c) +{ + return { + vfmsq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +fnms(SimdDouble a, SimdDouble b, SimdDouble c) +{ + return { + vnegq_f64(vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +rsqrt(SimdDouble x) +{ + return { + vrsqrteq_f64(x.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +rsqrtIter(SimdDouble lu, SimdDouble x) +{ + return { + vmulq_f64(lu.simdInternal_, vrsqrtsq_f64(vmulq_f64(lu.simdInternal_, lu.simdInternal_), x.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +rcp(SimdDouble x) +{ + return { + vrecpeq_f64(x.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +rcpIter(SimdDouble lu, SimdDouble x) +{ + return { + vmulq_f64(lu.simdInternal_, vrecpsq_f64(lu.simdInternal_, x.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +maskAdd(SimdDouble a, SimdDouble b, SimdDBool m) +{ + float64x2_t addend = float64x2_t(vandq_u64(uint64x2_t(b.simdInternal_), m.simdInternal_)); + + return { + vaddq_f64(a.simdInternal_, addend) + }; +} + +static inline SimdDouble gmx_simdcall +maskzMul(SimdDouble a, SimdDouble b, SimdDBool m) +{ + float64x2_t prod = vmulq_f64(a.simdInternal_, b.simdInternal_); + return { + float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m) +{ + float64x2_t prod = vfmaq_f64(c.simdInternal_, b.simdInternal_, a.simdInternal_); + + return { + float64x2_t(vandq_u64(uint64x2_t(prod), m.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +maskzRsqrt(SimdDouble x, SimdDBool m) +{ + // The result will always be correct since we mask the result with m, but + // for debug builds we also want to make sure not to generate FP exceptions +#ifndef NDEBUG + x.simdInternal_ = vbslq_f64(m.simdInternal_, vdupq_n_f64(1.0, x.simdInternal_); +#endif + return { + float64x2_t(vandq_u64(uint64x2_t(vrsqrteq_f64(x.simdInternal_)), m.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +maskzRcp(SimdDouble x, SimdDBool m) +{ + // The result will always be correct since we mask the result with m, but + // for debug builds we also want to make sure not to generate FP exceptions +#ifndef NDEBUG + x.simdInternal_ = vbslq_f64(m.simdInternal_, vdupq_n_f64(1.0, x.simdInternal_); +#endif + return { + float64x2_t(vandq_u64(uint64x2_t(vrecpeq_f64(x.simdInternal_)), m.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +abs(SimdDouble x) +{ + return { + vabsq_f64( x.simdInternal_ ) + }; +} + +static inline SimdDouble gmx_simdcall +max(SimdDouble a, SimdDouble b) +{ + return { + vmaxq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +min(SimdDouble a, SimdDouble b) +{ + return { + vminq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +round(SimdDouble x) +{ + return { + vrndnq_f64(x.simdInternal_) + }; +} + +static inline SimdDouble gmx_simdcall +trunc(SimdDouble x) +{ + return { + vrndq_f64( x.simdInternal_ ) + }; +} + +static inline SimdDouble +frexp(SimdDouble value, SimdDInt32 * exponent) +{ + const float64x2_t exponentMask = float64x2_t( vdupq_n_s64(0x7FF0000000000000LL) ); + const float64x2_t mantissaMask = float64x2_t( vdupq_n_s64(0x800FFFFFFFFFFFFFLL) ); + + const int64x2_t exponentBias = vdupq_n_s64(1022); // add 1 to make our definition identical to frexp() + const float64x2_t half = vdupq_n_f64(0.5); + int64x2_t iExponent; + + iExponent = vandq_s64( int64x2_t(value.simdInternal_), int64x2_t(exponentMask) ); + iExponent = vsubq_s64(vshrq_n_s64(iExponent, 52), exponentBias); + exponent->simdInternal_ = vmovn_s64(iExponent); + + return { + float64x2_t(vorrq_s64(vandq_s64(int64x2_t(value.simdInternal_), int64x2_t(mantissaMask)), int64x2_t(half))) + }; +} + +static inline SimdDouble +ldexp(SimdDouble value, SimdDInt32 exponent) +{ + const int64x2_t exponentBias = vdupq_n_s64(1023); + int64x2_t iExponent; + + iExponent = vmovl_s32(exponent.simdInternal_); + iExponent = vshlq_n_s64(vaddq_s64(iExponent, exponentBias), 52); + + return { + vmulq_f64(value.simdInternal_, float64x2_t(iExponent)) + }; +} + +static inline double gmx_simdcall +reduce(SimdDouble a) +{ + float64x2_t b = vpaddq_f64(a.simdInternal_, a.simdInternal_); + return vgetq_lane_f64(b, 0); +} + +static inline SimdDBool gmx_simdcall +operator==(SimdDouble a, SimdDouble b) +{ + return { + vceqq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDBool gmx_simdcall +operator!=(SimdDouble a, SimdDouble b) +{ + return { + vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a.simdInternal_, b.simdInternal_)))) + }; +} + +static inline SimdDBool gmx_simdcall +operator<(SimdDouble a, SimdDouble b) +{ + return { + vcltq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDBool gmx_simdcall +operator<=(SimdDouble a, SimdDouble b) +{ + return { + vcleq_f64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDBool gmx_simdcall +testBits(SimdDouble a) +{ + return { + vtstq_s64( int64x2_t(a.simdInternal_), int64x2_t(a.simdInternal_) ) + }; +} + +static inline SimdDBool gmx_simdcall +operator&&(SimdDBool a, SimdDBool b) +{ + return { + vandq_u64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDBool gmx_simdcall +operator||(SimdDBool a, SimdDBool b) +{ + return { + vorrq_u64(a.simdInternal_, b.simdInternal_) + }; +} + +static inline bool gmx_simdcall +anyTrue(SimdDBool a) +{ + return (vmaxvq_u32((uint32x4_t)(a.simdInternal_)) != 0); +} + +static inline SimdDouble gmx_simdcall +selectByMask(SimdDouble a, SimdDBool m) +{ + return { + float64x2_t(vandq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +selectByNotMask(SimdDouble a, SimdDBool m) +{ + return { + float64x2_t(vbicq_u64(uint64x2_t(a.simdInternal_), m.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +blend(SimdDouble a, SimdDouble b, SimdDBool sel) +{ + return { + vbslq_f64(sel.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator<<(SimdDInt32 a, int n) +{ + return { + vshl_n_s32(a.simdInternal_, n) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator>>(SimdDInt32 a, int n) +{ + return { + vshr_n_s32(a.simdInternal_, n) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator&(SimdDInt32 a, SimdDInt32 b) +{ + return { + vand_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +andNot(SimdDInt32 a, SimdDInt32 b) +{ + return { + vbic_s32(b.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator|(SimdDInt32 a, SimdDInt32 b) +{ + return { + vorr_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator^(SimdDInt32 a, SimdDInt32 b) +{ + return { + veor_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator+(SimdDInt32 a, SimdDInt32 b) +{ + return { + vadd_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator-(SimdDInt32 a, SimdDInt32 b) +{ + return { + vsub_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +operator*(SimdDInt32 a, SimdDInt32 b) +{ + return { + vmul_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDIBool gmx_simdcall +operator==(SimdDInt32 a, SimdDInt32 b) +{ + return { + vceq_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDIBool gmx_simdcall +testBits(SimdDInt32 a) +{ + return { + vtst_s32( a.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdDIBool gmx_simdcall +operator<(SimdDInt32 a, SimdDInt32 b) +{ + return { + vclt_s32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDIBool gmx_simdcall +operator&&(SimdDIBool a, SimdDIBool b) +{ + return { + vand_u32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline SimdDIBool gmx_simdcall +operator||(SimdDIBool a, SimdDIBool b) +{ + return { + vorr_u32(a.simdInternal_, b.simdInternal_) + }; +} + +static inline bool gmx_simdcall +anyTrue(SimdDIBool a) +{ + return (vmaxv_u32(a.simdInternal_) != 0); +} + +static inline SimdDInt32 gmx_simdcall +selectByMask(SimdDInt32 a, SimdDIBool m) +{ + return { + vand_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_)) + }; +} + +static inline SimdDInt32 gmx_simdcall +selectByNotMask(SimdDInt32 a, SimdDIBool m) +{ + return { + vbic_s32(a.simdInternal_, vreinterpret_s32_u32(m.simdInternal_)) + }; +} + +static inline SimdDInt32 gmx_simdcall +blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel) +{ + return { + vbsl_s32(sel.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} + +static inline SimdDInt32 gmx_simdcall +cvtR2I(SimdDouble a) +{ + return { + vmovn_s64(vcvtnq_s64_f64(a.simdInternal_)) + }; +} + +static inline SimdDInt32 gmx_simdcall +cvttR2I(SimdDouble a) +{ + return { + vmovn_s64(vcvtq_s64_f64(a.simdInternal_)) + }; +} + +static inline SimdDouble gmx_simdcall +cvtI2R(SimdDInt32 a) +{ + return { + vcvtq_f64_s64(vmovl_s32(a.simdInternal_)) + }; +} + +static inline SimdDIBool gmx_simdcall +cvtB2IB(SimdDBool a) +{ + return { + vqmovn_u64(a.simdInternal_) + }; +} + +static inline SimdDBool gmx_simdcall +cvtIB2B(SimdDIBool a) +{ + return { + vorrq_u64(vmovl_u32(a.simdInternal_), vshlq_n_u64(vmovl_u32(a.simdInternal_), 32)) + }; +} + +static inline void gmx_simdcall +cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1) +{ + d0->simdInternal_ = vcvt_f64_f32(vget_low_f32(f.simdInternal_)); + d1->simdInternal_ = vcvt_high_f64_f32(f.simdInternal_); +} + +static inline SimdFloat gmx_simdcall +cvtDD2F(SimdDouble d0, SimdDouble d1) +{ + return { + vcvt_high_f32_f64(vcvt_f32_f64(d0.simdInternal_), d1.simdInternal_) + }; +} + +} // namespace gmx + +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_DOUBLE_H diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h index ee8c3f1a7b..340f9298d9 100644 --- a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_float.h @@ -36,70 +36,92 @@ #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H -#include +#include "config.h" #include -#include "impl_arm_neon_asimd_common.h" - -/* NEON ASIMD always has FMA support, so make sure we use that for single too. */ -#undef simdFmaddF -#define simdFmaddF(a, b, c) vfmaq_f32(c, b, a) -#undef simdFmsubF -#define simdFmsubF(a, b, c) vnegq_f32(vfmsq_f32(c, b, a)) -#undef simdFnmaddF -#define simdFnmaddF(a, b, c) vfmsq_f32(c, b, a) -#undef simdFnmsubF -#define simdFnmsubF(a, b, c) vnegq_f32(vfmaq_f32(c, b, a)) - -/* The rounding instructions were actually added already in ARMv8, but most - * compilers did not add intrinsics for them. Make sure we use them for single - * precision too when enabling NEON Advanced SIMD. - */ -#undef simdRoundF -#define simdRoundF(x) vrndnq_f32(x) -#undef simdTruncF -#define simdTruncF(x) vrndq_f32(x) - -/* NEON Advanced SIMD has a real rounding conversion instruction */ -#undef simdCvtF2I -#define simdCvtF2I(x) vcvtnq_s32_f32(x) - -/* Since we redefine rounding/conversion-with-rounding, make - * sure we use the new operations by redefining the routine - * to set the exponent too. - */ -#undef simdSetExponentF -#define simdSetExponentF simdSetExponentF_arm_neon_asimd - -/* We can do more efficient reduce with vector pairwise arithmetic */ -#undef simdReduceF -#define simdReduceF(a) simdReduceF_arm_neon_asimd(a) - -/* Pick the largest unsigned integer as a shortcut for any-true */ -#undef simdAnyTrueFB -#define simdAnyTrueFB(x) (vmaxvq_u32(x) != 0) -#undef simdAnyTrueFIB -#define simdAnyTrueFIB(x) (vmaxvq_u32(x) != 0) - -/**************************************************** - * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS * - ****************************************************/ -static inline SimdFloat -simdSetExponentF_arm_neon_asimd(SimdFloat x) +#include "gromacs/simd/impl_arm_neon/impl_arm_neon_simd_float.h" + +namespace gmx +{ + +static inline SimdFloat gmx_simdcall +fma(SimdFloat a, SimdFloat b, SimdFloat c) { - int32x4_t iexp = vcvtnq_s32_f32(x); + return { + vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) + }; +} - iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23); - return vreinterpretq_f32_s32(iexp); +static inline SimdFloat gmx_simdcall +fms(SimdFloat a, SimdFloat b, SimdFloat c) +{ + return { + vnegq_f32(vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) + }; } -static inline float -simdReduceF_arm_neon_asimd(SimdFloat a) +static inline SimdFloat gmx_simdcall +fnma(SimdFloat a, SimdFloat b, SimdFloat c) { - a = vpaddq_f32(a, a); - a = vpaddq_f32(a, a); - return vgetq_lane_f32(a, 0); + return { + vfmsq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_) + }; } -#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H */ +static inline SimdFloat gmx_simdcall +fnms(SimdFloat a, SimdFloat b, SimdFloat c) +{ + return { + vnegq_f32(vfmaq_f32(c.simdInternal_, b.simdInternal_, a.simdInternal_)) + }; +} + +static inline SimdFloat gmx_simdcall +round(SimdFloat x) +{ + return { + vrndnq_f32(x.simdInternal_) + }; +} + +static inline SimdFloat gmx_simdcall +trunc(SimdFloat x) +{ + return { + vrndq_f32(x.simdInternal_) + }; +} + +static inline SimdFInt32 gmx_simdcall +cvtR2I(SimdFloat a) +{ + return { + vcvtnq_s32_f32(a.simdInternal_) + }; +} + +static inline bool gmx_simdcall +anyTrue(SimdFBool a) +{ + return (vmaxvq_u32(a.simdInternal_) != 0); +} + +static inline bool gmx_simdcall +anyTrue(SimdFIBool a) +{ + return (vmaxvq_u32(a.simdInternal_) != 0); +} + +static inline float gmx_simdcall +reduce(SimdFloat a) +{ + float32x4_t b = a.simdInternal_; + b = vpaddq_f32(b, b); + b = vpaddq_f32(b, b); + return vgetq_lane_f32(b, 0); +} + +} // namespace gmx + +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_SIMD_FLOAT_H diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h new file mode 100644 index 0000000000..45abcbbe16 --- /dev/null +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_double.h @@ -0,0 +1,306 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014,2015, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H +#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H + +#include "config.h" + +#include +#include +#include + +#include + +#include "gromacs/utility/basedefinitions.h" + +#include "impl_arm_neon_asimd_simd_double.h" + +namespace gmx +{ + +template +static gmx_inline void gmx_simdcall +gatherLoadTranspose(const double * base, + const std::int32_t offset[], + SimdDouble * v0, + SimdDouble * v1, + SimdDouble * v2, + SimdDouble * v3) +{ + float64x2_t t1, t2, t3, t4; + + assert(std::size_t(offset) % 8 == 0); + assert(std::size_t(base) % 16 == 0); + assert(align % 2 == 0); + + t1 = vld1q_f64(base + align * offset[0]); + t2 = vld1q_f64(base + align * offset[1]); + t3 = vld1q_f64(base + align * offset[0] + 2); + t4 = vld1q_f64(base + align * offset[1] + 2); + v0->simdInternal_ = vuzp1q_f64(t1, t2); + v1->simdInternal_ = vuzp2q_f64(t1, t2); + v2->simdInternal_ = vuzp1q_f64(t3, t4); + v3->simdInternal_ = vuzp2q_f64(t3, t4); +} + +template +static gmx_inline void gmx_simdcall +gatherLoadTranspose(const double * base, + const std::int32_t offset[], + SimdDouble * v0, + SimdDouble * v1) +{ + float64x2_t t1, t2; + + assert(std::size_t(offset) % 8 == 0); + assert(std::size_t(base) % 16 == 0); + assert(align % 2 == 0); + + t1 = vld1q_f64(base + align * offset[0]); + t2 = vld1q_f64(base + align * offset[1]); + v0->simdInternal_ = vuzp1q_f64(t1, t2); + v1->simdInternal_ = vuzp2q_f64(t1, t2); +} + +static const int c_simdBestPairAlignmentDouble = 2; + +template +static gmx_inline void gmx_simdcall +gatherLoadUTranspose(const double * base, + const std::int32_t offset[], + SimdDouble * v0, + SimdDouble * v1, + SimdDouble * v2) +{ + float64x2_t t1, t2; + float64x1_t t3, t4; + + assert(std::size_t(offset) % 8 == 0); + + t1 = vld1q_f64(base + align * offset[0]); + t2 = vld1q_f64(base + align * offset[1]); + t3 = vld1_f64(base + align * offset[0] + 2); + t4 = vld1_f64(base + align * offset[1] + 2); + v0->simdInternal_ = vuzp1q_f64(t1, t2); + v1->simdInternal_ = vuzp2q_f64(t1, t2); + v2->simdInternal_ = vcombine_f64(t3, t4); +} + +template +static gmx_inline void gmx_simdcall +transposeScatterStoreU(double * base, + const std::int32_t offset[], + SimdDouble v0, + SimdDouble v1, + SimdDouble v2) +{ + float64x2_t t0, t1; + + assert(std::size_t(offset) % 8 == 0); + + t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); + t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); + vst1q_f64(base + align * offset[0], t0); + vst1q_f64(base + align * offset[1], t1); + vst1_f64(base + align * offset[0] + 2, vget_low_f64(v2.simdInternal_)); + vst1_f64(base + align * offset[1] + 2, vget_high_f64(v2.simdInternal_)); +} + +template +static gmx_inline void gmx_simdcall +transposeScatterIncrU(double * base, + const std::int32_t offset[], + SimdDouble v0, + SimdDouble v1, + SimdDouble v2) +{ + float64x2_t t0, t1, t2; + float64x1_t t3; + + assert(std::size_t(offset) % 8 == 0); + + t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0 + t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1 + + t2 = vld1q_f64(base + align * offset[0]); + t2 = vaddq_f64(t2, t0); + vst1q_f64(base + align * offset[0], t2); + + t3 = vld1_f64(base + align * offset[0] + 2); + t3 = vadd_f64(t3, vget_low_f64(v2.simdInternal_)); + vst1_f64(base + align * offset[0] + 2, t3); + + t2 = vld1q_f64(base + align * offset[1]); + t2 = vaddq_f64(t2, t1); + vst1q_f64(base + align * offset[1], t2); + + t3 = vld1_f64(base + align * offset[1] + 2); + t3 = vadd_f64(t3, vget_high_f64(v2.simdInternal_)); + vst1_f64(base + align * offset[1] + 2, t3); +} + +template +static gmx_inline void gmx_simdcall +transposeScatterDecrU(double * base, + const std::int32_t offset[], + SimdDouble v0, + SimdDouble v1, + SimdDouble v2) +{ + float64x2_t t0, t1, t2; + float64x1_t t3; + + assert(std::size_t(offset) % 8 == 0); + + t0 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); // x0 y0 + t1 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); // x1 y1 + + t2 = vld1q_f64(base + align * offset[0]); + t2 = vsubq_f64(t2, t0); + vst1q_f64(base + align * offset[0], t2); + + t3 = vld1_f64(base + align * offset[0] + 2); + t3 = vsub_f64(t3, vget_low_f64(v2.simdInternal_)); + vst1_f64(base + align * offset[0] + 2, t3); + + t2 = vld1q_f64(base + align * offset[1]); + t2 = vsubq_f64(t2, t1); + vst1q_f64(base + align * offset[1], t2); + + t3 = vld1_f64(base + align * offset[1] + 2); + t3 = vsub_f64(t3, vget_high_f64(v2.simdInternal_)); + vst1_f64(base + align * offset[1] + 2, t3); +} + +static gmx_inline void gmx_simdcall +expandScalarsToTriplets(SimdDouble scalar, + SimdDouble * triplets0, + SimdDouble * triplets1, + SimdDouble * triplets2) +{ + triplets0->simdInternal_ = vuzp1q_f64(scalar.simdInternal_, scalar.simdInternal_); + triplets1->simdInternal_ = scalar.simdInternal_; + triplets2->simdInternal_ = vuzp2q_f64(scalar.simdInternal_, scalar.simdInternal_); +} + +template +static gmx_inline void gmx_simdcall +gatherLoadBySimdIntTranspose(const double * base, + SimdDInt32 offset, + SimdDouble * v0, + SimdDouble * v1, + SimdDouble * v2, + SimdDouble * v3) +{ + GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH) ioffset[GMX_SIMD_DINT32_WIDTH]; + + assert(std::size_t(base) % 16 == 0); + assert(align % 2 == 0); + + vst1_s32(ioffset, offset.simdInternal_); + gatherLoadTranspose(base, ioffset, v0, v1, v2, v3); +} + + +template +static gmx_inline void gmx_simdcall +gatherLoadBySimdIntTranspose(const double * base, + SimdDInt32 offset, + SimdDouble * v0, + SimdDouble * v1) +{ + GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH) ioffset[GMX_SIMD_DINT32_WIDTH]; + + assert(std::size_t(base) % 16 == 0); + assert(align % 2 == 0); + + vst1_s32(ioffset, offset.simdInternal_); + gatherLoadTranspose(base, ioffset, v0, v1); +} + +template +static gmx_inline void gmx_simdcall +gatherLoadUBySimdIntTranspose(const double * base, + SimdDInt32 offset, + SimdDouble * v0, + SimdDouble * v1) +{ + GMX_ALIGNED(int, GMX_SIMD_DINT32_WIDTH) ioffset[GMX_SIMD_DINT32_WIDTH]; + + vst1_s32(ioffset, offset.simdInternal_); + + float64x2_t t1, t2; + + t1 = vld1q_f64(base + align * ioffset[0]); + t2 = vld1q_f64(base + align * ioffset[1]); + v0->simdInternal_ = vuzp1q_f64(t1, t2); + v1->simdInternal_ = vuzp2q_f64(t1, t2); +} + + +static gmx_inline double gmx_simdcall +reduceIncr4ReturnSum(double * m, + SimdDouble v0, + SimdDouble v1, + SimdDouble v2, + SimdDouble v3) +{ + float64x2_t t1, t2, t3, t4; + + assert(std::size_t(m) % 8 == 0); + + t1 = vuzp1q_f64(v0.simdInternal_, v1.simdInternal_); + t2 = vuzp2q_f64(v0.simdInternal_, v1.simdInternal_); + t3 = vuzp1q_f64(v2.simdInternal_, v3.simdInternal_); + t4 = vuzp2q_f64(v2.simdInternal_, v3.simdInternal_); + + t1 = vaddq_f64(t1, t2); + t3 = vaddq_f64(t3, t4); + + t2 = vaddq_f64(t1, vld1q_f64(m)); + t4 = vaddq_f64(t3, vld1q_f64(m + 2)); + vst1q_f64(m, t2); + vst1q_f64(m + 2, t4); + + t1 = vaddq_f64(t1, t3); + t2 = vpaddq_f64(t1, t1); + + return vgetq_lane_f64(t2, 0); +} + +} // namespace gmx + +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_DOUBLE_H diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h similarity index 55% rename from src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h rename to src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h index 536b67c97a..177cc7ddda 100644 --- a/src/gromacs/simd/impl_arm_neon/impl_arm_neon_common.h +++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_util_float.h @@ -33,36 +33,9 @@ * the research papers on the package. Check out http://www.gromacs.org. */ -#ifndef GMX_SIMD_IMPL_ARM_NEON_COMMON_H -#define GMX_SIMD_IMPL_ARM_NEON_COMMON_H +#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_FLOAT_H +#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_FLOAT_H -/* Capability definitions for ARM 32-bit NEON */ -#define GMX_SIMD 1 -#define GMX_SIMD_HAVE_FLOAT 1 -#define GMX_SIMD_HAVE_DOUBLE 0 -#define GMX_SIMD_HAVE_LOADU 1 -#define GMX_SIMD_HAVE_STOREU 1 -#define GMX_SIMD_HAVE_LOGICAL 1 -#define GMX_SIMD_HAVE_FMA 1 -#define GMX_SIMD_HAVE_FRACTION 0 -#define GMX_SIMD_HAVE_FINT32 1 -#define GMX_SIMD_HAVE_FINT32_EXTRACT 1 -#define GMX_SIMD_HAVE_FINT32_LOGICAL 1 -#define GMX_SIMD_HAVE_FINT32_ARITHMETICS 1 -#define GMX_SIMD_HAVE_DINT32 0 -#define GMX_SIMD_HAVE_DINT32_EXTRACT 0 -#define GMX_SIMD_HAVE_DINT32_LOGICAL 0 -#define GMX_SIMD_HAVE_DINT32_ARITHMETICS 0 -#define GMX_SIMD4_HAVE_FLOAT 1 -#define GMX_SIMD4_HAVE_DOUBLE 0 +#include "gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h" -/* Implementation details */ -#define GMX_SIMD_FLOAT_WIDTH 4 -#undef GMX_SIMD_DOUBLE_WIDTH -#define GMX_SIMD_FINT32_WIDTH 4 -#undef GMX_SIMD_DINT32_WIDTH -#define GMX_SIMD4_WIDTH 4 -#define GMX_SIMD_RSQRT_BITS 8 -#define GMX_SIMD_RCP_BITS 8 - -#endif /* GMX_SIMD_IMPL_ARM_NEON_COMMON_H */ +#endif // GMX_SIMD_IMPL_ARM_NEON_ASIMD_UTIL_FLOAT_H diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h index 0d969933bd..e345aff722 100644 --- a/src/gromacs/simd/simd.h +++ b/src/gromacs/simd/simd.h @@ -108,6 +108,10 @@ # include "impl_x86_sse2/impl_x86_sse2.h" #elif GMX_SIMD_IBM_QPX # include "impl_ibm_qpx/impl_ibm_qpx.h" +#elif GMX_SIMD_ARM_NEON_ASIMD +# include "impl_arm_neon_asimd/impl_arm_neon_asimd.h" +#elif GMX_SIMD_ARM_NEON +# include "impl_arm_neon/impl_arm_neon.h" #elif (GMX_SIMD_REFERENCE || defined DOXYGEN) # include "impl_reference/impl_reference.h" // Includes doxygen documentation #else -- 2.11.4.GIT