From 2221393549638e0164015aad3e6b9ea3eb2a0a10 Mon Sep 17 00:00:00 2001 From: Erik Lindahl Date: Tue, 12 Dec 2017 15:54:56 +0100 Subject: [PATCH] Work around AVX-512 issues in gcc-5.4 and 7.1 Fixes compilation issues with mixed and double precision builds using AVX-512 SIMD with gcc-5.4 or gcc-7.1. Also tested with gcc-6.3, and Debug as well as Release builds for all three versions, all of which now pass the simd unit tests. Fixes #2325. Change-Id: I59c3ae0467b51412d1ebbb5b57a248534288a5db --- .../simd/impl_x86_avx_512/impl_x86_avx_512_util_double.h | 16 ++++++++++------ .../simd/impl_x86_avx_512/impl_x86_avx_512_util_float.h | 10 +++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_double.h b/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_double.h index b418f6064b..3ff77c61e0 100644 --- a/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_double.h +++ b/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_double.h @@ -96,7 +96,8 @@ gatherLoadBySimdIntTranspose(const double * base, SimdDInt32 offset, SimdDouble { offset = fastMultiply(offset); } - v->simdInternal_ = _mm512_i32gather_pd(offset.simdInternal_, base, sizeof(double)); + constexpr size_t scale = sizeof(double); + v->simdInternal_ = _mm512_i32gather_pd(offset.simdInternal_, base, scale); gatherLoadBySimdIntTranspose<1>(base+1, offset, Fargs ...); } @@ -130,13 +131,15 @@ transposeScatterStoreU(double * base, SimdDouble v2) { SimdDInt32 simdoffset = simdLoad(offset, SimdDInt32Tag()); + if (align > 1) { simdoffset = fastMultiply(simdoffset);; } - _mm512_i32scatter_pd(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(double)); - _mm512_i32scatter_pd(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double)); - _mm512_i32scatter_pd(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double)); + constexpr size_t scale = sizeof(double); + _mm512_i32scatter_pd(base, simdoffset.simdInternal_, v0.simdInternal_, scale); + _mm512_i32scatter_pd(&(base[1]), simdoffset.simdInternal_, v1.simdInternal_, scale); + _mm512_i32scatter_pd(&(base[2]), simdoffset.simdInternal_, v2.simdInternal_, scale); } template @@ -402,8 +405,9 @@ gatherLoadTransposeHsimd(const double * base0, idx = _mm256_inserti128_si256(_mm256_castsi128_si256(idx0), idx1, 1); - tmp1 = _mm512_i32gather_pd(idx, base0, sizeof(double)); //TODO: Might be faster to use invidual loads - tmp2 = _mm512_i32gather_pd(idx, base1, sizeof(double)); + constexpr size_t scale = sizeof(double); + tmp1 = _mm512_i32gather_pd(idx, base0, scale); //TODO: Might be faster to use invidual loads + tmp2 = _mm512_i32gather_pd(idx, base1, scale); v0->simdInternal_ = _mm512_shuffle_f64x2(tmp1, tmp2, 0x44 ); v1->simdInternal_ = _mm512_shuffle_f64x2(tmp1, tmp2, 0xEE ); diff --git a/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_float.h b/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_float.h index ee08c068e0..61548bba13 100644 --- a/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_float.h +++ b/src/gromacs/simd/impl_x86_avx_512/impl_x86_avx_512_util_float.h @@ -137,12 +137,12 @@ transposeScatterStoreU(float * base, { simdoffset = fastMultiply(simdoffset); } - constexpr int align_ = (align > 2) ? 1 : align; - _mm512_i32scatter_ps(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(float)*align_); - _mm512_i32scatter_ps(base+1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(float)*align_); - _mm512_i32scatter_ps(base+2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(float)*align_); -} + constexpr size_t scale = (align > 2) ? sizeof(float) : sizeof(float) * align; + _mm512_i32scatter_ps(base, simdoffset.simdInternal_, v0.simdInternal_, scale); + _mm512_i32scatter_ps(&(base[1]), simdoffset.simdInternal_, v1.simdInternal_, scale); + _mm512_i32scatter_ps(&(base[2]), simdoffset.simdInternal_, v2.simdInternal_, scale); +} template static inline void gmx_simdcall -- 2.11.4.GIT