src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd4_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
  37 #define GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include <assert.h>
  42 #include <math.h>
  43
  44 #include <immintrin.h>
  45
  46 #include "impl_intel_mic_common.h"
  47 #include "impl_intel_mic_simd_float.h"
  48
  49 /****************************************************
  50  *      SINGLE PRECISION SIMD4 IMPLEMENTATION       *
  51  ****************************************************/
/* Load and store are guaranteed to only access the 4 floats. All arithmetic operations
   only operate on the 4 elements (to avoid floating-point exceptions). But other operations
   are not guaranteed to leave the other 12 elements unmodified. E.g. setzero or blendzero
   set the upper 12 to zero. */
  56 #define Simd4Float           __m512
  57 #define simd4Mask              _mm512_int2mask(0xF)
  58 #define simd4LoadF(m)         _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
  59 #define simd4Load1F(m)        _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
  60 #define simd4Set1F            _mm512_set1_ps
  61 #define simd4StoreF           simd4StoreF_mic
  62 #define simd4LoadUF           simd4LoadUF_mic
  63 #define simd4StoreUF          simd4StoreUF_mic
  64 #define simd4SetZeroF         _mm512_setzero_ps
  65 #define simd4AddF(a, b)       _mm512_mask_add_ps(_mm512_undefined_ps(), simd4Mask, a, b)
  66 #define simd4SubF(a, b)       _mm512_mask_sub_ps(_mm512_undefined_ps(), simd4Mask, a, b)
  67 #define simd4MulF(a, b)       _mm512_mask_mul_ps(_mm512_undefined_ps(), simd4Mask, a, b)
  68 #define simd4FmaddF(a, b, c)  _mm512_mask_fmadd_ps(a, simd4Mask, b, c)
  69 #define simd4FmsubF(a, b, c)  _mm512_mask_fmsub_ps(a, simd4Mask, b, c)
  70 #define simd4FnmaddF(a, b, c) _mm512_mask_fnmadd_ps(a, simd4Mask, b, c)
  71 #define simd4FnmsubF(a, b, c) _mm512_mask_fnmsub_ps(a, simd4Mask, b, c)
  72 #define simd4AndF(a, b)       _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
  73 #define simd4AndNotF(a, b)    _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
  74 #define simd4OrF(a, b)        _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
  75 #define simd4XorF(a, b)       _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), simd4Mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
  76 #define simd4RsqrtF(a)        _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), simd4Mask, a)
  77 #define simd4AbsF(x)         simd4AndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
  78 #define simd4NegF(x)         _mm512_mask_addn_ps(_mm512_undefined_ps(), simd4Mask, x, _mm512_setzero_ps())
  79 #define simd4MaxF(a, b)       _mm512_mask_gmax_ps(_mm512_undefined_ps(), simd4Mask, a, b)
  80 #define simd4MinF(a, b)       _mm512_mask_gmin_ps(_mm512_undefined_ps(), simd4Mask, a, b)
  81 #define simd4RoundF(x)        _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
  82 #define simd4TruncF(x)        _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
  83 #define simd4DotProductF(a, b) _mm512_mask_reduce_add_ps(_mm512_int2mask(7), _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(7), a, b))
  84 #define Simd4FBool           __mmask16
  85 #define simd4CmpEqF(a, b)     _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_EQ_OQ)
  86 #define simd4CmpLtF(a, b)     _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_LT_OS)
  87 #define simd4CmpLeF(a, b)     _mm512_mask_cmp_ps_mask(simd4Mask, a, b, _CMP_LE_OS)
  88 #define simd4AndFB            _mm512_kand
  89 #define simd4OrFB             _mm512_kor
  90 #define simd4AnyTrueFB(x)     (_mm512_mask2int(x)&0xF)
  91 #define simd4MaskF(a, sel)    _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
  92 #define simd4MaskNotF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
  93 #define simd4BlendF(a, b, sel)    _mm512_mask_blend_ps(sel, a, b)
  94 #define simd4ReduceF(x)       _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x)
  95
  96 /* Implementation helpers */
  97
  98 /* load store simd4 */
  99 static inline void gmx_simdcall
 100 simd4StoreF_mic(float * m, __m512 s)
 101 {
 102     assert((size_t)m%16 == 0);
 103     _mm512_mask_packstorelo_ps(m, simd4Mask, s);
 104 }
 105
 106 static inline __m512 gmx_simdcall
 107 simd4LoadUF_mic(const float * m)
 108 {
 109     return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), simd4Mask, m), simd4Mask, m+16);
 110 }
 111
 112 static inline void gmx_simdcall
 113 simd4StoreUF_mic(float * m, __m512 s)
 114 {
 115     _mm512_mask_packstorelo_ps(m, simd4Mask, s);
 116     _mm512_mask_packstorehi_ps(m+16, simd4Mask, s);
 117 }
 118
 119 #endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H */