src/gromacs/simd/impl_intel_mic/impl_intel_mic_simd_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
  37 #define GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include <cmath>
  42 #include <cstdint>
  43
  44 #include <immintrin.h>
  45
  46 #include "impl_intel_mic_common.h"
  47
  48 /****************************************************
  49  *      SINGLE PRECISION SIMD IMPLEMENTATION        *
  50  ****************************************************/
  51 #define SimdFloat           __m512
  52 #define simdLoadF            _mm512_load_ps
  53 #define simdLoad1F(m)        _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
  54 #define simdSet1F            _mm512_set1_ps
  55 #define simdStoreF           _mm512_store_ps
  56 #define simdLoadUF           simdLoadUF_mic
  57 #define simdStoreUF          simdStoreUF_mic
  58 #define simdSetZeroF         _mm512_setzero_ps
  59 #define simdAddF             _mm512_add_ps
  60 #define simdSubF             _mm512_sub_ps
  61 #define simdMulF             _mm512_mul_ps
  62 #define simdFmaddF           _mm512_fmadd_ps
  63 #define simdFmsubF           _mm512_fmsub_ps
  64 #define simdFnmaddF          _mm512_fnmadd_ps
  65 #define simdFnmsubF          _mm512_fnmsub_ps
  66 #define simdAndF(a, b)        _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
  67 #define simdAndNotF(a, b)     _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
  68 #define simdOrF(a, b)         _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
  69 #define simdXorF(a, b)        _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
  70 #define simdRsqrtF           _mm512_rsqrt23_ps
  71 #define simdRcpF             _mm512_rcp23_ps
  72 #define simdAbsF(x)         simdAndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
  73 #define simdNegF(x)         _mm512_addn_ps(x, _mm512_setzero_ps())
  74 #define simdMaxF             _mm512_gmax_ps
  75 #define simdMinF             _mm512_gmin_ps
  76 #define simdRoundF(x)        _mm512_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
  77 #define simdTruncF(x)        _mm512_round_ps(x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
  78 #define simdFractionF(x)     _mm512_sub_ps(x, simdTruncF(x))
  79 #define simdGetExponentF(x) _mm512_getexp_ps(x)
  80 #define simdGetMantissaF(x) _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
  81 #define simdSetExponentF(x) simdSetExponentF_mic(x)
  82 /* integer datatype corresponding to float: SimdFInt32 */
  83 #define SimdFInt32          __m512i
  84 #define simdLoadFI           _mm512_load_epi32
  85 #define simdSet1FI           _mm512_set1_epi32
  86 #define simdStoreFI          _mm512_store_epi32
  87 #define simdLoadUFI          simdLoadUFI_mic
  88 #define simdStoreUFI         simdStoreUFI_mic
  89 #define simdExtractFI        simdExtractFI_mic
  90 #define simdSetZeroFI        _mm512_setzero_epi32
  91 #define simdCvtF2I(a)        _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
  92 #define simdCvttF2I(a)       _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
  93 #define simdCvtI2F(a)        _mm512_cvtfxpnt_round_adjustepi32_ps(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
  94 /* Integer logical ops on SimdFInt32 */
  95 #define simdSlliFI           _mm512_slli_epi32
  96 #define simdSrliFI           _mm512_srli_epi32
  97 #define simdAndFI            _mm512_and_epi32
  98 #define simdAndNotFI         _mm512_andnot_epi32
  99 #define simdOrFI             _mm512_or_epi32
 100 #define simdXorFI            _mm512_xor_epi32
 101 /* Integer arithmetic ops on SimdFInt32 */
 102 #define simdAddFI            _mm512_add_epi32
 103 #define simdSubFI            _mm512_sub_epi32
 104 #define simdMulFI            _mm512_mullo_epi32
 105 /* Boolean & comparison operations on SimdFloat */
 106 #define SimdFBool           __mmask16
 107 #define simdCmpEqF(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
 108 #define simdCmpLtF(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
 109 #define simdCmpLeF(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
 110 #define simdAndFB            _mm512_kand
 111 #define simdAndNotFB(a, b)   _mm512_knot(_mm512_kor(a, b))
 112 #define simdOrFB             _mm512_kor
 113 #define simdAnyTrueFB        _mm512_mask2int
 114 #define simdMaskF(a, sel)    _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
 115 #define simdMaskNotF(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
 116 #define simdBlendF(a, b, sel)    _mm512_mask_blend_ps(sel, a, b)
 117 #define simdReduceF(a)       _mm512_reduce_add_ps(a)
 118 /* Boolean & comparison operations on SimdFInt32 */
 119 #define SimdFIBool          __mmask16
 120 #define simdCmpEqFI(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
 121 #define simdCmpLtFI(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
 122 #define simdAndFIB           _mm512_kand
 123 #define simdOrFIB            _mm512_kor
 124 #define simdAnyTrueFIB       _mm512_mask2int
 125 #define simdMaskFI(a, sel)    _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
 126 #define simdMaskNotFI(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
 127 #define simdBlendFI(a, b, sel)    _mm512_mask_blend_epi32(sel, a, b)
 128 /* Conversions between different booleans */
 129 #define simdCvtFB2FIB(x)     (x)
 130 #define simdCvtFIB2FB(x)     (x)
 131
 132 /* MIC provides full single precision of some neat functions: */
 133 /* 1/sqrt(x) and 1/x work fine in simd_math.h, and won't use extra iterations */
 134 #define simdExp2F            simdExp2F_mic
 135 #define simdExpF             simdExpF_mic
 136 #define simdLogF             simdLogF_mic
 137
 138 /* load store float */
 139 static inline __m512 gmx_simdcall
 140 simdLoadUF_mic(const float * m)
 141 {
 142     return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
 143 }
 144
 145 static inline void gmx_simdcall
 146 simdStoreUF_mic(float * m, __m512 s)
 147 {
 148     _mm512_packstorelo_ps(m, s);
 149     _mm512_packstorehi_ps(m+16, s);
 150 }
 151
 152 /* load store fint32 */
 153 static inline __m512i gmx_simdcall
 154 simdLoadUFI_mic(const std::int32_t * m)
 155 {
 156     return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
 157 }
 158
 159 static inline void gmx_simdcall
 160 simdStoreUFI_mic(std::int32_t * m, __m512i s)
 161 {
 162     _mm512_packstorelo_epi32(m, s);
 163     _mm512_packstorehi_epi32(m+16, s);
 164 }
 165
 166 /* extract */
 167 static inline std::int32_t gmx_simdcall
 168 simdExtractFI_mic(SimdFInt32 a, int index)
 169 {
 170     int r;
 171     _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a);
 172     return r;
 173 }
 174
/* This is likely faster than the built-in scale operation (latency 8,
 * throughput 3), since we only work on the integer part and use shifts.
 * TODO: verify this, given that scale also only operates on the integer part.
 */
 178 static inline __m512 gmx_simdcall
 179 simdSetExponentF_mic(__m512 a)
 180 {
 181     __m512i       iexp         = simdCvtF2I(a);
 182
 183     const __m512i expbias      = _mm512_set1_epi32(127);
 184     iexp = _mm512_slli_epi32(_mm512_add_epi32(iexp, expbias), 23);
 185     return _mm512_castsi512_ps(iexp);
 186
 187     /* scale alternative:
 188        return _mm512_scale_ps(_mm512_set1_ps(1), iexp);
 189      */
 190 }
 191
 192 static inline __m512 gmx_simdcall
 193 simdExp2F_mic(__m512 x)
 194 {
 195     return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
 196 }
 197
 198 static inline __m512 gmx_simdcall
 199 simdExpF_mic(__m512 x)
 200 {
 201     const SimdFloat         argscale    = simdSet1F(1.44269504088896341f);
 202     const SimdFloat         invargscale = simdSet1F(-0.69314718055994528623f);
 203     __m512                  xscaled     = _mm512_mul_ps(x, argscale);
 204     __m512                  r           = simdExp2F_mic(xscaled);
 205
 206     /* simdExp2F_mic() provides 23 bits of accuracy, but we ruin some of that
 207      * with the argument scaling due to single-precision rounding, where the
 208      * rounding error is amplified exponentially. To correct this, we find the
 209      * difference between the scaled argument and the true one (extended precision
 210      * arithmetics does not appear to be necessary to fulfill our accuracy requirements)
 211      * and then multiply by the exponent of this correction since exp(a+b)=exp(a)*exp(b).
 212      * Note that this only adds two instructions (and maybe some constant loads).
 213      */
 214     x         = simdFmaddF(invargscale, xscaled, x);
 215     /* x will now be a _very_ small number, so approximate exp(x)=1+x.
 216      * We should thus apply the correction as r'=r*(1+x)=r+r*x
 217      */
 218     r         = simdFmaddF(r, x, r);
 219     return r;
 220 }
 221
 222 static inline __m512 gmx_simdcall
 223 simdLogF_mic(__m512 x)
 224 {
 225     return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
 226 }
 227
 228 #endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H */