Improve text writing in help output
[gromacs.git] / src / gromacs / simd / impl_intel_mic / impl_intel_mic_simd_float.h
blobc2b081554b3faab71d9b4c0172c05b8a26b5e419
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
37 #define GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H
39 #include "config.h"
41 #include <cmath>
42 #include <cstdint>
44 #include <immintrin.h>
46 #include "impl_intel_mic_common.h"
/****************************************************
 *      SINGLE PRECISION SIMD IMPLEMENTATION        *
 ****************************************************/
/* Every generic single-precision SIMD name below is mapped either directly
 * onto a 512-bit intrinsic or onto a *_mic helper function from this header.
 */

/* Data type, loads, stores and constants */
#define SimdFloat            __m512
#define simdLoadF            _mm512_load_ps
#define simdLoad1F(m)        _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
#define simdSet1F            _mm512_set1_ps
#define simdStoreF           _mm512_store_ps
#define simdLoadUF           simdLoadUF_mic
#define simdStoreUF          simdStoreUF_mic
#define simdSetZeroF         _mm512_setzero_ps

/* Arithmetic, including the fused multiply-add family */
#define simdAddF             _mm512_add_ps
#define simdSubF             _mm512_sub_ps
#define simdMulF             _mm512_mul_ps
#define simdFmaddF           _mm512_fmadd_ps
#define simdFmsubF           _mm512_fmsub_ps
#define simdFnmaddF          _mm512_fnmadd_ps
#define simdFnmsubF          _mm512_fnmsub_ps

/* Bitwise logic: operands are cast to integer vectors, combined with the
 * 32-bit integer logic intrinsics, and the result is cast back to float.
 */
#define simdAndF(a, b)       _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
#define simdAndNotF(a, b)    _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
#define simdOrF(a, b)        _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
#define simdXorF(a, b)       _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))

/* Approximate reciprocal square root and reciprocal */
#define simdRsqrtF           _mm512_rsqrt23_ps
#define simdRcpF             _mm512_rcp23_ps

/* Sign handling: abs masks with GMX_FLOAT_NEGZERO through simdAndNotF,
 * negation is written as _mm512_addn_ps with a zero second operand.
 */
#define simdAbsF(x)          simdAndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
#define simdNegF(x)          _mm512_addn_ps(x, _mm512_setzero_ps())

/* Min/max */
#define simdMaxF             _mm512_gmax_ps
#define simdMinF             _mm512_gmin_ps

/* Rounding; the fraction is x minus its truncated value */
#define simdRoundF(x)        _mm512_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
#define simdTruncF(x)        _mm512_round_ps(x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
#define simdFractionF(x)     _mm512_sub_ps(x, simdTruncF(x))

/* Exponent and mantissa access */
#define simdGetExponentF(x)  _mm512_getexp_ps(x)
#define simdGetMantissaF(x)  _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
#define simdSetExponentF(x)  simdSetExponentF_mic(x)
/* Integer data type paired with SimdFloat: SimdFInt32 */
#define SimdFInt32     __m512i

/* Loads, stores, constants and element extraction */
#define simdLoadFI     _mm512_load_epi32
#define simdSet1FI     _mm512_set1_epi32
#define simdStoreFI    _mm512_store_epi32
#define simdLoadUFI    simdLoadUFI_mic
#define simdStoreUFI   simdStoreUFI_mic
#define simdExtractFI  simdExtractFI_mic
#define simdSetZeroFI  _mm512_setzero_epi32

/* Float <-> integer conversions. simdCvtF2I requests round-to-nearest and
 * simdCvttF2I requests round-toward-zero; no exponent adjustment is applied.
 */
#define simdCvtF2I(a)  _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
#define simdCvttF2I(a) _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
#define simdCvtI2F(a)  _mm512_cvtfxpnt_round_adjustepi32_ps(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)

/* Shifts and bitwise logic on SimdFInt32 */
#define simdSlliFI     _mm512_slli_epi32
#define simdSrliFI     _mm512_srli_epi32
#define simdAndFI      _mm512_and_epi32
#define simdAndNotFI   _mm512_andnot_epi32
#define simdOrFI       _mm512_or_epi32
#define simdXorFI      _mm512_xor_epi32

/* Integer arithmetic on SimdFInt32 */
#define simdAddFI      _mm512_add_epi32
#define simdSubFI      _mm512_sub_epi32
#define simdMulFI      _mm512_mullo_epi32
/* Boolean & comparison operations on SimdFloat.
 *
 * Booleans are 16-bit mask registers (__mmask16), one bit per float element.
 */
#define SimdFBool              __mmask16
#define simdCmpEqF(a, b)       _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
#define simdCmpLtF(a, b)       _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
#define simdCmpLeF(a, b)       _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
#define simdAndFB              _mm512_kand
/* (NOT a) AND b, the same operand convention as simdAndNotF/simdAndNotFI.
 * The previous expansion, _mm512_knot(_mm512_kor(a, b)), computed NOR
 * ((NOT a) AND (NOT b)) instead.
 */
#define simdAndNotFB(a, b)     _mm512_kandn(a, b)
#define simdOrFB               _mm512_kor
#define simdAnyTrueFB          _mm512_mask2int
/* Masked selection: elements not picked by sel come from a zero vector */
#define simdMaskF(a, sel)      _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
#define simdMaskNotF(a, sel)   _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
#define simdBlendF(a, b, sel)  _mm512_mask_blend_ps(sel, a, b)
/* Horizontal sum of all elements */
#define simdReduceF(a)         _mm512_reduce_add_ps(a)
/* Boolean & comparison operations on SimdFInt32.
 *
 * The integer boolean is the same __mmask16 type as the float boolean.
 */
#define SimdFIBool              __mmask16
#define simdCmpEqFI(a, b)       _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
#define simdCmpLtFI(a, b)       _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
#define simdAndFIB              _mm512_kand
#define simdOrFIB               _mm512_kor
#define simdAnyTrueFIB          _mm512_mask2int
/* Masked selection: elements not picked by sel come from a zero vector */
#define simdMaskFI(a, sel)      _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
#define simdMaskNotFI(a, sel)   _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
#define simdBlendFI(a, b, sel)  _mm512_mask_blend_epi32(sel, a, b)

/* Conversions between different booleans: both are __mmask16, so these
 * pass the argument through unchanged.
 */
#define simdCvtFB2FIB(x)        (x)
#define simdCvtFIB2FB(x)        (x)
/* MIC provides some math functions at full single precision, so exp2, exp
 * and log are routed to the *_mic helpers in this header.
 * 1/sqrt(x) and 1/x need no override: they work fine in simd_math.h and
 * will not use extra iterations.
 */
#define simdExp2F  simdExp2F_mic
#define simdExpF   simdExpF_mic
#define simdLogF   simdLogF_mic
138 /* load store float */
139 static inline __m512 gmx_simdcall
140 simdLoadUF_mic(const float * m)
142 return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
145 static inline void gmx_simdcall
146 simdStoreUF_mic(float * m, __m512 s)
148 _mm512_packstorelo_ps(m, s);
149 _mm512_packstorehi_ps(m+16, s);
152 /* load store fint32 */
153 static inline __m512i gmx_simdcall
154 simdLoadUFI_mic(const std::int32_t * m)
156 return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
159 static inline void gmx_simdcall
160 simdStoreUFI_mic(std::int32_t * m, __m512i s)
162 _mm512_packstorelo_epi32(m, s);
163 _mm512_packstorehi_epi32(m+16, s);
166 /* extract */
167 static inline std::int32_t gmx_simdcall
168 simdExtractFI_mic(SimdFInt32 a, int index)
170 int r;
171 _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a);
172 return r;
175 /* This is likely faster than the built in scale operation (lat 8, t-put 3)
176 * since we only work on the integer part and use shifts. TODO: check. given that scale also only does integer
178 static inline __m512 gmx_simdcall
179 simdSetExponentF_mic(__m512 a)
181 __m512i iexp = simdCvtF2I(a);
183 const __m512i expbias = _mm512_set1_epi32(127);
184 iexp = _mm512_slli_epi32(_mm512_add_epi32(iexp, expbias), 23);
185 return _mm512_castsi512_ps(iexp);
187 /* scale alternative:
188 return _mm512_scale_ps(_mm512_set1_ps(1), iexp);
192 static inline __m512 gmx_simdcall
193 simdExp2F_mic(__m512 x)
195 return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
198 static inline __m512 gmx_simdcall
199 simdExpF_mic(__m512 x)
201 const SimdFloat argscale = simdSet1F(1.44269504088896341f);
202 const SimdFloat invargscale = simdSet1F(-0.69314718055994528623f);
203 __m512 xscaled = _mm512_mul_ps(x, argscale);
204 __m512 r = simdExp2F_mic(xscaled);
206 /* simdExp2F_mic() provides 23 bits of accuracy, but we ruin some of that
207 * with the argument scaling due to single-precision rounding, where the
208 * rounding error is amplified exponentially. To correct this, we find the
209 * difference between the scaled argument and the true one (extended precision
210 * arithmetics does not appear to be necessary to fulfill our accuracy requirements)
211 * and then multiply by the exponent of this correction since exp(a+b)=exp(a)*exp(b).
212 * Note that this only adds two instructions (and maybe some constant loads).
214 x = simdFmaddF(invargscale, xscaled, x);
215 /* x will now be a _very_ small number, so approximate exp(x)=1+x.
216 * We should thus apply the correction as r'=r*(1+x)=r+r*x
218 r = simdFmaddF(r, x, r);
219 return r;
222 static inline __m512 gmx_simdcall
223 simdLogF_mic(__m512 x)
225 return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
228 #endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD_FLOAT_H */