/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H

#include "config.h"

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"

namespace gmx
{
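
// Loads four consecutive floats from each of the addresses base + align * offset[i]
// (i = 0..3) and transposes them, so that v0 holds the first float from each of the
// four addresses, v1 the second, v2 the third, and v3 the fourth. The addresses must
// satisfy the alignment requirements of simdLoad().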
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    __vector float l0, l1, l2, l3;

    l0 = simdLoad(base + align * offset[0]).simdInternal_;
    l1 = simdLoad(base + align * offset[1]).simdInternal_;
    l2 = simdLoad(base + align * offset[2]).simdInternal_;
    l3 = simdLoad(base + align * offset[3]).simdInternal_;

    __vector float t0 = vec_mergeh(l0, l2);
    __vector float t1 = vec_mergel(l0, l2);
    __vector float t2 = vec_mergeh(l1, l3);
    __vector float t3 = vec_mergel(l1, l3);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
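
// Variant of the transposed gather above that loads only pairs: the two floats at
// base + align * offset[i] end up in element i of v0 and v1, respectively. Each pair
// is fetched with a single 64-bit load and splatted before the merge steps.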
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    __vector float t0, t1, t2, t3;

    t0 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
    t1 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
    t2 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
    t3 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
    t0                = vec_mergeh(t0, t2);
    t1                = vec_mergeh(t1, t3);
    v0->simdInternal_ = vec_mergeh(t0, t1);
    v1->simdInternal_ = vec_mergel(t0, t1);
}
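
// Preferred alignment, in float elements, for data accessed with the pair gather
// routine above: with an alignment of 2 every pair maps onto a single 64-bit word.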
static const int c_simdBestPairAlignmentFloat = 2;
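
// Gathers triplets (e.g. x/y/z coordinates) from base + align * offset[i] into v0, v1
// and v2. When align is a multiple of 4 the aligned four-way transpose above is reused;
// otherwise each triplet is assembled from a 64-bit load of the first two elements plus
// a scalar load of the third, so no SIMD-width alignment is required of the data.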
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                     const std::int32_t offset[],
                                                     SimdFloat*         v0,
                                                     SimdFloat*         v1,
                                                     SimdFloat*         v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float                t1, t2, t3, t4, t5, t6, t7, t8;
        const __vector unsigned char perm_lo2hi = { 0, 1, 2, 3, 4, 5, 6, 7,
                                                    16, 17, 18, 19, 20, 21, 22, 23 };
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
                                                    8, 9, 10, 11, 12, 13, 14, 15 };

        t1 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
        t2 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
        t3 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
        t4 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
        t5 = vec_splats(*(base + align * offset[0] + 2));
        t6 = vec_splats(*(base + align * offset[1] + 2));
        t7 = vec_splats(*(base + align * offset[2] + 2));
        t8 = vec_splats(*(base + align * offset[3] + 2));

        t1                = vec_mergeh(t1, t2);
        t3                = vec_mergeh(t3, t4);
        v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
        v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
        t5                = vec_mergeh(t5, t6);
        t7                = vec_mergeh(t7, t8);
        v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
    }
}
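
// Transposes v0, v1, v2 back into triplet (e.g. x/y/z) form and stores each triplet at
// base + align * offset[i]: the first two components are written with one 64-bit store,
// the third with a scalar store.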
// gcc-4.9 does not recognize that the argument to vec_extract() is used
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(float*               base,
                                                       const std::int32_t   offset[],
                                                       SimdFloat            v0,
                                                       SimdFloat            v1,
                                                       SimdFloat gmx_unused v2)
{
    __vector float t1, t2;

    t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
    *reinterpret_cast<double*>(base + align * offset[0]) =
            vec_extract(reinterpret_cast<__vector double>(t1), 0);
    base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
    *reinterpret_cast<double*>(base + align * offset[1]) =
            vec_extract(reinterpret_cast<__vector double>(t1), 1);
    base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
    *reinterpret_cast<double*>(base + align * offset[2]) =
            vec_extract(reinterpret_cast<__vector double>(t2), 0);
    base[align * offset[2] + 2] = vec_extract(v2.simdInternal_, 2);
    *reinterpret_cast<double*>(base + align * offset[3]) =
            vec_extract(reinterpret_cast<__vector double>(t2), 1);
    base[align * offset[3] + 2] = vec_extract(v2.simdInternal_, 3);
}
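
// Like transposeScatterStoreU, but adds the transposed triplets to the values already
// in memory at base + align * offset[i] instead of overwriting them. For align >= 4 the
// update can use full four-wide loads and stores; the padding element stays unchanged
// because the fourth lane added to it is zero.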
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
                                                    8, 9, 10, 11, 12, 13, 14, 15 };
        __vector float                t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
        t4 = vec_add(t4, t0);
        *reinterpret_cast<double*>(base + align * offset[0]) =
                vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 0);
            base[align * offset[0] + 2] += extracted;
        }

        t5 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
        t5 = vec_add(t5, t1);
        *reinterpret_cast<double*>(base + align * offset[1]) =
                vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 1);
            base[align * offset[1] + 2] += extracted;
        }

        t6 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
        t6 = vec_add(t6, t2);
        *reinterpret_cast<double*>(base + align * offset[2]) =
                vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 2);
            base[align * offset[2] + 2] += extracted;
        }

        t7 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
        t7 = vec_add(t7, t3);
        *reinterpret_cast<double*>(base + align * offset[3]) =
                vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 3);
            base[align * offset[3] + 2] += extracted;
        }
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
        v0.simdInternal_  = vec_mergeh(t0, t2);
        v1.simdInternal_  = vec_mergel(t0, t2);
        v2.simdInternal_  = vec_mergeh(t1, t3);
        v3.simdInternal_  = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3);
    }
}
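
// Same as transposeScatterIncrU, but subtracts the transposed triplets from the values
// in memory instead of adding them.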
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
                                                    8, 9, 10, 11, 12, 13, 14, 15 };
        __vector float                t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
        t4 = vec_sub(t4, t0);
        *reinterpret_cast<double*>(base + align * offset[0]) =
                vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 0);
            base[align * offset[0] + 2] -= extracted;
        }

        t5 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
        t5 = vec_sub(t5, t1);
        *reinterpret_cast<double*>(base + align * offset[1]) =
                vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 1);
            base[align * offset[1] + 2] -= extracted;
        }

        t6 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
        t6 = vec_sub(t6, t2);
        *reinterpret_cast<double*>(base + align * offset[2]) =
                vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 2);
            base[align * offset[2] + 2] -= extracted;
        }

        t7 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
        t7 = vec_sub(t7, t3);
        *reinterpret_cast<double*>(base + align * offset[3]) =
                vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 3);
            base[align * offset[3] + 2] -= extracted;
        }
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
        v0.simdInternal_  = vec_mergeh(t0, t2);
        v1.simdInternal_  = vec_mergel(t0, t2);
        v2.simdInternal_  = vec_mergeh(t1, t3);
        v3.simdInternal_  = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3);
    }
}
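
// Expands the four scalars s0 s1 s2 s3 of the input into three vectors holding each
// scalar three times in order:
//     triplets0 = s0 s0 s0 s1,  triplets1 = s1 s1 s2 s2,  triplets2 = s2 s3 s3 s3
// e.g. for multiplying a per-interaction scalar onto packed x/y/z triplets.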
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    // These permutes will be translated to immediate permutes (xxpermdi)
    // since they operate on doublewords, which will be faster than loading
    // the constants required for fully flexible permutes.
    // (although the real reason was that the latter was buggy on xlc-13.1).
    __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
    __vector float         t0, t1;

    t0 = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
    t1 = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
    triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
}
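
// Same transposed gather as gatherLoadTranspose, but with the offsets supplied as a
// SIMD integer vector: the offsets are spilled to an aligned stack buffer and the
// pointer-offset version above is reused.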
/* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
   leading to segfaults. Possibly the calling convention doesn't
   implement __vector int correctly. Release mode is OK. gcc is OK. */
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   offset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
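
// Reduces each of v0..v3 to a single sum, adds the sum of v0 to m[0], of v1 to m[1],
// of v2 to m[2] and of v3 to m[3], and returns the total of the four sums; used e.g.
// to accumulate per-dimension force contributions into a small output buffer.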
static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_  = vec_mergeh(t0, t2);
    v1.simdInternal_  = vec_mergel(t0, t2);
    v2.simdInternal_  = vec_mergeh(t1, t3);
    v3.simdInternal_  = vec_mergel(t1, t3);

    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

} // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H