/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H

#include "config.h"

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"

namespace gmx
{
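
/* Gather four 4-float rows from base + align*offset[i] and return them
 * transposed, so that each output vector holds one field of all four rows.
 * Minimal usage sketch (xyzq and idx are illustrative names, not part of
 * this file's API):
 *
 *   SimdFloat x, y, z, q;
 *   // xyzq: packed {x,y,z,q} records; idx: four record indices
 *   gatherLoadTranspose<4>(xyzq, idx, &x, &y, &z, &q);
 */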
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1,
                    SimdFloat *          v2,
                    SimdFloat *          v3)
{
    __vector float l0, l1, l2, l3;

    l0 = simdLoad( base + align * offset[0] ).simdInternal_;
    l1 = simdLoad( base + align * offset[1] ).simdInternal_;
    l2 = simdLoad( base + align * offset[2] ).simdInternal_;
    l3 = simdLoad( base + align * offset[3] ).simdInternal_;

    __vector float t0 = vec_mergeh(l0, l2);
    __vector float t1 = vec_mergel(l0, l2);
    __vector float t2 = vec_mergeh(l1, l3);
    __vector float t3 = vec_mergel(l1, l3);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
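
/* Two-output variant: only the first two floats of each record are needed,
 * so each pair is fetched with a single 8-byte load (hence the double
 * reinterpret casts) before the 2x4 transpose.
 */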
template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1)
{
    __vector float t0, t1, t2, t3;

    t0 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[0])));
    t1 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[1])));
    t2 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[2])));
    t3 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[3])));
    t0 = vec_mergeh(t0, t2);
    t1 = vec_mergeh(t1, t3);
    v0->simdInternal_ = vec_mergeh(t0, t1);
    v1->simdInternal_ = vec_mergel(t0, t1);
}
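
/* With align == 2 (and an 8-byte-aligned base), the pair loads above
 * collapse to one aligned 8-byte access per row, which is presumably why
 * 2 is advertised as the best pair alignment here.
 */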
static const int c_simdBestPairAlignmentFloat = 2;
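
/* Unaligned-friendly three-output gather. When align is a multiple of 4 the
 * full-width transpose above can be reused; otherwise each row is assembled
 * from one 8-byte load (x, y) plus one scalar splat (z). Illustrative sketch
 * for packed xyz coordinates (xyz and idx are hypothetical names):
 *
 *   SimdFloat x, y, z;
 *   gatherLoadUTranspose<3>(xyz, idx, &x, &y, &z);
 */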
template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const float *        base,
                     const std::int32_t   offset[],
                     SimdFloat *          v0,
                     SimdFloat *          v1,
                     SimdFloat *          v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float               t1, t2, t3, t4, t5, t6, t7, t8;
        const __vector unsigned char perm_lo2hi = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };

        t1 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[0])));
        t2 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[1])));
        t3 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[2])));
        t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[3])));
        t5 = vec_splats( *(base + align * offset[0] + 2) );
        t6 = vec_splats( *(base + align * offset[1] + 2) );
        t7 = vec_splats( *(base + align * offset[2] + 2) );
        t8 = vec_splats( *(base + align * offset[3] + 2) );

        t1 = vec_mergeh(t1, t2);
        t3 = vec_mergeh(t3, t4);
        v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
        v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
        t5 = vec_mergeh(t5, t6);
        t7 = vec_mergeh(t7, t8);
        v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
    }
}
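
/* Inverse of the three-output gather: transpose back and scatter-store.
 * Each row is written as one 8-byte store of (x, y) plus a scalar store of
 * the third element, so no full-width unaligned store is required.
 */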
// gcc-4.9 does not recognize that the argument to vec_extract() is used
template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(float *              base,
                       const std::int32_t   offset[],
                       SimdFloat            v0,
                       SimdFloat            v1,
                       SimdFloat gmx_unused v2)
{
    __vector float t1, t2;

    t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
    *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t1), 0);
    base[align*offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
    *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t1), 1);
    base[align*offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
    *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t2), 0);
    base[align*offset[2] + 2] = vec_extract(v2.simdInternal_, 2);
    *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t2), 1);
    base[align*offset[3] + 2] = vec_extract(v2.simdInternal_, 3);
}
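
/* Scatter-increment: a read-modify-write that adds element i of (v0, v1, v2)
 * to the three floats at base + align*offset[i]. For align >= 4, a
 * zero-padded fourth component lets each row use full width-4
 * load/add/store operations. Illustrative sketch for force accumulation
 * (forces, idx, fx, fy, fz are hypothetical names):
 *
 *   transposeScatterIncrU<4>(forces, idx, fx, fy, fz);
 */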
template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };
        __vector float               t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[0])));
        t4 = vec_add(t4, t0);
        *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 0);
            base[align*offset[0] + 2] += extracted;
        }

        t5 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[1])));
        t5 = vec_add(t5, t1);
        *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 1);
            base[align*offset[1] + 2] += extracted;
        }

        t6 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[2])));
        t6 = vec_add(t6, t2);
        *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 2);
            base[align*offset[2] + 2] += extracted;
        }

        t7 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[3])));
        t7 = vec_add(t7, t3);
        *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 3);
            base[align*offset[3] + 2] += extracted;
        }
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0f));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0f));
        v0.simdInternal_ = vec_mergeh(t0, t2);
        v1.simdInternal_ = vec_mergel(t0, t2);
        v2.simdInternal_ = vec_mergeh(t1, t3);
        v3.simdInternal_ = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3);
    }
}
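
/* Scatter-decrement: the mirror image of transposeScatterIncrU, subtracting
 * element i of (v0, v1, v2) from memory instead of adding it.
 */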
template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };
        __vector float               t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[0])));
        t4 = vec_sub(t4, t0);
        *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 0);
            base[align*offset[0] + 2] -= extracted;
        }

        t5 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[1])));
        t5 = vec_sub(t5, t1);
        *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 1);
            base[align*offset[1] + 2] -= extracted;
        }

        t6 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[2])));
        t6 = vec_sub(t6, t2);
        *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 2);
            base[align*offset[2] + 2] -= extracted;
        }

        t7 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[3])));
        t7 = vec_sub(t7, t3);
        *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 3);
            base[align*offset[3] + 2] -= extracted;
        }
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0f));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0f));
        v0.simdInternal_ = vec_mergeh(t0, t2);
        v1.simdInternal_ = vec_mergel(t0, t2);
        v2.simdInternal_ = vec_mergeh(t1, t3);
        v3.simdInternal_ = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3);
    }
}
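
/* Expand {s0, s1, s2, s3} into the three triplet vectors
 *   {s0, s0, s0, s1}, {s1, s1, s2, s2}, {s2, s3, s3, s3},
 * i.e. each scalar repeated three times across the outputs, matching the
 * layout of packed xyz triplets.
 */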
static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat    scalar,
                        SimdFloat *  triplets0,
                        SimdFloat *  triplets1,
                        SimdFloat *  triplets2)
{
    // These permutes will be translated to immediate permutes (xxpermdi)
    // since they operate on doublewords, which will be faster than loading
    // the constants required for fully flexible permutes.
    // (although the real reason was that the latter was buggy on xlc-13.1).
    __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
    __vector float         t0, t1;

    t0 = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
    t1 = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
    triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
}
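
/* Same gathers as above, but with the offsets supplied in a SIMD integer
 * register: the offsets are first spilled to an aligned scalar buffer and
 * then forwarded to gatherLoadTranspose.
 */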
/* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
   leading to segfaults. Possibly the calling convention doesn't
   implement __vector int correctly. Release mode is OK. gcc is OK. */
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1,
                             SimdFloat *    v2,
                             SimdFloat *    v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float *  base,
                              SimdFInt32     offset,
                              SimdFloat *    v0,
                              SimdFloat *    v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
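
/* Transpose v0..v3 so that summing the results yields the four per-vector
 * sums in one register; those are incremented into m[0..3], and the grand
 * total is returned. In other words, if sN = sum(vN), then m[N] += sN and
 * the return value is s0 + s1 + s2 + s3.
 */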
static inline float gmx_simdcall
reduceIncr4ReturnSum(float *    m,
                     SimdFloat  v0,
                     SimdFloat  v1,
                     SimdFloat  v2,
                     SimdFloat  v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_ = vec_mergeh(t0, t2);
    v1.simdInternal_ = vec_mergel(t0, t2);
    v2.simdInternal_ = vec_mergeh(t1, t3);
    v3.simdInternal_ = vec_mergel(t1, t3);

    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

}      // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H