/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H

#include "config.h"

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"

namespace gmx
{

template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    __vector float l0, l1, l2, l3;

    l0 = simdLoad(base + align * offset[0]).simdInternal_;
    l1 = simdLoad(base + align * offset[1]).simdInternal_;
    l2 = simdLoad(base + align * offset[2]).simdInternal_;
    l3 = simdLoad(base + align * offset[3]).simdInternal_;

    // Two merge stages implement the 4x4 transpose of (l0, l1, l2, l3)
    __vector float t0 = vec_mergeh(l0, l2);
    __vector float t1 = vec_mergel(l0, l2);
    __vector float t2 = vec_mergeh(l1, l3);
    __vector float t3 = vec_mergel(l1, l3);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}

template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    __vector float t0, t1, t2, t3;

    t0 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
    t1 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
    t2 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
    t3 = reinterpret_cast<__vector float>(
            vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
    t0                = vec_mergeh(t0, t2);
    t1                = vec_mergeh(t1, t3);
    v0->simdInternal_ = vec_mergeh(t0, t1);
    v1->simdInternal_ = vec_mergel(t0, t1);
}

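// The two-output overload above loads the float pair at each offset as a single 64-bit
// value (hence the reinterpret_cast to double) and splats it, so only the two floats at
// each offset are actually read.
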
static const int c_simdBestPairAlignmentFloat = 2;

template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                     const std::int32_t offset[],
                                                     SimdFloat*         v0,
                                                     SimdFloat*         v1,
                                                     SimdFloat*         v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float               t1, t2, t3, t4, t5, t6, t7, t8;
        const __vector unsigned char perm_lo2hi = { 0,  1,  2,  3,  4,  5,  6,  7,
                                                    16, 17, 18, 19, 20, 21, 22, 23 };
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
                                                    8,  9,  10, 11, 12, 13, 14, 15 };

        t1 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[0])));
        t2 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[1])));
        t3 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[2])));
        t4 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<const double*>(base + align * offset[3])));
        t5 = vec_splats(*(base + align * offset[0] + 2));
        t6 = vec_splats(*(base + align * offset[1] + 2));
        t7 = vec_splats(*(base + align * offset[2] + 2));
        t8 = vec_splats(*(base + align * offset[3] + 2));

        t1                = vec_mergeh(t1, t2);
        t3                = vec_mergeh(t3, t4);
        v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
        v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
        t5                = vec_mergeh(t5, t6);
        t7                = vec_mergeh(t7, t8);
        v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
    }
}

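// In the unaligned branch above, vec_mergeh leaves the x values of two offsets in the low
// doubleword and the y values in the high doubleword; perm_lo2hi and perm_hi2lo then
// concatenate the matching halves to form the full x and y vectors.
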
// gcc-4.9 does not recognize that the argument to vec_extract() is used
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(float*               base,
                                                        const std::int32_t   offset[],
                                                        SimdFloat            v0,
                                                        SimdFloat            v1,
                                                        SimdFloat gmx_unused v2)
{
    __vector float t1, t2;

    t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
    *reinterpret_cast<double*>(base + align * offset[0]) =
            vec_extract(reinterpret_cast<__vector double>(t1), 0);
    base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
    *reinterpret_cast<double*>(base + align * offset[1]) =
            vec_extract(reinterpret_cast<__vector double>(t1), 1);
    base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
    *reinterpret_cast<double*>(base + align * offset[2]) =
            vec_extract(reinterpret_cast<__vector double>(t2), 0);
    base[align * offset[2] + 2] = vec_extract(v2.simdInternal_, 2);
    *reinterpret_cast<double*>(base + align * offset[3]) =
            vec_extract(reinterpret_cast<__vector double>(t2), 1);
    base[align * offset[3] + 2] = vec_extract(v2.simdInternal_, 3);
}

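// transposeScatterStoreU above writes the x,y pair for each offset with a single 64-bit
// store and the z component with a scalar store, so only three floats are touched per offset.
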
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
                                                    8,  9,  10, 11, 12, 13, 14, 15 };
        __vector float               t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
        t4 = vec_add(t4, t0);
        *reinterpret_cast<double*>(base + align * offset[0]) =
                vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 0);
            base[align * offset[0] + 2] += extracted;
        }

        t5 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
        t5 = vec_add(t5, t1);
        *reinterpret_cast<double*>(base + align * offset[1]) =
                vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 1);
            base[align * offset[1] + 2] += extracted;
        }

        t6 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
        t6 = vec_add(t6, t2);
        *reinterpret_cast<double*>(base + align * offset[2]) =
                vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 2);
            base[align * offset[2] + 2] += extracted;
        }

        t7 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
        t7 = vec_add(t7, t3);
        *reinterpret_cast<double*>(base + align * offset[3]) =
                vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 3);
            base[align * offset[3] + 2] += extracted;
        }
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
        v0.simdInternal_  = vec_mergeh(t0, t2);
        v1.simdInternal_  = vec_mergel(t0, t2);
        v2.simdInternal_  = vec_mergeh(t1, t3);
        v3.simdInternal_  = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3);
    }
}

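// The align < 4 branch above updates only the three floats at each offset via 64-bit and
// scalar read-modify-write. The align >= 4 branch instead builds an {x, y, z, 0} vector per
// offset and uses full-width load/add/store; adding zero leaves the padding element unchanged.
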
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31,
                                                    8,  9,  10, 11, 12, 13, 14, 15 };
        __vector float               t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[0])));
        t4 = vec_sub(t4, t0);
        *reinterpret_cast<double*>(base + align * offset[0]) =
                vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 0);
            base[align * offset[0] + 2] -= extracted;
        }

        t5 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[1])));
        t5 = vec_sub(t5, t1);
        *reinterpret_cast<double*>(base + align * offset[1]) =
                vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 1);
            base[align * offset[1] + 2] -= extracted;
        }

        t6 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[2])));
        t6 = vec_sub(t6, t2);
        *reinterpret_cast<double*>(base + align * offset[2]) =
                vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 2);
            base[align * offset[2] + 2] -= extracted;
        }

        t7 = reinterpret_cast<__vector float>(
                vec_splats(*reinterpret_cast<double*>(base + align * offset[3])));
        t7 = vec_sub(t7, t3);
        *reinterpret_cast<double*>(base + align * offset[3]) =
                vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted             = vec_extract(v2.simdInternal_, 3);
            base[align * offset[3] + 2] -= extracted;
        }
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0F));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0F));
        v0.simdInternal_  = vec_mergeh(t0, t2);
        v1.simdInternal_  = vec_mergel(t0, t2);
        v2.simdInternal_  = vec_mergeh(t1, t3);
        v3.simdInternal_  = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3);
    }
}

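/* Illustrative usage sketch (not part of the original header): with align = 3 the data is
 * plain x,y,z triplets, so the align < 4 branch is taken and exactly three floats are
 * decremented at each offset. The array and index names are hypothetical.
 *
 *     transposeScatterDecrU<3>(forces, indices, fx, fy, fz);
 *     // forces[3 * indices[i] + 0..2] -= { fx[i], fy[i], fz[i] }
 */
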
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    // These permutes will be translated to immediate permutes (xxpermdi)
    // since they operate on doublewords, which will be faster than loading
    // the constants required for fully flexible permutes
    // (although the real reason was that the latter was buggy on xlc-13.1).
    __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
    __vector float         t0, t1;

    t0                       = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
    t1                       = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
    triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
}

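// expandScalarsToTriplets() above turns {s0, s1, s2, s3} into the three vectors
// {s0, s0, s0, s1}, {s1, s1, s2, s2} and {s2, s3, s3, s3}, i.e. each input element
// repeated three times across the outputs.
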
/* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
   leading to segfaults. Possibly the calling convention doesn't
   implement __vector int correctly. Release mode is OK. gcc is OK. */
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   offset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

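// The SimdFInt32-offset variants above simply spill the SIMD offsets to an aligned
// integer buffer and forward to the corresponding pointer-offset gathers.
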
static inline float gmx_simdcall
reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_  = vec_mergeh(t0, t2);
    v1.simdInternal_  = vec_mergel(t0, t2);
    v2.simdInternal_  = vec_mergeh(t1, t3);
    v3.simdInternal_  = vec_mergel(t1, t3);

    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
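    // v0 now holds the totals of the four original inputs; add them to m[0..3] and
    // return the overall sum.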
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

} // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H