/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H

#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"

namespace gmx
{

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1,
                    SimdFloat *          v2,
                    SimdFloat *          v3)
{
    __vector float l0, l1, l2, l3;

    l0 = simdLoad( base + align * offset[0] ).simdInternal_;
    l1 = simdLoad( base + align * offset[1] ).simdInternal_;
    l2 = simdLoad( base + align * offset[2] ).simdInternal_;
    l3 = simdLoad( base + align * offset[3] ).simdInternal_;

    __vector float t0 = vec_mergeh(l0, l2);
    __vector float t1 = vec_mergel(l0, l2);
    __vector float t2 = vec_mergeh(l1, l3);
    __vector float t3 = vec_mergel(l1, l3);
    v0->simdInternal_  = vec_mergeh(t0, t2);
    v1->simdInternal_  = vec_mergel(t0, t2);
    v2->simdInternal_  = vec_mergeh(t1, t3);
    v3->simdInternal_  = vec_mergel(t1, t3);
}
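
/* Illustrative usage sketch (the array name and index values here are
 * hypothetical, not part of this header): gathering four aligned 4-float
 * records, e.g. x/y/z/q per particle, into transposed SIMD registers.
 *
 *     const std::int32_t idx[4] = { 0, 3, 5, 6 };
 *     SimdFloat          x, y, z, q;
 *     gatherLoadTranspose<4>(xyzq, idx, &x, &y, &z, &q); // align = 4 floats/record
 */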

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *        base,
                    const std::int32_t   offset[],
                    SimdFloat *          v0,
                    SimdFloat *          v1)
{
    __vector float t0, t1, t2, t3;

    t0 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[0])));
    t1 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[1])));
    t2 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[2])));
    t3 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[3])));
    t0 = vec_mergeh(t0, t2);
    t1 = vec_mergeh(t1, t3);
    v0->simdInternal_ = vec_mergeh(t0, t1);
    v1->simdInternal_ = vec_mergel(t0, t1);
}
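
/* The pair variant above loads each pair of consecutive floats as one
 * double and splats it, so both elements travel together; two merge steps
 * then separate the first from the second component. Hypothetical usage
 * sketch, with align = 2 matching c_simdBestPairAlignmentFloat below:
 *
 *     SimdFloat x, y;
 *     gatherLoadTranspose<2>(xy, idx, &x, &y);
 */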

static const int c_simdBestPairAlignmentFloat = 2;

template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const float *        base,
                     const std::int32_t   offset[],
                     SimdFloat *          v0,
                     SimdFloat *          v1,
                     SimdFloat *          v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float               t1, t2, t3, t4, t5, t6, t7, t8;
        const __vector unsigned char perm_lo2hi = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };

        t1 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[0])));
        t2 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[1])));
        t3 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[2])));
        t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<const double *>(base + align * offset[3])));
        t5 = vec_splats( *(base + align * offset[0] + 2) );
        t6 = vec_splats( *(base + align * offset[1] + 2) );
        t7 = vec_splats( *(base + align * offset[2] + 2) );
        t8 = vec_splats( *(base + align * offset[3] + 2) );

        t1                = vec_mergeh(t1, t2);
        t3                = vec_mergeh(t3, t4);
        v0->simdInternal_ = vec_perm(t1, t3, perm_lo2hi);
        v1->simdInternal_ = vec_perm(t3, t1, perm_hi2lo);
        t5                = vec_mergeh(t5, t6);
        t7                = vec_mergeh(t7, t8);
        v2->simdInternal_ = vec_perm(t5, t7, perm_lo2hi);
    }
}
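
/* Usage sketch with hypothetical names: loading x/y/z triplets (align = 3)
 * from an unaligned, unpadded coordinate array. Any align not divisible by
 * four takes the pair-plus-scalar path above; a padded align = 4 layout
 * reuses the aligned 4x4 transpose instead.
 *
 *     gatherLoadUTranspose<3>(&xyz[0][0], idx, &x, &y, &z);
 */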

// gcc-4.9 does not recognize that the argument to vec_extract() is used
template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(float *              base,
                       const std::int32_t   offset[],
                       SimdFloat            v0,
                       SimdFloat            v1,
                       SimdFloat gmx_unused v2)
{
    __vector float t1, t2;

    t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
    *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t1), 0);
    base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
    *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t1), 1);
    base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
    *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t2), 0);
    base[align * offset[2] + 2] = vec_extract(v2.simdInternal_, 2);
    *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t2), 1);
    base[align * offset[3] + 2] = vec_extract(v2.simdInternal_, 3);
}
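
/* Usage sketch, hypothetical names: writing force triplets back to a
 * 3-floats-per-index array. Each x/y pair leaves as a single 8-byte store
 * and the z component as a scalar store, so no neighboring data is touched.
 *
 *     transposeScatterStoreU<3>(&f[0][0], idx, fx, fy, fz);
 */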

template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };
        __vector float               t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[0])));
        t4 = vec_add(t4, t0);
        *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 0);
            base[align * offset[0] + 2] += extracted;
        }

        t5 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[1])));
        t5 = vec_add(t5, t1);
        *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 1);
            base[align * offset[1] + 2] += extracted;
        }

        t6 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[2])));
        t6 = vec_add(t6, t2);
        *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 2);
            base[align * offset[2] + 2] += extracted;
        }

        t7 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[3])));
        t7 = vec_add(t7, t3);
        *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 3);
            base[align * offset[3] + 2] += extracted;
        }
    }
    else
    {
        // Extra elements mean we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0f));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0f));
        v0.simdInternal_  = vec_mergeh(t0, t2);
        v1.simdInternal_  = vec_mergel(t0, t2);
        v2.simdInternal_  = vec_mergeh(t1, t3);
        v3.simdInternal_  = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) + v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) + v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) + v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) + v3);
    }
}
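
/* Usage sketch, hypothetical names: accumulating computed forces. With
 * align = 3 the pairwise read-modify-write path above is taken; with a
 * padded layout (align >= 4) whole 4-wide vectors are loaded, added, and
 * stored instead.
 *
 *     transposeScatterIncrU<3>(&f[0][0], idx, fx, fy, fz);
 */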

template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(float *              base,
                      const std::int32_t   offset[],
                      SimdFloat            v0,
                      SimdFloat            v1,
                      SimdFloat            v2)
{
    if (align < 4)
    {
        const __vector unsigned char perm_hi2lo = { 24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15 };
        __vector float               t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = vec_mergeh(v0.simdInternal_, v1.simdInternal_); // x0 y0 x1 y1
        t1 = vec_perm(t0, t0, perm_hi2lo);                   // x1 y1
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_); // x2 y2 x3 y3
        t3 = vec_perm(t2, t2, perm_hi2lo);                   // x3 y3

        t4 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[0])));
        t4 = vec_sub(t4, t0);
        *reinterpret_cast<double *>( base + align * offset[0] ) = vec_extract(reinterpret_cast<__vector double>(t4), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 0);
            base[align * offset[0] + 2] -= extracted;
        }

        t5 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[1])));
        t5 = vec_sub(t5, t1);
        *reinterpret_cast<double *>( base + align * offset[1] ) = vec_extract(reinterpret_cast<__vector double>(t5), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 1);
            base[align * offset[1] + 2] -= extracted;
        }

        t6 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[2])));
        t6 = vec_sub(t6, t2);
        *reinterpret_cast<double *>( base + align * offset[2] ) = vec_extract(reinterpret_cast<__vector double>(t6), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 2);
            base[align * offset[2] + 2] -= extracted;
        }

        t7 = reinterpret_cast<__vector float>(vec_splats(*reinterpret_cast<double *>(base + align * offset[3])));
        t7 = vec_sub(t7, t3);
        *reinterpret_cast<double *>( base + align * offset[3] ) = vec_extract(reinterpret_cast<__vector double>(t7), 0);
        {
            float extracted = vec_extract(v2.simdInternal_, 3);
            base[align * offset[3] + 2] -= extracted;
        }
    }
    else
    {
        // Extra elements mean we can use full width-4 load/store operations
        SimdFloat      v3;
        __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2 = vec_mergeh(v1.simdInternal_, vec_splats(0.0f));
        __vector float t3 = vec_mergel(v1.simdInternal_, vec_splats(0.0f));
        v0.simdInternal_  = vec_mergeh(t0, t2);
        v1.simdInternal_  = vec_mergel(t0, t2);
        v2.simdInternal_  = vec_mergeh(t1, t3);
        v3.simdInternal_  = vec_mergel(t1, t3);

        store(base + align * offset[0], simdLoad(base + align * offset[0]) - v0);
        store(base + align * offset[1], simdLoad(base + align * offset[1]) - v1);
        store(base + align * offset[2], simdLoad(base + align * offset[2]) - v2);
        store(base + align * offset[3], simdLoad(base + align * offset[3]) - v3);
    }
}
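
// transposeScatterDecrU mirrors transposeScatterIncrU exactly: identical
// shuffles, with vec_sub and -= in place of vec_add and +=, e.g. for
// subtracting the same interaction forces from the partner particles.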

static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat    scalar,
                        SimdFloat *  triplets0,
                        SimdFloat *  triplets1,
                        SimdFloat *  triplets2)
{
    // These permutes will be translated to immediate permutes (xxpermdi)
    // since they operate on doublewords, which will be faster than loading
    // the constants required for fully flexible permutes.
    // (although the real reason was that the latter was buggy on xlc-13.1).
    __vector unsigned char perm0 = { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
    __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
    __vector float         t0, t1;

    t0 = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
    t1 = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
    triplets0->simdInternal_ = vec_perm(t0, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(t0, t1, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, t1, perm2);
}
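
/* Resulting data movement, writing the input lanes as s0 s1 s2 s3:
 *
 *     triplets0 = s0 s0 s0 s1
 *     triplets1 = s1 s1 s2 s2
 *     triplets2 = s2 s3 s3 s3
 *
 * i.e. each scalar is replicated three times, matching the x/y/z layout
 * of one triplet per input element.
 */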

/* TODO In debug mode, xlc 13.1.5 seems to overwrite v0 on the stack,
   leading to segfaults. Possibly the calling convention doesn't
   implement __vector int correctly. Release mode is OK. gcc is OK. */
template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *  base,
                             SimdFInt32     offset,
                             SimdFloat *    v0,
                             SimdFloat *    v1,
                             SimdFloat *    v2,
                             SimdFloat *    v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
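
// All the SimdFInt32 offset variants follow the same pattern: spill the
// vector of indices to an aligned scalar buffer and forward to the
// corresponding integer-offset gather above, since VSX provides no
// hardware gather instruction.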

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float *   base,
                             SimdFInt32      offset,
                             SimdFloat *     v0,
                             SimdFloat *     v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float *  base,
                              SimdFInt32     offset,
                              SimdFloat *    v0,
                              SimdFloat *    v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

static inline float gmx_simdcall
reduceIncr4ReturnSum(float *    m,
                     SimdFloat  v0,
                     SimdFloat  v1,
                     SimdFloat  v2,
                     SimdFloat  v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_  = vec_mergeh(t0, t2);
    v1.simdInternal_  = vec_mergel(t0, t2);
    v2.simdInternal_  = vec_mergeh(t1, t3);
    v3.simdInternal_  = vec_mergel(t1, t3);

    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}
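
/* After the 4x4 transpose and the three adds, lane i of v0 holds the
 * horizontal sum of the original v<i>; those four sums are added into
 * m[0..3], and their grand total is returned. Hypothetical usage sketch:
 *
 *     alignas(GMX_SIMD_ALIGNMENT) float sums[4];
 *     float total = reduceIncr4ReturnSum(sums, a, b, c, d);
 */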

}      // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_FLOAT_H