/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H

#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vmx_definitions.h"
#include "impl_ibm_vmx_simd_float.h"

namespace gmx
{
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    *v0 = simdLoad(base + align * offset[0]);
    *v1 = simdLoad(base + align * offset[1]);
    *v2 = simdLoad(base + align * offset[2]);
    *v3 = simdLoad(base + align * offset[3]);

    // Two passes of merge-high/merge-low implement a full 4x4 transpose, so
    // each output vector ends up holding one field from all four loaded rows.
    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
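/* Illustrative usage sketch only (the array xyzq and the offsets below are
 * hypothetical, not part of this header; xyzq is assumed SIMD-aligned):
 * gather four aligned 4-float rows and transpose them into one vector per field.
 *
 *     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offsets[4] = { 0, 1, 2, 3 };
 *     SimdFloat                                x, y, z, q;
 *     gatherLoadTranspose<4>(xyzq, offsets, &x, &y, &z, &q);
 *     // x = { xyzq[0], xyzq[4], xyzq[8], xyzq[12] }, i.e. the first field of
 *     // each selected row; y, z and q hold the second, third and fourth.
 */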
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    if (align % 4 == 0)
    {
        SimdFloat t2, t3;

        gatherLoadTranspose<align>(base, offset, v0, v1, &t2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Fortunately, 99% of the usage should be the aligned-to-4
        // case above instead.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);
    }
}
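/* Best alignment, in float elements, for pairs of values that will be loaded
 * and stored with the two-vector transpose routines in this file.
 */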
static const int c_simdBestPairAlignmentFloat = 2;
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                     const std::int32_t offset[],
                                                     SimdFloat*         v0,
                                                     SimdFloat*         v1,
                                                     SimdFloat*         v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;

        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Unfortunately this is likely the most common case.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);

        t8  = vec_lde(0, base + align * offset[0] + 2);
        t9  = vec_lde(0, base + align * offset[1] + 2);
        t10 = vec_lde(0, base + align * offset[2] + 2);
        t11 = vec_lde(0, base + align * offset[3] + 2);
        p0  = vec_lvsl(0, base + align * offset[0] + 2);
        p1  = vec_lvsl(0, base + align * offset[1] + 2);
        p2  = vec_lvsl(0, base + align * offset[2] + 2);
        p3  = vec_lvsl(0, base + align * offset[3] + 2);
        t8  = vec_perm(t8, t8, p0);
        t9  = vec_perm(t9, t9, p1);
        t10 = vec_perm(t10, t10, p2);
        t11 = vec_perm(t11, t11, p3);
        t8  = vec_mergeh(t8, t10);
        t9  = vec_mergeh(t9, t11);
        v2->simdInternal_ = vec_mergeh(t8, t9);
    }
}
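/* transposeScatterStoreU() below is the store-side counterpart of
 * gatherLoadUTranspose(): for each SIMD lane i it writes the i-th elements of
 * v0, v1 and v2 as three consecutive floats starting at base + align * offset[i]
 * (e.g. the x, y and z components of one particle). Only those three floats
 * are written per lane, since vec_ste() stores a single element at a time.
 */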
template<int align>
static inline void gmx_simdcall
transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __vector unsigned char p0, p1, p2, p3;

    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v2.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v2.simdInternal_);
    __vector float t4 = vec_mergeh(t0, t2);
    __vector float t5 = vec_mergel(t0, t2);
    __vector float t6 = vec_mergeh(t1, t3);
    __vector float t7 = vec_mergel(t1, t3);

    p0 = vec_lvsr(0, base + align * offset[0]);
    p1 = vec_lvsr(0, base + align * offset[1]);
    p2 = vec_lvsr(0, base + align * offset[2]);
    p3 = vec_lvsr(0, base + align * offset[3]);

    t4 = vec_perm(t4, t4, p0);
    t5 = vec_perm(t5, t5, p1);
    t6 = vec_perm(t6, t6, p2);
    t7 = vec_perm(t7, t7, p3);

    // Store the three elements of each transposed row one float at a time;
    // vec_ste() only writes a single element, so unaligned targets are safe.
    vec_ste(t4, 0, base + align * offset[0]);
    vec_ste(t4, 4, base + align * offset[0]);
    vec_ste(t4, 8, base + align * offset[0]);
    vec_ste(t5, 0, base + align * offset[1]);
    vec_ste(t5, 4, base + align * offset[1]);
    vec_ste(t5, 8, base + align * offset[1]);
    vec_ste(t6, 0, base + align * offset[2]);
    vec_ste(t6, 4, base + align * offset[2]);
    vec_ste(t6, 8, base + align * offset[2]);
    vec_ste(t7, 0, base + align * offset[3]);
    vec_ste(t7, 4, base + align * offset[3]);
    vec_ste(t7, 8, base + align * offset[3]);
}
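/* Usage note (conceptual, with pseudo-indexing): transposeScatterIncrU()
 * accumulates instead of overwriting, i.e. for each lane i it performs
 *
 *     base[align * offset[i] + 0] += v0[i];
 *     base[align * offset[i] + 1] += v1[i];
 *     base[align * offset[i] + 2] += v2[i];
 *
 * which is the typical pattern when adding per-particle force contributions
 * back into a force array. transposeScatterDecrU() further below does the
 * same with subtraction.
 */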
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align % 4 == 0)
    {
        // Alignment to 4 floats lets us use full-width aligned load/add/store;
        // zero is merged into the fourth transposed element so that float
        // keeps its value.
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);

        vec_st(vec_add(vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st(vec_add(vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st(vec_add(vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st(vec_add(vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        // Fall back to spilling the SIMD registers to aligned scratch buffers
        // and doing the increments in scalar code.
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);

        base[align * offset[0] + 0] += rdata0[0];
        base[align * offset[0] + 1] += rdata1[0];
        base[align * offset[0] + 2] += rdata2[0];
        base[align * offset[1] + 0] += rdata0[1];
        base[align * offset[1] + 1] += rdata1[1];
        base[align * offset[1] + 2] += rdata2[1];
        base[align * offset[2] + 0] += rdata0[2];
        base[align * offset[2] + 1] += rdata1[2];
        base[align * offset[2] + 2] += rdata2[2];
        base[align * offset[3] + 0] += rdata0[3];
        base[align * offset[3] + 1] += rdata1[3];
        base[align * offset[3] + 2] += rdata2[3];
    }
}
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align % 4 == 0)
    {
        // Same as transposeScatterIncrU(), but subtracting the transposed rows.
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);

        vec_st(vec_sub(vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        // Fall back to spilling the SIMD registers to aligned scratch buffers
        // and doing the decrements in scalar code.
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);

        base[align * offset[0] + 0] -= rdata0[0];
        base[align * offset[0] + 1] -= rdata1[0];
        base[align * offset[0] + 2] -= rdata2[0];
        base[align * offset[1] + 0] -= rdata0[1];
        base[align * offset[1] + 1] -= rdata1[1];
        base[align * offset[1] + 2] -= rdata2[1];
        base[align * offset[2] + 0] -= rdata0[2];
        base[align * offset[2] + 1] -= rdata1[2];
        base[align * offset[2] + 2] -= rdata2[2];
        base[align * offset[3] + 0] -= rdata0[3];
        base[align * offset[3] + 1] -= rdata1[3];
        base[align * offset[3] + 2] -= rdata2[3];
    }
}
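/* Illustration: with the 4-wide SIMD used here, expandScalarsToTriplets()
 * replicates each input lane three times across consecutive output lanes, so
 * an input { a, b, c, d } yields
 *
 *     triplets0 = { a, a, a, b }
 *     triplets1 = { b, b, c, c }
 *     triplets2 = { c, d, d, d }
 *
 * matching the x/y/z triplet layout of per-particle data.
 */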
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    const __vector unsigned char perm0 = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm1 = { 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11 };
    const __vector unsigned char perm2 = { 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 };

    triplets0->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm2);
}
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   offset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    // Spill the SIMD integer offsets to memory and reuse the integer-offset gather.
    vec_st(offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    vec_st(offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
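/* Usage note (names are hypothetical): reduceIncr4ReturnSum() reduces each of
 * the four inputs to a single float, adds the four partial sums to m[0..3],
 * and returns their total, e.g.
 *
 *     float total = reduceIncr4ReturnSum(fshift, fx, fy, fz, fw);
 *
 * where fshift points to four accumulation floats and fx..fw are SIMD
 * accumulators.
 */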
static inline float gmx_simdcall
reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_  = vec_mergeh(t0, t2);
    v1.simdInternal_  = vec_mergel(t0, t2);
    v2.simdInternal_  = vec_mergeh(t1, t3);
    v3.simdInternal_  = vec_mergel(t1, t3);

    // After the transpose, summing the four vectors leaves the per-input sums
    // in the four lanes of v0; add them to m and return the grand total.
    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

} // namespace gmx

#endif // GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H