/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H

#include "config.h"

#include <cstddef>
#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vmx_definitions.h"
#include "impl_ibm_vmx_simd_float.h"
namespace gmx
{
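
// Load four consecutive floats starting at each of the four addresses
// base + align * offset[i] and transpose, so that v0 receives element 0 of each
// row, v1 element 1, and so on. simdLoad() maps to the aligned vec_ld on VMX, so
// each source address must be 16-byte aligned.
//
// Usage sketch with hypothetical names (x/y/z/q data packed per atom, stride 4):
//
//     SimdFloat x, y, z, q;
//     gatherLoadTranspose<4>(xyzq, atomIndices, &x, &y, &z, &q);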
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    *v0 = simdLoad(base + align * offset[0]);
    *v1 = simdLoad(base + align * offset[1]);
    *v2 = simdLoad(base + align * offset[2]);
    *v3 = simdLoad(base + align * offset[3]);

    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
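
// Two-vector version: gather and transpose only the first two floats per row.
// When align is a multiple of 4 the full-width transpose above is reused and the
// two extra vectors are discarded; otherwise elements must be loaded one at a
// time, since a full vec_ld could read past the end of the allocated memory.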
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    if (align % 4 == 0)
    {
        SimdFloat t2, t3;

        gatherLoadTranspose<align>(base, offset, v0, v1, &t2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Fortunately, 99% of the usage should be the aligned-to-4
        // case above instead.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);
    }
}
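
// Best alignment to use for aligned pairs of float data.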
static const int c_simdBestPairAlignmentFloat = 2;
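
// Gather three floats (e.g. x/y/z coordinates) from each of four addresses that
// need not be vector-aligned, transposed into v0/v1/v2. The align-multiple-of-4
// case can reuse the full-width transpose above; otherwise elements are loaded
// one at a time, as explained in the comment inside.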
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                     const std::int32_t offset[],
                                                     SimdFloat*         v0,
                                                     SimdFloat*         v1,
                                                     SimdFloat*         v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Unfortunately this is likely the most common case.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);

        t8  = vec_lde(0, base + align * offset[0] + 2);
        t9  = vec_lde(0, base + align * offset[1] + 2);
        t10 = vec_lde(0, base + align * offset[2] + 2);
        t11 = vec_lde(0, base + align * offset[3] + 2);
        p0  = vec_lvsl(0, base + align * offset[0] + 2);
        p1  = vec_lvsl(0, base + align * offset[1] + 2);
        p2  = vec_lvsl(0, base + align * offset[2] + 2);
        p3  = vec_lvsl(0, base + align * offset[3] + 2);
        t8  = vec_perm(t8, t8, p0);
        t9  = vec_perm(t9, t9, p1);
        t10 = vec_perm(t10, t10, p2);
        t11 = vec_perm(t11, t11, p3);
        t8  = vec_mergeh(t8, t10);
        t9  = vec_mergeh(t9, t11);
        v2->simdInternal_ = vec_mergeh(t8, t9);
    }
}
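
// Transpose three SIMD vectors into four triplets { v0[i], v1[i], v2[i] } and
// store each triplet at base + align * offset[i]. The vec_lvsr/vec_perm rotation
// followed by three vec_ste element stores writes exactly 12 bytes per target, so
// no memory beyond each triplet is touched and only natural float alignment is
// needed.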
template<int align>
static inline void gmx_simdcall
transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __vector unsigned char p0, p1, p2, p3;

    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v2.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v2.simdInternal_);
    __vector float t4 = vec_mergeh(t0, t2);
    __vector float t5 = vec_mergel(t0, t2);
    __vector float t6 = vec_mergeh(t1, t3);
    __vector float t7 = vec_mergel(t1, t3);

    p0 = vec_lvsr(0, base + align * offset[0]);
    p1 = vec_lvsr(0, base + align * offset[1]);
    p2 = vec_lvsr(0, base + align * offset[2]);
    p3 = vec_lvsr(0, base + align * offset[3]);

    t4 = vec_perm(t4, t4, p0);
    t5 = vec_perm(t5, t5, p1);
    t6 = vec_perm(t6, t6, p2);
    t7 = vec_perm(t7, t7, p3);

    vec_ste(t4, 0, base + align * offset[0]);
    vec_ste(t4, 4, base + align * offset[0]);
    vec_ste(t4, 8, base + align * offset[0]);
    vec_ste(t5, 0, base + align * offset[1]);
    vec_ste(t5, 4, base + align * offset[1]);
    vec_ste(t5, 8, base + align * offset[1]);
    vec_ste(t6, 0, base + align * offset[2]);
    vec_ste(t6, 4, base + align * offset[2]);
    vec_ste(t6, 8, base + align * offset[2]);
    vec_ste(t7, 0, base + align * offset[3]);
    vec_ste(t7, 4, base + align * offset[3]);
    vec_ste(t7, 8, base + align * offset[3]);
}
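
// Read-modify-write: add the triplet { v0[i], v1[i], v2[i] } to the three floats
// at base + align * offset[i]. In the aligned path the fourth lane of each
// transposed vector is zeroed (v1 is merged with a zero vector), so the
// full-width load/add/store leaves the padding element unchanged; this assumes
// each target address is 16-byte aligned as vec_ld/vec_st require. The general
// path spills to aligned scratch arrays and updates memory scalar by scalar.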
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);

        vec_st(vec_add(vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st(vec_add(vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st(vec_add(vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st(vec_add(vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);

        base[align * offset[0] + 0] += rdata0[0];
        base[align * offset[0] + 1] += rdata1[0];
        base[align * offset[0] + 2] += rdata2[0];
        base[align * offset[1] + 0] += rdata0[1];
        base[align * offset[1] + 1] += rdata1[1];
        base[align * offset[1] + 2] += rdata2[1];
        base[align * offset[2] + 0] += rdata0[2];
        base[align * offset[2] + 1] += rdata1[2];
        base[align * offset[2] + 2] += rdata2[2];
        base[align * offset[3] + 0] += rdata0[3];
        base[align * offset[3] + 1] += rdata1[3];
        base[align * offset[3] + 2] += rdata2[3];
    }
}
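
// Same as transposeScatterIncrU() above, but subtracting the triplets from
// memory instead of adding them.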
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);

        vec_st(vec_sub(vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);

        base[align * offset[0] + 0] -= rdata0[0];
        base[align * offset[0] + 1] -= rdata1[0];
        base[align * offset[0] + 2] -= rdata2[0];
        base[align * offset[1] + 0] -= rdata0[1];
        base[align * offset[1] + 1] -= rdata1[1];
        base[align * offset[1] + 2] -= rdata2[1];
        base[align * offset[2] + 0] -= rdata0[2];
        base[align * offset[2] + 1] -= rdata1[2];
        base[align * offset[2] + 2] -= rdata2[2];
        base[align * offset[3] + 0] -= rdata0[3];
        base[align * offset[3] + 1] -= rdata1[3];
        base[align * offset[3] + 2] -= rdata2[3];
    }
}
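
// Expand { a, b, c, d } into { a, a, a, b }, { b, b, c, c } and { c, d, d, d },
// i.e. each scalar repeated three times across the outputs, e.g. for applying
// per-particle scalars to x/y/z triplets. Each 4-byte group in the permute masks
// selects one float: bytes 0-3 are element 0, bytes 4-7 element 1, and so on.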
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    const __vector unsigned char perm0 = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm1 = { 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11 };
    const __vector unsigned char perm2 = { 8,  9,  10, 11, 12, 13, 14, 15,
                                           12, 13, 14, 15, 12, 13, 14, 15 };

    triplets0->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm2);
}
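
// Same as the four-vector gatherLoadTranspose(), but with the offsets supplied
// in a SIMD integer register: they are spilled to an aligned scratch array and
// forwarded to the pointer-based version above.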
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   offset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    vec_st(offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
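
// Pair variant of the SIMD-integer-offset gather above; forwards to the
// two-vector gatherLoadTranspose().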
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    vec_st(offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
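
// Reduce each of v0..v3 to a scalar sum, add the four sums to m[0..3], and
// return the total of the four sums. The vec_ld/vec_st behind simdLoad() and
// store() require m to be 16-byte aligned.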
static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_  = vec_mergeh(t0, t2);
    v1.simdInternal_  = vec_mergel(t0, t2);
    v2.simdInternal_  = vec_mergeh(t1, t3);
    v3.simdInternal_  = vec_mergel(t1, t3);

    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

} // namespace gmx

#endif // GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H