2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
36 #define GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H
54 SimdFloat(float f
) : simdInternal_(vdupq_n_f32(f
)) {}
56 // Internal utility constructor to simplify return statements
57 SimdFloat(float32x4_t simd
) : simdInternal_(simd
) {}
59 float32x4_t simdInternal_
;
67 SimdFInt32(std::int32_t i
) : simdInternal_(vdupq_n_s32(i
)) {}
69 // Internal utility constructor to simplify return statements
70 SimdFInt32(int32x4_t simd
) : simdInternal_(simd
) {}
72 int32x4_t simdInternal_
;
80 SimdFBool(bool b
) : simdInternal_(vdupq_n_u32( b
? 0xFFFFFFFF : 0)) {}
82 // Internal utility constructor to simplify return statements
83 SimdFBool(uint32x4_t simd
) : simdInternal_(simd
) {}
85 uint32x4_t simdInternal_
;
93 SimdFIBool(bool b
) : simdInternal_(vdupq_n_u32( b
? 0xFFFFFFFF : 0)) {}
95 // Internal utility constructor to simplify return statements
96 SimdFIBool(uint32x4_t simd
) : simdInternal_(simd
) {}
98 uint32x4_t simdInternal_
;
101 static inline SimdFloat gmx_simdcall
102 simdLoad(const float *m
)
104 assert(std::size_t(m
) % 16 == 0);
110 static inline void gmx_simdcall
111 store(float *m
, SimdFloat a
)
113 assert(std::size_t(m
) % 16 == 0);
114 vst1q_f32(m
, a
.simdInternal_
);
117 static inline SimdFloat gmx_simdcall
118 simdLoadU(const float *m
)
125 static inline void gmx_simdcall
126 storeU(float *m
, SimdFloat a
)
128 vst1q_f32(m
, a
.simdInternal_
);
131 static inline SimdFloat gmx_simdcall
139 static inline SimdFInt32 gmx_simdcall
140 simdLoadFI(const std::int32_t * m
)
142 assert(std::size_t(m
) % 16 == 0);
148 static inline void gmx_simdcall
149 store(std::int32_t * m
, SimdFInt32 a
)
151 assert(std::size_t(m
) % 16 == 0);
152 vst1q_s32(m
, a
.simdInternal_
);
155 static inline SimdFInt32 gmx_simdcall
156 simdLoadUFI(const std::int32_t *m
)
163 static inline void gmx_simdcall
164 storeU(std::int32_t * m
, SimdFInt32 a
)
166 vst1q_s32(m
, a
.simdInternal_
);
169 static inline SimdFInt32 gmx_simdcall
177 template<int index
> gmx_simdcall
178 static inline std::int32_t
179 extract(SimdFInt32 a
)
181 return vgetq_lane_s32(a
.simdInternal_
, index
);
184 static inline SimdFloat gmx_simdcall
185 operator&(SimdFloat a
, SimdFloat b
)
188 vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a
.simdInternal_
),
189 vreinterpretq_s32_f32(b
.simdInternal_
)))
193 static inline SimdFloat gmx_simdcall
194 andNot(SimdFloat a
, SimdFloat b
)
197 vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b
.simdInternal_
),
198 vreinterpretq_s32_f32(a
.simdInternal_
)))
202 static inline SimdFloat gmx_simdcall
203 operator|(SimdFloat a
, SimdFloat b
)
206 vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a
.simdInternal_
),
207 vreinterpretq_s32_f32(b
.simdInternal_
)))
211 static inline SimdFloat gmx_simdcall
212 operator^(SimdFloat a
, SimdFloat b
)
215 vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a
.simdInternal_
),
216 vreinterpretq_s32_f32(b
.simdInternal_
)))
220 static inline SimdFloat gmx_simdcall
221 operator+(SimdFloat a
, SimdFloat b
)
224 vaddq_f32(a
.simdInternal_
, b
.simdInternal_
)
228 static inline SimdFloat gmx_simdcall
229 operator-(SimdFloat a
, SimdFloat b
)
232 vsubq_f32(a
.simdInternal_
, b
.simdInternal_
)
236 static inline SimdFloat gmx_simdcall
237 operator-(SimdFloat x
)
240 vnegq_f32(x
.simdInternal_
)
244 static inline SimdFloat gmx_simdcall
245 operator*(SimdFloat a
, SimdFloat b
)
248 vmulq_f32(a
.simdInternal_
, b
.simdInternal_
)
252 // Override for Neon-Asimd
253 #if GMX_SIMD_ARM_NEON
254 static inline SimdFloat gmx_simdcall
255 fma(SimdFloat a
, SimdFloat b
, SimdFloat c
)
258 #ifdef __ARM_FEATURE_FMA
259 vfmaq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)
261 vmlaq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)
266 static inline SimdFloat gmx_simdcall
267 fms(SimdFloat a
, SimdFloat b
, SimdFloat c
)
270 #ifdef __ARM_FEATURE_FMA
271 vnegq_f32(vfmsq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
))
273 vnegq_f32(vmlsq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
))
278 static inline SimdFloat gmx_simdcall
279 fnma(SimdFloat a
, SimdFloat b
, SimdFloat c
)
282 #ifdef __ARM_FEATURE_FMA
283 vfmsq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)
285 vmlsq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)
290 static inline SimdFloat gmx_simdcall
291 fnms(SimdFloat a
, SimdFloat b
, SimdFloat c
)
294 #ifdef __ARM_FEATURE_FMA
295 vnegq_f32(vfmaq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
))
297 vnegq_f32(vmlaq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
))
303 static inline SimdFloat gmx_simdcall
307 vrsqrteq_f32(x
.simdInternal_
)
311 static inline SimdFloat gmx_simdcall
312 rsqrtIter(SimdFloat lu
, SimdFloat x
)
315 vmulq_f32(lu
.simdInternal_
, vrsqrtsq_f32(vmulq_f32(lu
.simdInternal_
, lu
.simdInternal_
), x
.simdInternal_
))
319 static inline SimdFloat gmx_simdcall
323 vrecpeq_f32(x
.simdInternal_
)
327 static inline SimdFloat gmx_simdcall
328 rcpIter(SimdFloat lu
, SimdFloat x
)
331 vmulq_f32(lu
.simdInternal_
, vrecpsq_f32(lu
.simdInternal_
, x
.simdInternal_
))
335 static inline SimdFloat gmx_simdcall
336 maskAdd(SimdFloat a
, SimdFloat b
, SimdFBool m
)
338 b
.simdInternal_
= vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(b
.simdInternal_
),
342 vaddq_f32(a
.simdInternal_
, b
.simdInternal_
)
346 static inline SimdFloat gmx_simdcall
347 maskzMul(SimdFloat a
, SimdFloat b
, SimdFBool m
)
349 SimdFloat tmp
= a
* b
;
352 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp
.simdInternal_
),
357 static inline SimdFloat gmx_simdcall
358 maskzFma(SimdFloat a
, SimdFloat b
, SimdFloat c
, SimdFBool m
)
360 #ifdef __ARM_FEATURE_FMA
361 float32x4_t tmp
= vfmaq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
);
363 float32x4_t tmp
= vmlaq_f32(c
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
);
367 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(tmp
),
372 static inline SimdFloat gmx_simdcall
373 maskzRsqrt(SimdFloat x
, SimdFBool m
)
376 x
.simdInternal_
= vbslq_f32(m
.simdInternal_
, x
.simdInternal_
, vdupq_n_f32(1.0f
));
379 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrsqrteq_f32(x
.simdInternal_
)),
384 static inline SimdFloat gmx_simdcall
385 maskzRcp(SimdFloat x
, SimdFBool m
)
388 x
.simdInternal_
= vbslq_f32(m
.simdInternal_
, x
.simdInternal_
, vdupq_n_f32(1.0f
));
391 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(vrecpeq_f32(x
.simdInternal_
)),
396 static inline SimdFloat gmx_simdcall
400 vabsq_f32( x
.simdInternal_
)
404 static inline SimdFloat gmx_simdcall
405 max(SimdFloat a
, SimdFloat b
)
408 vmaxq_f32(a
.simdInternal_
, b
.simdInternal_
)
412 static inline SimdFloat gmx_simdcall
413 min(SimdFloat a
, SimdFloat b
)
416 vminq_f32(a
.simdInternal_
, b
.simdInternal_
)
420 // Round and trunc operations are defined at the end of this file, since they
421 // need to use float-to-integer and integer-to-float conversions.
423 static inline SimdFloat gmx_simdcall
424 frexp(SimdFloat value
, SimdFInt32
* exponent
)
426 const int32x4_t exponentMask
= vdupq_n_s32(0x7F800000);
427 const int32x4_t mantissaMask
= vdupq_n_s32(0x807FFFFF);
428 const int32x4_t exponentBias
= vdupq_n_s32(126); // add 1 to make our definition identical to frexp()
429 const float32x4_t half
= vdupq_n_f32(0.5f
);
432 iExponent
= vandq_s32(vreinterpretq_s32_f32(value
.simdInternal_
), exponentMask
);
433 iExponent
= vsubq_s32(vshrq_n_s32(iExponent
, 23), exponentBias
);
434 exponent
->simdInternal_
= iExponent
;
437 vreinterpretq_f32_s32(vorrq_s32(vandq_s32(vreinterpretq_s32_f32(value
.simdInternal_
),
439 vreinterpretq_s32_f32(half
)))
443 static inline SimdFloat gmx_simdcall
444 ldexp(SimdFloat value
, SimdFInt32 exponent
)
446 const int32x4_t exponentBias
= vdupq_n_s32(127);
449 iExponent
= vshlq_n_s32( vaddq_s32(exponent
.simdInternal_
, exponentBias
), 23);
452 vmulq_f32(value
.simdInternal_
, vreinterpretq_f32_s32(iExponent
))
456 // Override for Neon-Asimd
457 #if GMX_SIMD_ARM_NEON
458 static inline float gmx_simdcall
461 float32x4_t x
= a
.simdInternal_
;
462 float32x4_t y
= vextq_f32(x
, x
, 2);
465 y
= vextq_f32(x
, x
, 1);
467 return vgetq_lane_f32(x
, 0);
471 static inline SimdFBool gmx_simdcall
472 operator==(SimdFloat a
, SimdFloat b
)
475 vceqq_f32(a
.simdInternal_
, b
.simdInternal_
)
479 static inline SimdFBool gmx_simdcall
480 operator!=(SimdFloat a
, SimdFloat b
)
483 vmvnq_u32(vceqq_f32(a
.simdInternal_
, b
.simdInternal_
))
487 static inline SimdFBool gmx_simdcall
488 operator<(SimdFloat a
, SimdFloat b
)
491 vcltq_f32(a
.simdInternal_
, b
.simdInternal_
)
495 static inline SimdFBool gmx_simdcall
496 operator<=(SimdFloat a
, SimdFloat b
)
499 vcleq_f32(a
.simdInternal_
, b
.simdInternal_
)
503 static inline SimdFBool gmx_simdcall
504 testBits(SimdFloat a
)
506 uint32x4_t tmp
= vreinterpretq_u32_f32(a
.simdInternal_
);
513 static inline SimdFBool gmx_simdcall
514 operator&&(SimdFBool a
, SimdFBool b
)
518 vandq_u32(a
.simdInternal_
, b
.simdInternal_
)
522 static inline SimdFBool gmx_simdcall
523 operator||(SimdFBool a
, SimdFBool b
)
526 vorrq_u32(a
.simdInternal_
, b
.simdInternal_
)
530 // Override for Neon-Asimd
531 #if GMX_SIMD_ARM_NEON
532 static inline bool gmx_simdcall
535 uint32x4_t x
= a
.simdInternal_
;
536 uint32x4_t y
= vextq_u32(x
, x
, 2);
539 y
= vextq_u32(x
, x
, 1);
541 return (vgetq_lane_u32(x
, 0) != 0);
545 static inline SimdFloat gmx_simdcall
546 selectByMask(SimdFloat a
, SimdFBool m
)
549 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a
.simdInternal_
),
554 static inline SimdFloat gmx_simdcall
555 selectByNotMask(SimdFloat a
, SimdFBool m
)
558 vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a
.simdInternal_
),
563 static inline SimdFloat gmx_simdcall
564 blend(SimdFloat a
, SimdFloat b
, SimdFBool sel
)
567 vbslq_f32(sel
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)
571 static inline SimdFInt32 gmx_simdcall
572 operator<<(SimdFInt32 a
, int n
)
575 vshlq_n_s32(a
.simdInternal_
, n
)
579 static inline SimdFInt32 gmx_simdcall
580 operator>>(SimdFInt32 a
, int n
)
583 vshrq_n_s32(a
.simdInternal_
, n
)
587 static inline SimdFInt32 gmx_simdcall
588 operator&(SimdFInt32 a
, SimdFInt32 b
)
591 vandq_s32(a
.simdInternal_
, b
.simdInternal_
)
595 static inline SimdFInt32 gmx_simdcall
596 andNot(SimdFInt32 a
, SimdFInt32 b
)
599 vbicq_s32(b
.simdInternal_
, a
.simdInternal_
)
603 static inline SimdFInt32 gmx_simdcall
604 operator|(SimdFInt32 a
, SimdFInt32 b
)
607 vorrq_s32(a
.simdInternal_
, b
.simdInternal_
)
611 static inline SimdFInt32 gmx_simdcall
612 operator^(SimdFInt32 a
, SimdFInt32 b
)
615 veorq_s32(a
.simdInternal_
, b
.simdInternal_
)
619 static inline SimdFInt32 gmx_simdcall
620 operator+(SimdFInt32 a
, SimdFInt32 b
)
623 vaddq_s32(a
.simdInternal_
, b
.simdInternal_
)
627 static inline SimdFInt32 gmx_simdcall
628 operator-(SimdFInt32 a
, SimdFInt32 b
)
631 vsubq_s32(a
.simdInternal_
, b
.simdInternal_
)
635 static inline SimdFInt32 gmx_simdcall
636 operator*(SimdFInt32 a
, SimdFInt32 b
)
639 vmulq_s32(a
.simdInternal_
, b
.simdInternal_
)
643 static inline SimdFIBool gmx_simdcall
644 operator==(SimdFInt32 a
, SimdFInt32 b
)
647 vceqq_s32(a
.simdInternal_
, b
.simdInternal_
)
651 static inline SimdFIBool gmx_simdcall
652 testBits(SimdFInt32 a
)
655 vtstq_s32(a
.simdInternal_
, a
.simdInternal_
)
659 static inline SimdFIBool gmx_simdcall
660 operator<(SimdFInt32 a
, SimdFInt32 b
)
663 vcltq_s32(a
.simdInternal_
, b
.simdInternal_
)
667 static inline SimdFIBool gmx_simdcall
668 operator&&(SimdFIBool a
, SimdFIBool b
)
671 vandq_u32(a
.simdInternal_
, b
.simdInternal_
)
675 static inline SimdFIBool gmx_simdcall
676 operator||(SimdFIBool a
, SimdFIBool b
)
679 vorrq_u32(a
.simdInternal_
, b
.simdInternal_
)
683 // Override for Neon-Asimd
684 #if GMX_SIMD_ARM_NEON
685 static inline bool gmx_simdcall
686 anyTrue(SimdFIBool a
)
688 uint32x4_t x
= a
.simdInternal_
;
689 uint32x4_t y
= vextq_u32(x
, x
, 2);
692 y
= vextq_u32(x
, x
, 1);
694 return (vgetq_lane_u32(x
, 0) != 0);
698 static inline SimdFInt32 gmx_simdcall
699 selectByMask(SimdFInt32 a
, SimdFIBool m
)
702 vandq_s32(a
.simdInternal_
, vreinterpretq_s32_u32(m
.simdInternal_
))
706 static inline SimdFInt32 gmx_simdcall
707 selectByNotMask(SimdFInt32 a
, SimdFIBool m
)
710 vbicq_s32(a
.simdInternal_
, vreinterpretq_s32_u32(m
.simdInternal_
))
714 static inline SimdFInt32 gmx_simdcall
715 blend(SimdFInt32 a
, SimdFInt32 b
, SimdFIBool sel
)
718 vbslq_s32(sel
.simdInternal_
, b
.simdInternal_
, a
.simdInternal_
)
722 // Override for Neon-Asimd
723 #if GMX_SIMD_ARM_NEON
724 static inline SimdFInt32 gmx_simdcall
727 float32x4_t signBitOfA
= vreinterpretq_f32_u32(vandq_u32(vdupq_n_u32(0x80000000), vreinterpretq_u32_f32(a
.simdInternal_
)));
728 float32x4_t half
= vdupq_n_f32(0.5f
);
729 float32x4_t corr
= vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(half
), vreinterpretq_u32_f32(signBitOfA
)));
732 vcvtq_s32_f32(vaddq_f32(a
.simdInternal_
, corr
))
737 static inline SimdFInt32 gmx_simdcall
741 vcvtq_s32_f32(a
.simdInternal_
)
745 static inline SimdFloat gmx_simdcall
749 vcvtq_f32_s32(a
.simdInternal_
)
753 static inline SimdFIBool gmx_simdcall
761 static inline SimdFBool gmx_simdcall
762 cvtIB2B(SimdFIBool a
)
769 // Override for Neon-Asimd
770 #if GMX_SIMD_ARM_NEON
771 static inline SimdFloat gmx_simdcall
774 return cvtI2R(cvtR2I(x
));
777 static inline SimdFloat gmx_simdcall
780 return cvtI2R(cvttR2I(x
));
786 #endif // GMX_SIMD_IMPL_ARM_NEON_SIMD_FLOAT_H