128-bit AVX2 SIMD for AMD Ryzen
[gromacs.git] / src / gromacs / simd / impl_reference / impl_reference_simd4_float.h
blob0478379f93e57d902448c69752b02c8c0c6eea61
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
37 #define GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
39 /*! \libinternal \file
41 * \brief Reference implementation, SIMD4 single precision.
43 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
45 * \ingroup module_simd
#include "config.h"

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include <algorithm>
#include <array>

#include "impl_reference_definitions.h"
60 namespace gmx
63 /*! \cond libapi */
64 /*! \addtogroup module_simd */
65 /*! \{ */
67 /*! \name Constant width-4 single precision SIMD types and instructions
68 * \{
71 /*! \libinternal \brief SIMD4 float type.
73 * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
75 * \note This variable cannot be placed inside other structures or classes, since
76 * some compilers (including at least clang-3.7) appear to lose the
77 * alignment. This is likely particularly severe when allocating such
78 * memory on the heap, but it occurs for stack structures too.
80 class Simd4Float
82 public:
83 Simd4Float() {}
85 //! \brief Construct from scalar
86 Simd4Float(float f) { simdInternal_.fill(f); }
88 /*! \brief Internal SIMD data. Implementation dependent, don't touch.
90 * This has to be public to enable usage in combination with static inline
91 * functions, but it should never, EVER, be accessed by any code outside
92 * the corresponding implementation directory since the type will depend
93 * on the architecture.
95 std::array<float, GMX_SIMD4_WIDTH> simdInternal_;
98 /*! \libinternal \brief SIMD4 variable type to use for logical comparisons on floats.
100 * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
102 * \note This variable cannot be placed inside other structures or classes, since
103 * some compilers (including at least clang-3.7) appear to lose the
104 * alignment. This is likely particularly severe when allocating such
105 * memory on the heap, but it occurs for stack structures too.
107 class Simd4FBool
109 public:
110 Simd4FBool() {}
112 //! \brief Construct from scalar bool
113 Simd4FBool(bool b) { simdInternal_.fill(b); }
115 /*! \brief Internal SIMD data. Implementation dependent, don't touch.
117 * This has to be public to enable usage in combination with static inline
118 * functions, but it should never, EVER, be accessed by any code outside
119 * the corresponding implementation directory since the type will depend
120 * on the architecture.
122 std::array<bool, GMX_SIMD4_WIDTH> simdInternal_;
125 /*! \brief Load 4 float values from aligned memory into SIMD4 variable.
127 * \param m Pointer to memory aligned to 4 elements.
128 * \return SIMD4 variable with data loaded.
130 static inline Simd4Float gmx_simdcall
131 load4(const float *m)
133 Simd4Float a;
135 assert(std::size_t(m) % (a.simdInternal_.size()*sizeof(float)) == 0);
137 std::copy(m, m+a.simdInternal_.size(), a.simdInternal_.begin());
138 return a;
141 /*! \brief Store the contents of SIMD4 float to aligned memory m.
143 * \param[out] m Pointer to memory, aligned to 4 elements.
144 * \param a SIMD4 variable to store
146 static inline void gmx_simdcall
147 store4(float *m, Simd4Float a)
149 assert(std::size_t(m) % (a.simdInternal_.size()*sizeof(float)) == 0);
151 std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
154 /*! \brief Load SIMD4 float from unaligned memory.
156 * Available if \ref GMX_SIMD_HAVE_LOADU is 1.
158 * \param m Pointer to memory, no alignment requirement.
159 * \return SIMD4 variable with data loaded.
161 static inline Simd4Float gmx_simdcall
162 load4U(const float *m)
164 Simd4Float a;
165 std::copy(m, m+a.simdInternal_.size(), a.simdInternal_.begin());
166 return a;
169 /*! \brief Store SIMD4 float to unaligned memory.
171 * Available if \ref GMX_SIMD_HAVE_STOREU is 1.
173 * \param[out] m Pointer to memory, no alignment requirement.
174 * \param a SIMD4 variable to store.
176 static inline void gmx_simdcall
177 store4U(float *m, Simd4Float a)
179 std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
182 /*! \brief Set all SIMD4 float elements to 0.
184 * You should typically just call \ref gmx::setZero(), which uses proxy objects
185 * internally to handle all types rather than adding the suffix used here.
187 * \return SIMD4 0.0
189 static inline Simd4Float gmx_simdcall
190 simd4SetZeroF()
192 return Simd4Float(0.0f);
196 /*! \brief Bitwise and for two SIMD4 float variables.
198 * Supported if \ref GMX_SIMD_HAVE_LOGICAL is 1.
200 * \param a data1
201 * \param b data2
202 * \return data1 & data2
204 static inline Simd4Float gmx_simdcall
205 operator&(Simd4Float a, Simd4Float b)
207 Simd4Float res;
209 union
211 float r;
212 std::int32_t i;
214 conv1, conv2;
216 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
218 conv1.r = a.simdInternal_[i];
219 conv2.r = b.simdInternal_[i];
220 conv1.i = conv1.i & conv2.i;
221 res.simdInternal_[i] = conv1.r;
223 return res;
227 /*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
229 * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
231 * \param a data1
232 * \param b data2
233 * \return (~data1) & data2
235 static inline Simd4Float gmx_simdcall
236 andNot(Simd4Float a, Simd4Float b)
238 Simd4Float res;
240 union
242 float r;
243 std::int32_t i;
245 conv1, conv2;
247 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
249 conv1.r = a.simdInternal_[i];
250 conv2.r = b.simdInternal_[i];
251 conv1.i = ~conv1.i & conv2.i;
252 res.simdInternal_[i] = conv1.r;
254 return res;
258 /*! \brief Bitwise or for two SIMD4 floats.
260 * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
262 * \param a data1
263 * \param b data2
264 * \return data1 | data2
266 static inline Simd4Float gmx_simdcall
267 operator|(Simd4Float a, Simd4Float b)
269 Simd4Float res;
271 union
273 float r;
274 std::int32_t i;
276 conv1, conv2;
278 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
280 conv1.r = a.simdInternal_[i];
281 conv2.r = b.simdInternal_[i];
282 conv1.i = conv1.i | conv2.i;
283 res.simdInternal_[i] = conv1.r;
285 return res;
288 /*! \brief Bitwise xor for two SIMD4 float variables.
290 * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
292 * \param a data1
293 * \param b data2
294 * \return data1 ^ data2
296 static inline Simd4Float gmx_simdcall
297 operator^(Simd4Float a, Simd4Float b)
299 Simd4Float res;
301 union
303 float r;
304 std::int32_t i;
306 conv1, conv2;
308 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
310 conv1.r = a.simdInternal_[i];
311 conv2.r = b.simdInternal_[i];
312 conv1.i = conv1.i ^ conv2.i;
313 res.simdInternal_[i] = conv1.r;
315 return res;
318 /*! \brief Add two float SIMD4 variables.
320 * \param a term1
321 * \param b term2
322 * \return a+b
324 static inline Simd4Float gmx_simdcall
325 operator+(Simd4Float a, Simd4Float b)
327 Simd4Float res;
329 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
331 res.simdInternal_[i] = a.simdInternal_[i] + b.simdInternal_[i];
333 return res;
336 /*! \brief Subtract two SIMD4 variables.
338 * \param a term1
339 * \param b term2
340 * \return a-b
342 static inline Simd4Float gmx_simdcall
343 operator-(Simd4Float a, Simd4Float b)
345 Simd4Float res;
347 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
349 res.simdInternal_[i] = a.simdInternal_[i] - b.simdInternal_[i];
351 return res;
354 /*! \brief SIMD4 floating-point negate.
356 * \param a SIMD4 floating-point value
357 * \return -a
359 static inline Simd4Float gmx_simdcall
360 operator-(Simd4Float a)
362 Simd4Float res;
364 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
366 res.simdInternal_[i] = -a.simdInternal_[i];
368 return res;
371 /*! \brief Multiply two SIMD4 variables.
373 * \param a factor1
374 * \param b factor2
375 * \return a*b.
377 static inline Simd4Float gmx_simdcall
378 operator*(Simd4Float a, Simd4Float b)
380 Simd4Float res;
382 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
384 res.simdInternal_[i] = a.simdInternal_[i] * b.simdInternal_[i];
386 return res;
389 /*! \brief SIMD4 Fused-multiply-add. Result is a*b+c.
391 * \param a factor1
392 * \param b factor2
393 * \param c term
394 * \return a*b+c
396 static inline Simd4Float gmx_simdcall
397 fma(Simd4Float a, Simd4Float b, Simd4Float c)
399 return a*b+c;
402 /*! \brief SIMD4 Fused-multiply-subtract. Result is a*b-c.
404 * \param a factor1
405 * \param b factor2
406 * \param c term
407 * \return a*b-c
409 static inline Simd4Float gmx_simdcall
410 fms(Simd4Float a, Simd4Float b, Simd4Float c)
412 return a*b-c;
415 /*! \brief SIMD4 Fused-negated-multiply-add. Result is -a*b+c.
417 * \param a factor1
418 * \param b factor2
419 * \param c term
420 * \return -a*b+c
422 static inline Simd4Float gmx_simdcall
423 fnma(Simd4Float a, Simd4Float b, Simd4Float c)
425 return c-a*b;
428 /*! \brief SIMD4 Fused-negated-multiply-subtract. Result is -a*b-c.
430 * \param a factor1
431 * \param b factor2
432 * \param c term
433 * \return -a*b-c
435 static inline Simd4Float gmx_simdcall
436 fnms(Simd4Float a, Simd4Float b, Simd4Float c)
438 return -a*b-c;
441 /*! \brief SIMD4 1.0/sqrt(x) lookup.
443 * This is a low-level instruction that should only be called from routines
444 * implementing the inverse square root in simd_math.h.
446 * \param x Argument, x>0
447 * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
449 static inline Simd4Float gmx_simdcall
450 rsqrt(Simd4Float x)
452 Simd4Float res;
454 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
456 res.simdInternal_[i] = 1.0f / std::sqrt(x.simdInternal_[i]);
458 return res;
462 /*! \brief SIMD4 Floating-point fabs().
464 * \param a any floating point values
465 * \return fabs(a) for each element.
467 static inline Simd4Float gmx_simdcall
468 abs(Simd4Float a)
470 Simd4Float res;
472 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
474 res.simdInternal_[i] = std::abs(a.simdInternal_[i]);
476 return res;
479 /*! \brief Set each SIMD4 element to the largest from two variables.
481 * \param a Any floating-point value
482 * \param b Any floating-point value
483 * \return max(a,b) for each element.
485 static inline Simd4Float gmx_simdcall
486 max(Simd4Float a, Simd4Float b)
488 Simd4Float res;
490 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
492 res.simdInternal_[i] = std::max(a.simdInternal_[i], b.simdInternal_[i]);
494 return res;
498 /*! \brief Set each SIMD4 element to the largest from two variables.
500 * \param a Any floating-point value
501 * \param b Any floating-point value
502 * \return max(a,b) for each element.
504 static inline Simd4Float gmx_simdcall
505 min(Simd4Float a, Simd4Float b)
507 Simd4Float res;
509 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
511 res.simdInternal_[i] = std::min(a.simdInternal_[i], b.simdInternal_[i]);
513 return res;
517 /*! \brief SIMD4 Round to nearest integer value (in floating-point format).
519 * \param a Any floating-point value
520 * \return The nearest integer, represented in floating-point format.
522 static inline Simd4Float gmx_simdcall
523 round(Simd4Float a)
525 Simd4Float res;
527 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
529 res.simdInternal_[i] = std::round(a.simdInternal_[i]);
531 return res;
535 /*! \brief Truncate SIMD4, i.e. round towards zero - common hardware instruction.
537 * \param a Any floating-point value
538 * \return Integer rounded towards zero, represented in floating-point format.
540 * \note This is truncation towards zero, not floor(). The reason for this
541 * is that truncation is virtually always present as a dedicated hardware
542 * instruction, but floor() frequently isn't.
544 static inline Simd4Float gmx_simdcall
545 trunc(Simd4Float a)
547 Simd4Float res;
549 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
551 res.simdInternal_[i] = std::trunc(a.simdInternal_[i]);
553 return res;
556 /*! \brief Return dot product of two single precision SIMD4 variables.
558 * The dot product is calculated between the first three elements in the two
559 * vectors, while the fourth is ignored. The result is returned as a scalar.
561 * \param a vector1
562 * \param b vector2
563 * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
565 static inline float gmx_simdcall
566 dotProduct(Simd4Float a, Simd4Float b)
568 return
569 (a.simdInternal_[0] * b.simdInternal_[0] +
570 a.simdInternal_[1] * b.simdInternal_[1] +
571 a.simdInternal_[2] * b.simdInternal_[2]);
574 /*! \brief SIMD4 float transpose
576 * \param[in,out] v0 Row 0 on input, column 0 on output
577 * \param[in,out] v1 Row 1 on input, column 1 on output
578 * \param[in,out] v2 Row 2 on input, column 2 on output
579 * \param[in,out] v3 Row 3 on input, column 3 on output
581 static inline void gmx_simdcall
582 transpose(Simd4Float * v0, Simd4Float * v1,
583 Simd4Float * v2, Simd4Float * v3)
585 Simd4Float t0 = *v0;
586 Simd4Float t1 = *v1;
587 Simd4Float t2 = *v2;
588 Simd4Float t3 = *v3;
589 v0->simdInternal_[0] = t0.simdInternal_[0];
590 v0->simdInternal_[1] = t1.simdInternal_[0];
591 v0->simdInternal_[2] = t2.simdInternal_[0];
592 v0->simdInternal_[3] = t3.simdInternal_[0];
593 v1->simdInternal_[0] = t0.simdInternal_[1];
594 v1->simdInternal_[1] = t1.simdInternal_[1];
595 v1->simdInternal_[2] = t2.simdInternal_[1];
596 v1->simdInternal_[3] = t3.simdInternal_[1];
597 v2->simdInternal_[0] = t0.simdInternal_[2];
598 v2->simdInternal_[1] = t1.simdInternal_[2];
599 v2->simdInternal_[2] = t2.simdInternal_[2];
600 v2->simdInternal_[3] = t3.simdInternal_[2];
601 v3->simdInternal_[0] = t0.simdInternal_[3];
602 v3->simdInternal_[1] = t1.simdInternal_[3];
603 v3->simdInternal_[2] = t2.simdInternal_[3];
604 v3->simdInternal_[3] = t3.simdInternal_[3];
607 /*! \brief a==b for SIMD4 float
609 * \param a value1
610 * \param b value2
611 * \return Each element of the boolean will be set to true if a==b.
613 static inline Simd4FBool gmx_simdcall
614 operator==(Simd4Float a, Simd4Float b)
616 Simd4FBool res;
618 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
620 res.simdInternal_[i] = (a.simdInternal_[i] == b.simdInternal_[i]);
622 return res;
625 /*! \brief a!=b for SIMD4 float
627 * \param a value1
628 * \param b value2
629 * \return Each element of the boolean will be set to true if a!=b.
631 static inline Simd4FBool gmx_simdcall
632 operator!=(Simd4Float a, Simd4Float b)
634 Simd4FBool res;
636 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
638 res.simdInternal_[i] = (a.simdInternal_[i] != b.simdInternal_[i]);
640 return res;
643 /*! \brief a<b for SIMD4 float
645 * \param a value1
646 * \param b value2
647 * \return Each element of the boolean will be set to true if a<b.
649 static inline Simd4FBool gmx_simdcall
650 operator<(Simd4Float a, Simd4Float b)
652 Simd4FBool res;
654 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
656 res.simdInternal_[i] = (a.simdInternal_[i] < b.simdInternal_[i]);
658 return res;
662 /*! \brief a<=b for SIMD4 float.
664 * \param a value1
665 * \param b value2
666 * \return Each element of the boolean will be set to true if a<=b.
668 static inline Simd4FBool gmx_simdcall
669 operator<=(Simd4Float a, Simd4Float b)
671 Simd4FBool res;
673 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
675 res.simdInternal_[i] = (a.simdInternal_[i] <= b.simdInternal_[i]);
677 return res;
680 /*! \brief Logical \a and on single precision SIMD4 booleans.
682 * \param a logical vars 1
683 * \param b logical vars 2
684 * \return For each element, the result boolean is true if a \& b are true.
686 * \note This is not necessarily a bitwise operation - the storage format
687 * of booleans is implementation-dependent.
689 static inline Simd4FBool gmx_simdcall
690 operator&&(Simd4FBool a, Simd4FBool b)
692 Simd4FBool res;
694 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
696 res.simdInternal_[i] = (a.simdInternal_[i] && b.simdInternal_[i]);
698 return res;
701 /*! \brief Logical \a or on single precision SIMD4 booleans.
703 * \param a logical vars 1
704 * \param b logical vars 2
705 * \return For each element, the result boolean is true if a or b is true.
707 * Note that this is not necessarily a bitwise operation - the storage format
708 * of booleans is implementation-dependent.
710 static inline Simd4FBool gmx_simdcall
711 operator||(Simd4FBool a, Simd4FBool b)
713 Simd4FBool res;
715 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
717 res.simdInternal_[i] = (a.simdInternal_[i] || b.simdInternal_[i]);
719 return res;
722 /*! \brief Returns non-zero if any of the boolean in SIMD4 a is True, otherwise 0.
724 * \param a Logical variable.
725 * \return true if any element in a is true, otherwise false.
727 * The actual return value for truth will depend on the architecture,
728 * so any non-zero value is considered truth.
730 static inline bool gmx_simdcall
731 anyTrue(Simd4FBool a)
733 bool res = false;
735 for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
737 res = res || a.simdInternal_[i];
739 return res;
742 /*! \brief Select from single precision SIMD4 variable where boolean is true.
744 * \param a Floating-point variable to select from
745 * \param mask Boolean selector
746 * \return For each element, a is selected for true, 0 for false.
748 static inline Simd4Float gmx_simdcall
749 selectByMask(Simd4Float a, Simd4FBool mask)
751 Simd4Float res;
753 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
755 res.simdInternal_[i] = mask.simdInternal_[i] ? a.simdInternal_[i] : 0.0f;
757 return res;
760 /*! \brief Select from single precision SIMD4 variable where boolean is false.
762 * \param a Floating-point variable to select from
763 * \param mask Boolean selector
764 * \return For each element, a is selected for false, 0 for true (sic).
766 static inline Simd4Float gmx_simdcall
767 selectByNotMask(Simd4Float a, Simd4FBool mask)
769 Simd4Float res;
771 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
773 res.simdInternal_[i] = mask.simdInternal_[i] ? 0.0f : a.simdInternal_[i];
775 return res;
779 /*! \brief Vector-blend SIMD4 selection.
781 * \param a First source
782 * \param b Second source
783 * \param sel Boolean selector
784 * \return For each element, select b if sel is true, a otherwise.
786 static inline Simd4Float gmx_simdcall
787 blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
789 Simd4Float res;
791 for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
793 res.simdInternal_[i] = sel.simdInternal_[i] ? b.simdInternal_[i] : a.simdInternal_[i];
795 return res;
799 /*! \brief Return sum of all elements in SIMD4 float variable.
801 * \param a SIMD4 variable to reduce/sum.
802 * \return The sum of all elements in the argument variable.
805 static inline float gmx_simdcall
806 reduce(Simd4Float a)
808 float sum = 0.0f;
810 for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
812 sum += a.simdInternal_[i];
814 return sum;
817 /*! \} */
819 /*! \} */
820 /*! \endcond */
822 } // namespace gmx
824 #endif // GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H