src/gromacs/simd/impl_reference/impl_reference_simd4_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
  37 #define GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference implementation, SIMD4 single precision.
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
#include "config.h"

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include <algorithm>
#include <array>

#include "impl_reference_definitions.h"
  59
  60 namespace gmx
  61 {
  62
  63 /*! \cond libapi */
  64 /*! \addtogroup module_simd */
  65 /*! \{ */
  66
  67 /*! \name Constant width-4 single precision SIMD types and instructions
  68  * \{
  69  */
  70
  71 /*! \libinternal \brief SIMD4 float type.
  72  *
  73  * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
  74  *
  75  * \note This variable cannot be placed inside other structures or classes, since
  76  *       some compilers (including at least clang-3.7) appear to lose the
  77  *       alignment. This is likely particularly severe when allocating such
  78  *       memory on the heap, but it occurs for stack structures too.
  79  */
  80 class Simd4Float
  81 {
  82     public:
  83         Simd4Float() {}
  84
  85         //! \brief Construct from scalar
  86         Simd4Float(float f) { simdInternal_.fill(f); }
  87
  88         /*! \brief Internal SIMD data. Implementation dependent, don't touch.
  89          *
  90          * This has to be public to enable usage in combination with static inline
  91          * functions, but it should never, EVER, be accessed by any code outside
  92          * the corresponding implementation directory since the type will depend
  93          * on the architecture.
  94          */
  95         std::array<float, GMX_SIMD4_WIDTH>  simdInternal_;
  96 };
  97
  98 /*! \libinternal  \brief SIMD4 variable type to use for logical comparisons on floats.
  99  *
 100  * Available if \ref GMX_SIMD4_HAVE_FLOAT is 1.
 101  *
 102  * \note This variable cannot be placed inside other structures or classes, since
 103  *       some compilers (including at least clang-3.7) appear to lose the
 104  *       alignment. This is likely particularly severe when allocating such
 105  *       memory on the heap, but it occurs for stack structures too.
 106  */
 107 class Simd4FBool
 108 {
 109     public:
 110         Simd4FBool() {}
 111
 112         //! \brief Construct from scalar bool
 113         Simd4FBool(bool b) { simdInternal_.fill(b); }
 114
 115         /*! \brief Internal SIMD data. Implementation dependent, don't touch.
 116          *
 117          * This has to be public to enable usage in combination with static inline
 118          * functions, but it should never, EVER, be accessed by any code outside
 119          * the corresponding implementation directory since the type will depend
 120          * on the architecture.
 121          */
 122         std::array<bool, GMX_SIMD4_WIDTH>  simdInternal_;
 123 };
 124
 125 /*! \brief Load 4 float values from aligned memory into SIMD4 variable.
 126  *
 127  * \param m Pointer to memory aligned to 4 elements.
 128  * \return SIMD4 variable with data loaded.
 129  */
 130 static inline Simd4Float gmx_simdcall
 131 load4(const float *m)
 132 {
 133     Simd4Float a;
 134
 135     assert(std::size_t(m) % (a.simdInternal_.size()*sizeof(float)) == 0);
 136
 137     std::copy(m, m+a.simdInternal_.size(), a.simdInternal_.begin());
 138     return a;
 139 }
 140
 141 /*! \brief Store the contents of SIMD4 float to aligned memory m.
 142  *
 143  * \param[out] m Pointer to memory, aligned to 4 elements.
 144  * \param a SIMD4 variable to store
 145  */
 146 static inline void gmx_simdcall
 147 store4(float *m, Simd4Float a)
 148 {
 149     assert(std::size_t(m) % (a.simdInternal_.size()*sizeof(float)) == 0);
 150
 151     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 152 }
 153
 154 /*! \brief Load SIMD4 float from unaligned memory.
 155  *
 156  * Available if \ref GMX_SIMD_HAVE_LOADU is 1.
 157  *
 158  * \param m Pointer to memory, no alignment requirement.
 159  * \return SIMD4 variable with data loaded.
 160  */
 161 static inline Simd4Float gmx_simdcall
 162 load4U(const float *m)
 163 {
 164     Simd4Float a;
 165     std::copy(m, m+a.simdInternal_.size(), a.simdInternal_.begin());
 166     return a;
 167 }
 168
 169 /*! \brief Store SIMD4 float to unaligned memory.
 170  *
 171  * Available if \ref GMX_SIMD_HAVE_STOREU is 1.
 172  *
 173  * \param[out] m Pointer to memory, no alignment requirement.
 174  * \param a SIMD4 variable to store.
 175  */
 176 static inline void gmx_simdcall
 177 store4U(float *m, Simd4Float a)
 178 {
 179     std::copy(a.simdInternal_.begin(), a.simdInternal_.end(), m);
 180 }
 181
 182 /*! \brief Set all SIMD4 float elements to 0.
 183  *
 184  * You should typically just call \ref gmx::setZero(), which uses proxy objects
 185  * internally to handle all types rather than adding the suffix used here.
 186  *
 187  * \return SIMD4 0.0
 188  */
 189 static inline Simd4Float gmx_simdcall
 190 simd4SetZeroF()
 191 {
 192     return Simd4Float(0.0f);
 193 }
 194
 195
 196 /*! \brief Bitwise and for two SIMD4 float variables.
 197  *
 198  * Supported if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 199  *
 200  * \param a data1
 201  * \param b data2
 202  * \return data1 & data2
 203  */
 204 static inline Simd4Float gmx_simdcall
 205 operator&(Simd4Float a, Simd4Float b)
 206 {
 207     Simd4Float         res;
 208
 209     union
 210     {
 211         float         r;
 212         std::int32_t  i;
 213     }
 214     conv1, conv2;
 215
 216     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 217     {
 218         conv1.r              = a.simdInternal_[i];
 219         conv2.r              = b.simdInternal_[i];
 220         conv1.i              = conv1.i & conv2.i;
 221         res.simdInternal_[i] = conv1.r;
 222     }
 223     return res;
 224 }
 225
 226
 227 /*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
 228  *
 229  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 230  *
 231  * \param a data1
 232  * \param b data2
 233  * \return (~data1) & data2
 234  */
 235 static inline Simd4Float gmx_simdcall
 236 andNot(Simd4Float a, Simd4Float b)
 237 {
 238     Simd4Float         res;
 239
 240     union
 241     {
 242         float         r;
 243         std::int32_t  i;
 244     }
 245     conv1, conv2;
 246
 247     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 248     {
 249         conv1.r              = a.simdInternal_[i];
 250         conv2.r              = b.simdInternal_[i];
 251         conv1.i              = ~conv1.i & conv2.i;
 252         res.simdInternal_[i] = conv1.r;
 253     }
 254     return res;
 255 }
 256
 257
 258 /*! \brief Bitwise or for two SIMD4 floats.
 259  *
 260  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 261  *
 262  * \param a data1
 263  * \param b data2
 264  * \return data1 | data2
 265  */
 266 static inline Simd4Float gmx_simdcall
 267 operator|(Simd4Float a, Simd4Float b)
 268 {
 269     Simd4Float         res;
 270
 271     union
 272     {
 273         float         r;
 274         std::int32_t  i;
 275     }
 276     conv1, conv2;
 277
 278     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 279     {
 280         conv1.r              = a.simdInternal_[i];
 281         conv2.r              = b.simdInternal_[i];
 282         conv1.i              = conv1.i | conv2.i;
 283         res.simdInternal_[i] = conv1.r;
 284     }
 285     return res;
 286 }
 287
 288 /*! \brief Bitwise xor for two SIMD4 float variables.
 289  *
 290  * Available if \ref GMX_SIMD_HAVE_LOGICAL is 1.
 291  *
 292  * \param a data1
 293  * \param b data2
 294  * \return data1 ^ data2
 295  */
 296 static inline Simd4Float gmx_simdcall
 297 operator^(Simd4Float a, Simd4Float b)
 298 {
 299     Simd4Float         res;
 300
 301     union
 302     {
 303         float         r;
 304         std::int32_t  i;
 305     }
 306     conv1, conv2;
 307
 308     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 309     {
 310         conv1.r              = a.simdInternal_[i];
 311         conv2.r              = b.simdInternal_[i];
 312         conv1.i              = conv1.i ^ conv2.i;
 313         res.simdInternal_[i] = conv1.r;
 314     }
 315     return res;
 316 }
 317
 318 /*! \brief Add two float SIMD4 variables.
 319  *
 320  * \param a term1
 321  * \param b term2
 322  * \return a+b
 323  */
 324 static inline Simd4Float gmx_simdcall
 325 operator+(Simd4Float a, Simd4Float b)
 326 {
 327     Simd4Float         res;
 328
 329     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 330     {
 331         res.simdInternal_[i] = a.simdInternal_[i] + b.simdInternal_[i];
 332     }
 333     return res;
 334 }
 335
 336 /*! \brief Subtract two SIMD4 variables.
 337  *
 338  * \param a term1
 339  * \param b term2
 340  * \return a-b
 341  */
 342 static inline Simd4Float gmx_simdcall
 343 operator-(Simd4Float a, Simd4Float b)
 344 {
 345     Simd4Float         res;
 346
 347     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 348     {
 349         res.simdInternal_[i] = a.simdInternal_[i] - b.simdInternal_[i];
 350     }
 351     return res;
 352 }
 353
 354 /*! \brief SIMD4 floating-point negate.
 355  *
 356  * \param a SIMD4 floating-point value
 357  * \return -a
 358  */
 359 static inline Simd4Float gmx_simdcall
 360 operator-(Simd4Float a)
 361 {
 362     Simd4Float         res;
 363
 364     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 365     {
 366         res.simdInternal_[i] = -a.simdInternal_[i];
 367     }
 368     return res;
 369 }
 370
 371 /*! \brief Multiply two SIMD4 variables.
 372  *
 373  * \param a factor1
 374  * \param b factor2
 375  * \return a*b.
 376  */
 377 static inline Simd4Float gmx_simdcall
 378 operator*(Simd4Float a, Simd4Float b)
 379 {
 380     Simd4Float         res;
 381
 382     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 383     {
 384         res.simdInternal_[i] = a.simdInternal_[i] * b.simdInternal_[i];
 385     }
 386     return res;
 387 }
 388
 389 /*! \brief SIMD4 Fused-multiply-add. Result is a*b+c.
 390  *
 391  * \param a factor1
 392  * \param b factor2
 393  * \param c term
 394  * \return a*b+c
 395  */
 396 static inline Simd4Float gmx_simdcall
 397 fma(Simd4Float a, Simd4Float b, Simd4Float c)
 398 {
 399     return a*b+c;
 400 }
 401
 402 /*! \brief SIMD4 Fused-multiply-subtract. Result is a*b-c.
 403  *
 404  * \param a factor1
 405  * \param b factor2
 406  * \param c term
 407  * \return a*b-c
 408  */
 409 static inline Simd4Float gmx_simdcall
 410 fms(Simd4Float a, Simd4Float b, Simd4Float c)
 411 {
 412     return a*b-c;
 413 }
 414
 415 /*! \brief SIMD4 Fused-negated-multiply-add. Result is -a*b+c.
 416  *
 417  * \param a factor1
 418  * \param b factor2
 419  * \param c term
 420  * \return -a*b+c
 421  */
 422 static inline Simd4Float gmx_simdcall
 423 fnma(Simd4Float a, Simd4Float b, Simd4Float c)
 424 {
 425     return c-a*b;
 426 }
 427
 428 /*! \brief SIMD4 Fused-negated-multiply-subtract. Result is -a*b-c.
 429  *
 430  * \param a factor1
 431  * \param b factor2
 432  * \param c term
 433  * \return -a*b-c
 434  */
 435 static inline Simd4Float gmx_simdcall
 436 fnms(Simd4Float a, Simd4Float b, Simd4Float c)
 437 {
 438     return -a*b-c;
 439 }
 440
 441 /*! \brief SIMD4 1.0/sqrt(x) lookup.
 442  *
 443  * This is a low-level instruction that should only be called from routines
 444  * implementing the inverse square root in simd_math.h.
 445  *
 446  * \param x Argument, x>0
 447  * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
 448  */
 449 static inline Simd4Float gmx_simdcall
 450 rsqrt(Simd4Float x)
 451 {
 452     Simd4Float         res;
 453
 454     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 455     {
 456         res.simdInternal_[i] = 1.0f / std::sqrt(x.simdInternal_[i]);
 457     }
 458     return res;
 459 };
 460
 461
 462 /*! \brief SIMD4 Floating-point fabs().
 463  *
 464  * \param a any floating point values
 465  * \return fabs(a) for each element.
 466  */
 467 static inline Simd4Float gmx_simdcall
 468 abs(Simd4Float a)
 469 {
 470     Simd4Float         res;
 471
 472     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 473     {
 474         res.simdInternal_[i] = std::abs(a.simdInternal_[i]);
 475     }
 476     return res;
 477 }
 478
 479 /*! \brief Set each SIMD4 element to the largest from two variables.
 480  *
 481  * \param a Any floating-point value
 482  * \param b Any floating-point value
 483  * \return max(a,b) for each element.
 484  */
 485 static inline Simd4Float gmx_simdcall
 486 max(Simd4Float a, Simd4Float b)
 487 {
 488     Simd4Float         res;
 489
 490     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 491     {
 492         res.simdInternal_[i] = std::max(a.simdInternal_[i], b.simdInternal_[i]);
 493     }
 494     return res;
 495 }
 496
 497
 498 /*! \brief Set each SIMD4 element to the largest from two variables.
 499  *
 500  * \param a Any floating-point value
 501  * \param b Any floating-point value
 502  * \return max(a,b) for each element.
 503  */
 504 static inline Simd4Float gmx_simdcall
 505 min(Simd4Float a, Simd4Float b)
 506 {
 507     Simd4Float         res;
 508
 509     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 510     {
 511         res.simdInternal_[i] = std::min(a.simdInternal_[i], b.simdInternal_[i]);
 512     }
 513     return res;
 514 }
 515
 516
 517 /*! \brief SIMD4 Round to nearest integer value (in floating-point format).
 518  *
 519  * \param a Any floating-point value
 520  * \return The nearest integer, represented in floating-point format.
 521  */
 522 static inline Simd4Float gmx_simdcall
 523 round(Simd4Float a)
 524 {
 525     Simd4Float         res;
 526
 527     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 528     {
 529         res.simdInternal_[i] = std::round(a.simdInternal_[i]);
 530     }
 531     return res;
 532 }
 533
 534
 535 /*! \brief Truncate SIMD4, i.e. round towards zero - common hardware instruction.
 536  *
 537  * \param a Any floating-point value
 538  * \return Integer rounded towards zero, represented in floating-point format.
 539  *
 540  * \note This is truncation towards zero, not floor(). The reason for this
 541  * is that truncation is virtually always present as a dedicated hardware
 542  * instruction, but floor() frequently isn't.
 543  */
 544 static inline Simd4Float gmx_simdcall
 545 trunc(Simd4Float a)
 546 {
 547     Simd4Float         res;
 548
 549     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 550     {
 551         res.simdInternal_[i] = std::trunc(a.simdInternal_[i]);
 552     }
 553     return res;
 554 }
 555
 556 /*! \brief Return dot product of two single precision SIMD4 variables.
 557  *
 558  * The dot product is calculated between the first three elements in the two
 559  * vectors, while the fourth is ignored. The result is returned as a scalar.
 560  *
 561  * \param a vector1
 562  * \param b vector2
 563  * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
 564  */
 565 static inline float gmx_simdcall
 566 dotProduct(Simd4Float a, Simd4Float b)
 567 {
 568     return
 569         (a.simdInternal_[0] * b.simdInternal_[0] +
 570          a.simdInternal_[1] * b.simdInternal_[1] +
 571          a.simdInternal_[2] * b.simdInternal_[2]);
 572 }
 573
 574 /*! \brief SIMD4 float transpose
 575  *
 576  * \param[in,out] v0  Row 0 on input, column 0 on output
 577  * \param[in,out] v1  Row 1 on input, column 1 on output
 578  * \param[in,out] v2  Row 2 on input, column 2 on output
 579  * \param[in,out] v3  Row 3 on input, column 3 on output
 580  */
 581 static inline void gmx_simdcall
 582 transpose(Simd4Float * v0, Simd4Float * v1,
 583           Simd4Float * v2, Simd4Float * v3)
 584 {
 585     Simd4Float t0 = *v0;
 586     Simd4Float t1 = *v1;
 587     Simd4Float t2 = *v2;
 588     Simd4Float t3 = *v3;
 589     v0->simdInternal_[0] = t0.simdInternal_[0];
 590     v0->simdInternal_[1] = t1.simdInternal_[0];
 591     v0->simdInternal_[2] = t2.simdInternal_[0];
 592     v0->simdInternal_[3] = t3.simdInternal_[0];
 593     v1->simdInternal_[0] = t0.simdInternal_[1];
 594     v1->simdInternal_[1] = t1.simdInternal_[1];
 595     v1->simdInternal_[2] = t2.simdInternal_[1];
 596     v1->simdInternal_[3] = t3.simdInternal_[1];
 597     v2->simdInternal_[0] = t0.simdInternal_[2];
 598     v2->simdInternal_[1] = t1.simdInternal_[2];
 599     v2->simdInternal_[2] = t2.simdInternal_[2];
 600     v2->simdInternal_[3] = t3.simdInternal_[2];
 601     v3->simdInternal_[0] = t0.simdInternal_[3];
 602     v3->simdInternal_[1] = t1.simdInternal_[3];
 603     v3->simdInternal_[2] = t2.simdInternal_[3];
 604     v3->simdInternal_[3] = t3.simdInternal_[3];
 605 }
 606
 607 /*! \brief a==b for SIMD4 float
 608  *
 609  * \param a value1
 610  * \param b value2
 611  * \return Each element of the boolean will be set to true if a==b.
 612  */
 613 static inline Simd4FBool gmx_simdcall
 614 operator==(Simd4Float a, Simd4Float b)
 615 {
 616     Simd4FBool         res;
 617
 618     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 619     {
 620         res.simdInternal_[i] = (a.simdInternal_[i] == b.simdInternal_[i]);
 621     }
 622     return res;
 623 }
 624
 625 /*! \brief a!=b for SIMD4 float
 626  *
 627  * \param a value1
 628  * \param b value2
 629  * \return Each element of the boolean will be set to true if a!=b.
 630  */
 631 static inline Simd4FBool gmx_simdcall
 632 operator!=(Simd4Float a, Simd4Float b)
 633 {
 634     Simd4FBool         res;
 635
 636     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 637     {
 638         res.simdInternal_[i] = (a.simdInternal_[i] != b.simdInternal_[i]);
 639     }
 640     return res;
 641 }
 642
 643 /*! \brief a<b for SIMD4 float
 644  *
 645  * \param a value1
 646  * \param b value2
 647  * \return Each element of the boolean will be set to true if a<b.
 648  */
 649 static inline Simd4FBool gmx_simdcall
 650 operator<(Simd4Float a, Simd4Float b)
 651 {
 652     Simd4FBool          res;
 653
 654     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 655     {
 656         res.simdInternal_[i] = (a.simdInternal_[i] < b.simdInternal_[i]);
 657     }
 658     return res;
 659 }
 660
 661
 662 /*! \brief a<=b for SIMD4 float.
 663  *
 664  * \param a value1
 665  * \param b value2
 666  * \return Each element of the boolean will be set to true if a<=b.
 667  */
 668 static inline Simd4FBool gmx_simdcall
 669 operator<=(Simd4Float a, Simd4Float b)
 670 {
 671     Simd4FBool          res;
 672
 673     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 674     {
 675         res.simdInternal_[i] = (a.simdInternal_[i] <= b.simdInternal_[i]);
 676     }
 677     return res;
 678 }
 679
 680 /*! \brief Logical \a and on single precision SIMD4 booleans.
 681  *
 682  * \param a logical vars 1
 683  * \param b logical vars 2
 684  * \return For each element, the result boolean is true if a \& b are true.
 685  *
 686  * \note This is not necessarily a bitwise operation - the storage format
 687  * of booleans is implementation-dependent.
 688  */
 689 static inline Simd4FBool gmx_simdcall
 690 operator&&(Simd4FBool a, Simd4FBool b)
 691 {
 692     Simd4FBool         res;
 693
 694     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 695     {
 696         res.simdInternal_[i] = (a.simdInternal_[i] && b.simdInternal_[i]);
 697     }
 698     return res;
 699 }
 700
 701 /*! \brief Logical \a or on single precision SIMD4 booleans.
 702  *
 703  * \param a logical vars 1
 704  * \param b logical vars 2
 705  * \return For each element, the result boolean is true if a or b is true.
 706  *
 707  * Note that this is not necessarily a bitwise operation - the storage format
 708  * of booleans is implementation-dependent.
 709  */
 710 static inline Simd4FBool gmx_simdcall
 711 operator||(Simd4FBool a, Simd4FBool b)
 712 {
 713     Simd4FBool         res;
 714
 715     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 716     {
 717         res.simdInternal_[i] = (a.simdInternal_[i] || b.simdInternal_[i]);
 718     }
 719     return res;
 720 }
 721
 722 /*! \brief Returns non-zero if any of the boolean in SIMD4 a is True, otherwise 0.
 723  *
 724  * \param a Logical variable.
 725  * \return true if any element in a is true, otherwise false.
 726  *
 727  * The actual return value for truth will depend on the architecture,
 728  * so any non-zero value is considered truth.
 729  */
 730 static inline bool gmx_simdcall
 731 anyTrue(Simd4FBool a)
 732 {
 733     bool res = false;
 734
 735     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
 736     {
 737         res = res || a.simdInternal_[i];
 738     }
 739     return res;
 740 }
 741
 742 /*! \brief Select from single precision SIMD4 variable where boolean is true.
 743  *
 744  * \param a Floating-point variable to select from
 745  * \param mask Boolean selector
 746  * \return  For each element, a is selected for true, 0 for false.
 747  */
 748 static inline Simd4Float gmx_simdcall
 749 selectByMask(Simd4Float a, Simd4FBool mask)
 750 {
 751     Simd4Float          res;
 752
 753     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 754     {
 755         res.simdInternal_[i] = mask.simdInternal_[i] ? a.simdInternal_[i] : 0.0f;
 756     }
 757     return res;
 758 }
 759
 760 /*! \brief Select from single precision SIMD4 variable where boolean is false.
 761  *
 762  * \param a Floating-point variable to select from
 763  * \param mask Boolean selector
 764  * \return  For each element, a is selected for false, 0 for true (sic).
 765  */
 766 static inline Simd4Float gmx_simdcall
 767 selectByNotMask(Simd4Float a, Simd4FBool mask)
 768 {
 769     Simd4Float          res;
 770
 771     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 772     {
 773         res.simdInternal_[i] = mask.simdInternal_[i] ? 0.0f : a.simdInternal_[i];
 774     }
 775     return res;
 776 }
 777
 778
 779 /*! \brief Vector-blend SIMD4 selection.
 780  *
 781  * \param a First source
 782  * \param b Second source
 783  * \param sel Boolean selector
 784  * \return For each element, select b if sel is true, a otherwise.
 785  */
 786 static inline Simd4Float gmx_simdcall
 787 blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
 788 {
 789     Simd4Float         res;
 790
 791     for (std::size_t i = 0; i < res.simdInternal_.size(); i++)
 792     {
 793         res.simdInternal_[i] = sel.simdInternal_[i] ? b.simdInternal_[i] : a.simdInternal_[i];
 794     }
 795     return res;
 796 }
 797
 798
 799 /*! \brief Return sum of all elements in SIMD4 float variable.
 800  *
 801  * \param a SIMD4 variable to reduce/sum.
 802  * \return The sum of all elements in the argument variable.
 803  *
 804  */
 805 static inline float gmx_simdcall
 806 reduce(Simd4Float a)
 807 {
 808     float sum = 0.0f;
 809
 810     for (std::size_t i = 0; i < a.simdInternal_.size(); i++)
 811     {
 812         sum += a.simdInternal_[i];
 813     }
 814     return sum;
 815 }
 816
 817 /*! \} */
 818
 819 /*! \} */
 820 /*! \endcond */
 821
 822 }      // namespace gmx
 823
 824 #endif // GMX_SIMD_IMPL_REFERENCE_SIMD4_FLOAT_H