src/gromacs/simd/impl_x86_mic/impl_x86_mic_simd_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
  37 #define GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
  38
  39 #include "config.h"
  40
  41 #include <cassert>
  42 #include <cstdint>
  43
  44 #include <immintrin.h>
  45
  46 #include "gromacs/utility/basedefinitions.h"
  47
  48 #include "impl_x86_mic_simd_float.h"
  49
  50 namespace gmx
  51 {
  52
  53 class SimdDouble
  54 {
  55     public:
  56         SimdDouble() {}
  57
  58         SimdDouble(double d) : simdInternal_(_mm512_set1_pd(d)) {}
  59
  60         // Internal utility constructor to simplify return statements
  61         SimdDouble(__m512d simd) : simdInternal_(simd) {}
  62
  63         __m512d  simdInternal_;
  64 };
  65
  66 class SimdDInt32
  67 {
  68     public:
  69         SimdDInt32() {}
  70
  71         SimdDInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
  72
  73         // Internal utility constructor to simplify return statements
  74         SimdDInt32(__m512i simd) : simdInternal_(simd) {}
  75
  76         __m512i  simdInternal_;
  77 };
  78
  79 class SimdDBool
  80 {
  81     public:
  82         SimdDBool() {}
  83
  84         // Internal utility constructor to simplify return statements
  85         SimdDBool(__mmask8 simd) : simdInternal_(simd) {}
  86
  87         __mmask8  simdInternal_;
  88 };
  89
  90 class SimdDIBool
  91 {
  92     public:
  93         SimdDIBool() {}
  94
  95         // Internal utility constructor to simplify return statements
  96         SimdDIBool(__mmask16 simd) : simdInternal_(simd) {}
  97
  98         __mmask16  simdInternal_;
  99 };
 100
 101 static inline SimdDouble gmx_simdcall
 102 load(const double *m)
 103 {
 104     assert(std::size_t(m) % 64 == 0);
 105     return {
 106                _mm512_load_pd(m)
 107     };
 108 }
 109
 110 static inline void gmx_simdcall
 111 store(double *m, SimdDouble a)
 112 {
 113     assert(std::size_t(m) % 64 == 0);
 114     _mm512_store_pd(m, a.simdInternal_);
 115 }
 116
 117 static inline SimdDouble gmx_simdcall
 118 loadU(const double *m)
 119 {
 120     return {
 121                _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8)
 122     };
 123 }
 124
 125 static inline void gmx_simdcall
 126 storeU(double *m, SimdDouble a)
 127 {
 128     _mm512_packstorelo_pd(m, a.simdInternal_);
 129     _mm512_packstorehi_pd(m+8, a.simdInternal_);
 130
 131 }
 132
 133 static inline SimdDouble gmx_simdcall
 134 setZeroD()
 135 {
 136     return {
 137                _mm512_setzero_pd()
 138     };
 139 }
 140
 141 static inline SimdDInt32 gmx_simdcall
 142 loadDI(const std::int32_t * m)
 143 {
 144     assert(std::size_t(m) % 32 == 0);
 145     return {
 146                _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
 147     };
 148 }
 149
 150 static inline void gmx_simdcall
 151 store(std::int32_t * m, SimdDInt32 a)
 152 {
 153     assert(std::size_t(m) % 32 == 0);
 154     _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
 155 }
 156
 157 static inline SimdDInt32 gmx_simdcall
 158 loadUDI(const std::int32_t *m)
 159 {
 160     return {
 161                _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), m),
 162                                               _mm512_int2mask(0x00FF), m+16)
 163     };
 164 }
 165
 166 static inline void gmx_simdcall
 167 storeU(std::int32_t * m, SimdDInt32 a)
 168 {
 169     _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
 170     _mm512_mask_packstorehi_epi32(m+16, _mm512_int2mask(0x00FF), a.simdInternal_);
 171 }
 172
 173 static inline SimdDInt32 gmx_simdcall
 174 setZeroDI()
 175 {
 176     return {
 177                _mm512_setzero_epi32()
 178     };
 179 }
 180
 181 template<int index>
 182 static inline std::int32_t gmx_simdcall
 183 extract(SimdDInt32 a)
 184 {
 185     int r;
 186     _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a.simdInternal_);
 187     return r;
 188 }
 189
 190 static inline SimdDouble gmx_simdcall
 191 operator&(SimdDouble a, SimdDouble b)
 192 {
 193     return {
 194                _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
 195     };
 196 }
 197
 198 static inline SimdDouble gmx_simdcall
 199 andNot(SimdDouble a, SimdDouble b)
 200 {
 201     return {
 202                _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
 203     };
 204 }
 205
 206 static inline SimdDouble gmx_simdcall
 207 operator|(SimdDouble a, SimdDouble b)
 208 {
 209     return {
 210                _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
 211     };
 212 }
 213
 214 static inline SimdDouble gmx_simdcall
 215 operator^(SimdDouble a, SimdDouble b)
 216 {
 217     return {
 218                _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
 219     };
 220 }
 221
 222 static inline SimdDouble gmx_simdcall
 223 operator+(SimdDouble a, SimdDouble b)
 224 {
 225     return {
 226                _mm512_add_pd(a.simdInternal_, b.simdInternal_)
 227     };
 228 }
 229
 230 static inline SimdDouble gmx_simdcall
 231 operator-(SimdDouble a, SimdDouble b)
 232 {
 233     return {
 234                _mm512_sub_pd(a.simdInternal_, b.simdInternal_)
 235     };
 236 }
 237
 238 static inline SimdDouble gmx_simdcall
 239 operator-(SimdDouble x)
 240 {
 241     return {
 242                _mm512_addn_pd(x.simdInternal_, _mm512_setzero_pd())
 243     };
 244 }
 245
 246 static inline SimdDouble gmx_simdcall
 247 operator*(SimdDouble a, SimdDouble b)
 248 {
 249     return {
 250                _mm512_mul_pd(a.simdInternal_, b.simdInternal_)
 251     };
 252 }
 253
 254 static inline SimdDouble gmx_simdcall
 255 fma(SimdDouble a, SimdDouble b, SimdDouble c)
 256 {
 257     return {
 258                _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 259     };
 260 }
 261
 262 static inline SimdDouble gmx_simdcall
 263 fms(SimdDouble a, SimdDouble b, SimdDouble c)
 264 {
 265     return {
 266                _mm512_fmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 267     };
 268 }
 269
 270 static inline SimdDouble gmx_simdcall
 271 fnma(SimdDouble a, SimdDouble b, SimdDouble c)
 272 {
 273     return {
 274                _mm512_fnmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 275     };
 276 }
 277
 278 static inline SimdDouble gmx_simdcall
 279 fnms(SimdDouble a, SimdDouble b, SimdDouble c)
 280 {
 281     return {
 282                _mm512_fnmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 283     };
 284 }
 285
 286 static inline SimdDouble gmx_simdcall
 287 rsqrt(SimdDouble x)
 288 {
 289     return {
 290                _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
 291     };
 292 }
 293
 294 static inline SimdDouble gmx_simdcall
 295 rcp(SimdDouble x)
 296 {
 297     return {
 298                _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
 299     };
 300 }
 301
 302 static inline SimdDouble gmx_simdcall
 303 maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
 304 {
 305     return {
 306                _mm512_mask_add_pd(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_)
 307     };
 308 }
 309
 310 static inline SimdDouble gmx_simdcall
 311 maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
 312 {
 313     return {
 314                _mm512_mask_mul_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_, b.simdInternal_)
 315     };
 316 }
 317
 318 static inline SimdDouble gmx_simdcall
 319 maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
 320 {
 321     return {
 322                _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_))
 323     };
 324 }
 325
 326 static inline SimdDouble gmx_simdcall
 327 maskzRsqrt(SimdDouble x, SimdDBool m)
 328 {
 329     return {
 330                _mm512_cvtpslo_pd(_mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
 331     };
 332 }
 333
 334 static inline SimdDouble gmx_simdcall
 335 maskzRcp(SimdDouble x, SimdDBool m)
 336 {
 337     return {
 338                _mm512_cvtpslo_pd(_mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
 339     };
 340 }
 341
 342 static inline SimdDouble gmx_simdcall
 343 abs(SimdDouble x)
 344 {
 345     return {
 346                _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)), _mm512_castpd_si512(x.simdInternal_)))
 347     };
 348 }
 349
 350 static inline SimdDouble gmx_simdcall
 351 max(SimdDouble a, SimdDouble b)
 352 {
 353     return {
 354                _mm512_gmax_pd(a.simdInternal_, b.simdInternal_)
 355     };
 356 }
 357
 358 static inline SimdDouble gmx_simdcall
 359 min(SimdDouble a, SimdDouble b)
 360 {
 361     return {
 362                _mm512_gmin_pd(a.simdInternal_, b.simdInternal_)
 363     };
 364 }
 365
 366 static inline SimdDouble gmx_simdcall
 367 round(SimdDouble x)
 368 {
 369     return {
 370                _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
 371     };
 372 }
 373
 374 static inline SimdDouble gmx_simdcall
 375 trunc(SimdDouble x)
 376 {
 377     return {
 378                _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
 379     };
 380 }
 381
 382 static inline SimdDouble
 383 frexp(SimdDouble value, SimdDInt32 * exponent)
 384 {
 385     __m512d rExponent = _mm512_getexp_pd(value.simdInternal_);
 386     __m512i iExponent = _mm512_cvtfxpnt_roundpd_epi32lo(rExponent, _MM_FROUND_TO_NEAREST_INT);
 387
 388     exponent->simdInternal_ = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
 389
 390     return {
 391                _mm512_getmant_pd(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src)
 392     };
 393 }
 394
 395 static inline SimdDouble
 396 ldexp(SimdDouble value, SimdDInt32 exponent)
 397 {
 398     const __m512i exponentBias = _mm512_set1_epi32(1023);
 399     __m512i       iExponent;
 400
 401     iExponent = _mm512_permutevar_epi32(_mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), exponent.simdInternal_);
 402     iExponent = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), _mm512_add_epi32(iExponent, exponentBias), 20);
 403     return _mm512_mul_pd(_mm512_castsi512_pd(iExponent), value.simdInternal_);
 404 }
 405
 406 static inline double gmx_simdcall
 407 reduce(SimdDouble a)
 408 {
 409     return _mm512_reduce_add_pd(a.simdInternal_);
 410 }
 411
 412 // Picky, picky, picky:
 413 // icc-16 complains about "Illegal value of immediate argument to intrinsic"
 414 // unless we use
 415 // 1) Ordered-quiet for ==
 416 // 2) Unordered-quiet for !=
 417 // 3) Ordered-signaling for < and <=
 418
 419 static inline SimdDBool gmx_simdcall
 420 operator==(SimdDouble a, SimdDouble b)
 421 {
 422     return {
 423                _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
 424     };
 425 }
 426
 427 static inline SimdDBool gmx_simdcall
 428 operator!=(SimdDouble a, SimdDouble b)
 429 {
 430     return {
 431                _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
 432     };
 433 }
 434
 435 static inline SimdDBool gmx_simdcall
 436 operator<(SimdDouble a, SimdDouble b)
 437 {
 438     return {
 439                _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
 440     };
 441 }
 442
 443 static inline SimdDBool gmx_simdcall
 444 operator<=(SimdDouble a, SimdDouble b)
 445 {
 446     return {
 447                _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
 448     };
 449 }
 450
 451 static inline SimdDBool gmx_simdcall
 452 testBits(SimdDouble a)
 453 {
 454     // This is a bit problematic since Knight's corner does not have any 64-bit integer comparisons,
 455     // and we cannot use floating-point since values with just a single bit set can evaluate to 0.0.
 456     // Instead, we do it as
 457     // 1) Do a logical or of the high/low 32 bits
 458     // 2) Do a permute so we have the low 32 bits of each value in the low 8 32-bit elements
 459     // 3) Do an integer comparison, and cast so we just keep the low 8 bits of the mask.
 460     //
 461     // By default we will use integers for the masks in the nonbonded kernels, so this shouldn't
 462     // have any significant performance drawbacks.
 463
 464     __m512i ia = _mm512_castpd_si512(a.simdInternal_);
 465
 466     ia = _mm512_or_epi32(ia, _mm512_swizzle_epi32(ia, _MM_SWIZ_REG_CDAB));
 467     ia = _mm512_permutevar_epi32( _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), ia);
 468
 469     return {
 470                static_cast<__mmask8>(_mm512_cmp_epi32_mask(ia, _mm512_setzero_si512(), _MM_CMPINT_NE))
 471     };
 472 }
 473
 474 static inline SimdDBool gmx_simdcall
 475 operator&&(SimdDBool a, SimdDBool b)
 476 {
 477     return {
 478                static_cast<__mmask8>(_mm512_kand(a.simdInternal_, b.simdInternal_))
 479     };
 480 }
 481
 482 static inline SimdDBool gmx_simdcall
 483 operator||(SimdDBool a, SimdDBool b)
 484 {
 485     return {
 486                static_cast<__mmask8>(_mm512_kor(a.simdInternal_, b.simdInternal_))
 487     };
 488 }
 489
 490 static inline bool gmx_simdcall
 491 anyTrue(SimdDBool a)
 492 {
 493     return _mm512_mask2int(a.simdInternal_) != 0;
 494 }
 495
 496 static inline SimdDouble gmx_simdcall
 497 selectByMask(SimdDouble a, SimdDBool m)
 498 {
 499     return {
 500                _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_)
 501     };
 502 }
 503
 504 static inline SimdDouble gmx_simdcall
 505 selectByNotMask(SimdDouble a, SimdDBool m)
 506 {
 507     return {
 508                _mm512_mask_mov_pd(a.simdInternal_, m.simdInternal_, _mm512_setzero_pd())
 509     };
 510 }
 511
 512 static inline SimdDouble gmx_simdcall
 513 blend(SimdDouble a, SimdDouble b, SimdDBool sel)
 514 {
 515     return {
 516                _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
 517     };
 518 }
 519
 520 static inline SimdDInt32 gmx_simdcall
 521 operator<<(SimdDInt32 a, int n)
 522 {
 523     return {
 524                _mm512_slli_epi32(a.simdInternal_, n)
 525     };
 526 }
 527
 528 static inline SimdDInt32 gmx_simdcall
 529 operator>>(SimdDInt32 a, int n)
 530 {
 531     return {
 532                _mm512_srli_epi32(a.simdInternal_, n)
 533     };
 534 }
 535
 536 static inline SimdDInt32 gmx_simdcall
 537 operator&(SimdDInt32 a, SimdDInt32 b)
 538 {
 539     return {
 540                _mm512_and_epi32(a.simdInternal_, b.simdInternal_)
 541     };
 542 }
 543
 544 static inline SimdDInt32 gmx_simdcall
 545 andNot(SimdDInt32 a, SimdDInt32 b)
 546 {
 547     return {
 548                _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_)
 549     };
 550 }
 551
 552 static inline SimdDInt32 gmx_simdcall
 553 operator|(SimdDInt32 a, SimdDInt32 b)
 554 {
 555     return {
 556                _mm512_or_epi32(a.simdInternal_, b.simdInternal_)
 557     };
 558 }
 559
 560 static inline SimdDInt32 gmx_simdcall
 561 operator^(SimdDInt32 a, SimdDInt32 b)
 562 {
 563     return {
 564                _mm512_xor_epi32(a.simdInternal_, b.simdInternal_)
 565     };
 566 }
 567
 568 static inline SimdDInt32 gmx_simdcall
 569 operator+(SimdDInt32 a, SimdDInt32 b)
 570 {
 571     return {
 572                _mm512_add_epi32(a.simdInternal_, b.simdInternal_)
 573     };
 574 }
 575
 576 static inline SimdDInt32 gmx_simdcall
 577 operator-(SimdDInt32 a, SimdDInt32 b)
 578 {
 579     return {
 580                _mm512_sub_epi32(a.simdInternal_, b.simdInternal_)
 581     };
 582 }
 583
 584 static inline SimdDInt32 gmx_simdcall
 585 operator*(SimdDInt32 a, SimdDInt32 b)
 586 {
 587     return {
 588                _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_)
 589     };
 590 }
 591
 592 static inline SimdDIBool gmx_simdcall
 593 operator==(SimdDInt32 a, SimdDInt32 b)
 594 {
 595     return {
 596                _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ)
 597     };
 598 }
 599
 600 static inline SimdDIBool gmx_simdcall
 601 testBits(SimdDInt32 a)
 602 {
 603     return {
 604                _mm512_cmp_epi32_mask(a.simdInternal_, _mm512_setzero_si512(), _MM_CMPINT_NE)
 605     };
 606 }
 607
 608 static inline SimdDIBool gmx_simdcall
 609 operator<(SimdDInt32 a, SimdDInt32 b)
 610 {
 611     return {
 612                _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT)
 613     };
 614 }
 615
 616 static inline SimdDIBool gmx_simdcall
 617 operator&&(SimdDIBool a, SimdDIBool b)
 618 {
 619     return {
 620                _mm512_kand(a.simdInternal_, b.simdInternal_)
 621     };
 622 }
 623
 624 static inline SimdDIBool gmx_simdcall
 625 operator||(SimdDIBool a, SimdDIBool b)
 626 {
 627     return {
 628                _mm512_kor(a.simdInternal_, b.simdInternal_)
 629     };
 630 }
 631
 632 static inline bool gmx_simdcall
 633 anyTrue(SimdDIBool a)
 634 {
 635     return ( _mm512_mask2int(a.simdInternal_) & 0xFF) != 0;
 636 }
 637
 638 static inline SimdDInt32 gmx_simdcall
 639 selectByMask(SimdDInt32 a, SimdDIBool m)
 640 {
 641     return {
 642                _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_)
 643     };
 644 }
 645
 646 static inline SimdDInt32 gmx_simdcall
 647 selectByNotMask(SimdDInt32 a, SimdDIBool m)
 648 {
 649     return {
 650                _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32())
 651     };
 652 }
 653
 654 static inline SimdDInt32 gmx_simdcall
 655 blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
 656 {
 657     return {
 658                _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
 659     };
 660 }
 661
 662 static inline SimdDInt32 gmx_simdcall
 663 cvtR2I(SimdDouble a)
 664 {
 665     return {
 666                _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT)
 667     };
 668 }
 669
 670 static inline SimdDInt32 gmx_simdcall
 671 cvttR2I(SimdDouble a)
 672 {
 673     return {
 674                _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_ZERO)
 675     };
 676 }
 677
 678 static inline SimdDouble gmx_simdcall
 679 cvtI2R(SimdDInt32 a)
 680 {
 681     return {
 682                _mm512_cvtepi32lo_pd(a.simdInternal_)
 683     };
 684 }
 685
 686 static inline SimdDIBool gmx_simdcall
 687 cvtB2IB(SimdDBool a)
 688 {
 689     return {
 690                a.simdInternal_
 691     };
 692 }
 693
 694 static inline SimdDBool gmx_simdcall
 695 cvtIB2B(SimdDIBool a)
 696 {
 697     return {
 698                static_cast<__mmask8>(a.simdInternal_)
 699     };
 700 }
 701
 702 static inline void gmx_simdcall
 703 cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
 704 {
 705     __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f.simdInternal_), _MM_PERM_DCDC);
 706
 707     *d0 = _mm512_cvtpslo_pd(f.simdInternal_);
 708     *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
 709 }
 710
 711 static inline SimdFloat gmx_simdcall
 712 cvtDD2F(SimdDouble d0, SimdDouble d1)
 713 {
 714     __m512 f0 = _mm512_cvtpd_pslo(d0.simdInternal_);
 715     __m512 f1 = _mm512_cvtpd_pslo(d1.simdInternal_);
 716     return {
 717                _mm512_mask_permute4f128_ps(f0, _mm512_int2mask(0xFF00), f1, _MM_PERM_BABA)
 718     };
 719 }
 720
 721 }      // namespace gmx
 722
 723 #endif // GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H