clang/lib/Headers/fmaintrin.h

   1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
  12 #endif
  13
  14 #ifndef __FMAINTRIN_H
  15 #define __FMAINTRIN_H
  16
   17 /* Default attributes for the intrinsics below; the two variants differ only in __min_vector_width__ (128 vs. 256). */
   18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
   19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
  20
  21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
  22 ///    For each element, computes <c> (__A * __B) + __C </c>.
  23 ///
  24 /// \headerfile <immintrin.h>
  25 ///
  26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
  27 ///
  28 /// \param __A
  29 ///    A 128-bit vector of [4 x float] containing the multiplicand.
  30 /// \param __B
  31 ///    A 128-bit vector of [4 x float] containing the multiplier.
  32 /// \param __C
  33 ///    A 128-bit vector of [4 x float] containing the addend.
  34 /// \returns A 128-bit vector of [4 x float] containing the result.
   35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
   37 { /* Plain multiply-add: operands are forwarded to the builtin unchanged. */
   38   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
   39 }
  40
  41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
  42 ///    For each element, computes <c> (__A * __B) + __C </c>.
  43 ///
  44 /// \headerfile <immintrin.h>
  45 ///
  46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
  47 ///
  48 /// \param __A
  49 ///    A 128-bit vector of [2 x double] containing the multiplicand.
  50 /// \param __B
  51 ///    A 128-bit vector of [2 x double] containing the multiplier.
  52 /// \param __C
  53 ///    A 128-bit vector of [2 x double] containing the addend.
   54 /// \returns A 128-bit vector of [2 x double] containing the result.
   55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
   56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
   57 { /* Plain multiply-add: operands are forwarded to the builtin unchanged. */
   58   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
   59 }
  60
  61 /// Computes a scalar multiply-add of the single-precision values in the
  62 ///    low 32 bits of 128-bit vectors of [4 x float].
  63 /// \code
  64 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
  65 /// result[127:32] = __A[127:32]
  66 /// \endcode
  67 ///
  68 /// \headerfile <immintrin.h>
  69 ///
  70 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
  71 ///
  72 /// \param __A
  73 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
  74 ///    32 bits.
  75 /// \param __B
  76 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
  77 ///    32 bits.
  78 /// \param __C
  79 ///    A 128-bit vector of [4 x float] containing the addend in the low
  80 ///    32 bits.
  81 /// \returns A 128-bit vector of [4 x float] containing the result in the low
  82 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
   83 static __inline__ __m128 __DEFAULT_FN_ATTRS128
   84 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
   85 { /* Scalar multiply-add: operands are forwarded to the builtin unchanged. */
   86   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
   87 }
  88
  89 /// Computes a scalar multiply-add of the double-precision values in the
  90 ///    low 64 bits of 128-bit vectors of [2 x double].
  91 /// \code
  92 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
  93 /// result[127:64] = __A[127:64]
  94 /// \endcode
  95 ///
  96 /// \headerfile <immintrin.h>
  97 ///
  98 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
  99 ///
 100 /// \param __A
 101 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 102 ///    64 bits.
 103 /// \param __B
 104 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 105 ///    64 bits.
 106 /// \param __C
 107 ///    A 128-bit vector of [2 x double] containing the addend in the low
 108 ///    64 bits.
 109 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 110 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
  111 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  112 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
  113 { /* Scalar multiply-add: operands are forwarded to the builtin unchanged. */
  114   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
  115 }
 116
 117 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
 118 ///    For each element, computes <c> (__A * __B) - __C </c>.
 119 ///
 120 /// \headerfile <immintrin.h>
 121 ///
 122 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
 123 ///
 124 /// \param __A
 125 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 126 /// \param __B
 127 ///    A 128-bit vector of [4 x float] containing the multiplier.
 128 /// \param __C
 129 ///    A 128-bit vector of [4 x float] containing the subtrahend.
 130 /// \returns A 128-bit vector of [4 x float] containing the result.
  131 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  132 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
  133 { /* (__A * __B) - __C is expressed as a multiply-add with __C negated. */
  134   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
  135 }
 136
 137 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
 138 ///    For each element, computes <c> (__A * __B) - __C </c>.
 139 ///
 140 /// \headerfile <immintrin.h>
 141 ///
 142 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
 143 ///
 144 /// \param __A
 145 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 146 /// \param __B
 147 ///    A 128-bit vector of [2 x double] containing the multiplier.
 148 /// \param __C
  149 ///    A 128-bit vector of [2 x double] containing the subtrahend.
 150 /// \returns A 128-bit vector of [2 x double] containing the result.
  151 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  152 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
  153 { /* (__A * __B) - __C is expressed as a multiply-add with __C negated. */
  154   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
  155 }
 156
 157 /// Computes a scalar multiply-subtract of the single-precision values in
 158 ///    the low 32 bits of 128-bit vectors of [4 x float].
 159 /// \code
 160 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 161 /// result[127:32] = __A[127:32]
 162 /// \endcode
 163 ///
 164 /// \headerfile <immintrin.h>
 165 ///
 166 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
 167 ///
 168 /// \param __A
 169 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
 170 ///    32 bits.
 171 /// \param __B
 172 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
 173 ///    32 bits.
 174 /// \param __C
 175 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
  176 ///    32 bits.
 177 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 178 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
  179 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  180 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
  181 { /* Scalar multiply-subtract: a scalar multiply-add with __C negated. */
  182   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
  183 }
 184
 185 /// Computes a scalar multiply-subtract of the double-precision values in
 186 ///    the low 64 bits of 128-bit vectors of [2 x double].
 187 /// \code
 188 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 189 /// result[127:64] = __A[127:64]
 190 /// \endcode
 191 ///
 192 /// \headerfile <immintrin.h>
 193 ///
 194 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
 195 ///
 196 /// \param __A
 197 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 198 ///    64 bits.
 199 /// \param __B
 200 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 201 ///    64 bits.
 202 /// \param __C
 203 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
 204 ///    64 bits.
 205 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 206 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
  207 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  208 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
  209 { /* Scalar multiply-subtract: a scalar multiply-add with __C negated. */
  210   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
  211 }
 212
 213 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
 214 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 215 ///
 216 /// \headerfile <immintrin.h>
 217 ///
  218 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
 219 ///
 220 /// \param __A
 221 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 222 /// \param __B
 223 ///    A 128-bit vector of [4 x float] containing the multiplier.
 224 /// \param __C
 225 ///    A 128-bit vector of [4 x float] containing the addend.
  226 /// \returns A 128-bit vector of [4 x float] containing the result.
  227 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  228 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
  229 { /* -(__A * __B) + __C is expressed as a multiply-add with __A negated. */
  230   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
  231 }
 232
 233 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
 234 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 235 ///
 236 /// \headerfile <immintrin.h>
 237 ///
 238 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
 239 ///
 240 /// \param __A
 241 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 242 /// \param __B
 243 ///    A 128-bit vector of [2 x double] containing the multiplier.
 244 /// \param __C
 245 ///    A 128-bit vector of [2 x double] containing the addend.
 246 /// \returns A 128-bit vector of [2 x double] containing the result.
  247 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  248 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
  249 { /* -(__A * __B) + __C is expressed as a multiply-add with __A negated. */
  250   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
  251 }
 252
 253 /// Computes a scalar negated multiply-add of the single-precision values in
 254 ///    the low 32 bits of 128-bit vectors of [4 x float].
 255 /// \code
 256 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
 257 /// result[127:32] = __A[127:32]
 258 /// \endcode
 259 ///
 260 /// \headerfile <immintrin.h>
 261 ///
 262 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
 263 ///
 264 /// \param __A
 265 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
 266 ///    32 bits.
 267 /// \param __B
 268 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
 269 ///    32 bits.
 270 /// \param __C
 271 ///    A 128-bit vector of [4 x float] containing the addend in the low
 272 ///    32 bits.
 273 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 274 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
  275 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  276 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
  277 { /* Negate __B, not __A: the upper elements of the result are copied from __A. */
  278   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
  279 }
 280
 281 /// Computes a scalar negated multiply-add of the double-precision values
 282 ///    in the low 64 bits of 128-bit vectors of [2 x double].
 283 /// \code
 284 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
 285 /// result[127:64] = __A[127:64]
 286 /// \endcode
 287 ///
 288 /// \headerfile <immintrin.h>
 289 ///
 290 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
 291 ///
 292 /// \param __A
 293 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 294 ///    64 bits.
 295 /// \param __B
 296 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 297 ///    64 bits.
 298 /// \param __C
 299 ///    A 128-bit vector of [2 x double] containing the addend in the low
 300 ///    64 bits.
 301 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 302 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
  303 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  304 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
  305 { /* Negate __B, not __A: the upper element of the result is copied from __A. */
  306   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
  307 }
 308
 309 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
 310 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 311 ///
 312 /// \headerfile <immintrin.h>
 313 ///
 314 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
 315 ///
 316 /// \param __A
 317 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 318 /// \param __B
 319 ///    A 128-bit vector of [4 x float] containing the multiplier.
 320 /// \param __C
 321 ///    A 128-bit vector of [4 x float] containing the subtrahend.
 322 /// \returns A 128-bit vector of [4 x float] containing the result.
  323 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  324 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
  325 { /* -(__A * __B) - __C is a multiply-add with both __A and __C negated. */
  326   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
  327 }
 328
 329 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
 330 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 331 ///
 332 /// \headerfile <immintrin.h>
 333 ///
 334 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
 335 ///
 336 /// \param __A
 337 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 338 /// \param __B
 339 ///    A 128-bit vector of [2 x double] containing the multiplier.
 340 /// \param __C
 341 ///    A 128-bit vector of [2 x double] containing the subtrahend.
 342 /// \returns A 128-bit vector of [2 x double] containing the result.
  343 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  344 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
  345 { /* -(__A * __B) - __C is a multiply-add with both __A and __C negated. */
  346   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
  347 }
 348
 349 /// Computes a scalar negated multiply-subtract of the single-precision
 350 ///    values in the low 32 bits of 128-bit vectors of [4 x float].
 351 /// \code
 352 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
 353 /// result[127:32] = __A[127:32]
 354 /// \endcode
 355 ///
 356 /// \headerfile <immintrin.h>
 357 ///
 358 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
 359 ///
 360 /// \param __A
 361 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
 362 ///    32 bits.
 363 /// \param __B
 364 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
 365 ///    32 bits.
 366 /// \param __C
 367 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
 368 ///    32 bits.
 369 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 370 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
  371 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  372 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
  373 { /* Negate __B and __C; __A stays intact because its upper elements are returned. */
  374   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
  375 }
 376
 377 /// Computes a scalar negated multiply-subtract of the double-precision
 378 ///    values in the low 64 bits of 128-bit vectors of [2 x double].
 379 /// \code
 380 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
 381 /// result[127:64] = __A[127:64]
 382 /// \endcode
 383 ///
 384 /// \headerfile <immintrin.h>
 385 ///
 386 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
 387 ///
 388 /// \param __A
 389 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 390 ///    64 bits.
 391 /// \param __B
 392 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 393 ///    64 bits.
 394 /// \param __C
 395 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
 396 ///    64 bits.
 397 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 398 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
  399 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  400 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
  401 { /* Negate __B and __C; __A stays intact because its upper element is returned. */
  402   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
  403 }
 404
 405 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 406 ///    [4 x float].
 407 /// \code
 408 /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
 409 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 410 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
 411 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
 412 /// \endcode
 413 ///
 414 /// \headerfile <immintrin.h>
 415 ///
 416 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
 417 ///
 418 /// \param __A
 419 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 420 /// \param __B
 421 ///    A 128-bit vector of [4 x float] containing the multiplier.
 422 /// \param __C
 423 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
 424 /// \returns A 128-bit vector of [4 x float] containing the result.
  425 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  426 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
  427 { /* Operands are forwarded to the fmaddsub builtin unchanged. */
  428   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
  429 }
 430
 431 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 432 ///    [2 x double].
 433 /// \code
 434 /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
 435 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 436 /// \endcode
 437 ///
 438 /// \headerfile <immintrin.h>
 439 ///
 440 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
 441 ///
 442 /// \param __A
 443 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 444 /// \param __B
 445 ///    A 128-bit vector of [2 x double] containing the multiplier.
 446 /// \param __C
 447 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
 448 /// \returns A 128-bit vector of [2 x double] containing the result.
  449 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  450 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
  451 { /* Operands are forwarded to the fmaddsub builtin unchanged. */
  452   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
  453 }
 454
 455 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 456 ///    [4 x float].
 457 /// \code
 458 /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
 459 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 460 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
  461 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
 462 /// \endcode
 463 ///
 464 /// \headerfile <immintrin.h>
 465 ///
 466 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
 467 ///
 468 /// \param __A
 469 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 470 /// \param __B
 471 ///    A 128-bit vector of [4 x float] containing the multiplier.
 472 /// \param __C
 473 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
 474 /// \returns A 128-bit vector of [4 x float] containing the result.
  475 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  476 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
  477 { /* Negating __C turns the fmaddsub subtract/add pattern into add/subtract. */
  478   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
  479 }
 480
 481 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 482 ///    [2 x double].
 483 /// \code
 484 /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
 485 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 486 /// \endcode
 487 ///
 488 /// \headerfile <immintrin.h>
 489 ///
  490 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
 491 ///
 492 /// \param __A
 493 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 494 /// \param __B
 495 ///    A 128-bit vector of [2 x double] containing the multiplier.
 496 /// \param __C
 497 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
 498 /// \returns A 128-bit vector of [2 x double] containing the result.
  499 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  500 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
  501 { /* Negating __C turns the fmaddsub subtract/add pattern into add/subtract. */
  502   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
  503 }
 504
 505 /// Computes a multiply-add of 256-bit vectors of [8 x float].
 506 ///    For each element, computes <c> (__A * __B) + __C </c>.
 507 ///
 508 /// \headerfile <immintrin.h>
 509 ///
 510 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
 511 ///
 512 /// \param __A
 513 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 514 /// \param __B
 515 ///    A 256-bit vector of [8 x float] containing the multiplier.
 516 /// \param __C
 517 ///    A 256-bit vector of [8 x float] containing the addend.
 518 /// \returns A 256-bit vector of [8 x float] containing the result.
  519 static __inline__ __m256 __DEFAULT_FN_ATTRS256
  520 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
  521 { /* Plain multiply-add: operands are forwarded to the builtin unchanged. */
  522   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
  523 }
 524
 525 /// Computes a multiply-add of 256-bit vectors of [4 x double].
 526 ///    For each element, computes <c> (__A * __B) + __C </c>.
 527 ///
 528 /// \headerfile <immintrin.h>
 529 ///
 530 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
 531 ///
 532 /// \param __A
 533 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 534 /// \param __B
 535 ///    A 256-bit vector of [4 x double] containing the multiplier.
 536 /// \param __C
 537 ///    A 256-bit vector of [4 x double] containing the addend.
 538 /// \returns A 256-bit vector of [4 x double] containing the result.
  539 static __inline__ __m256d __DEFAULT_FN_ATTRS256
  540 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
  541 { /* Plain multiply-add: operands are forwarded to the builtin unchanged. */
  542   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
  543 }
 544
 545 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
 546 ///    For each element, computes <c> (__A * __B) - __C </c>.
 547 ///
 548 /// \headerfile <immintrin.h>
 549 ///
 550 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
 551 ///
 552 /// \param __A
 553 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 554 /// \param __B
 555 ///    A 256-bit vector of [8 x float] containing the multiplier.
 556 /// \param __C
 557 ///    A 256-bit vector of [8 x float] containing the subtrahend.
 558 /// \returns A 256-bit vector of [8 x float] containing the result.
  559 static __inline__ __m256 __DEFAULT_FN_ATTRS256
  560 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
  561 { /* (__A * __B) - __C is expressed as a multiply-add with __C negated. */
  562   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
  563 }
 564
 565 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
 566 ///    For each element, computes <c> (__A * __B) - __C </c>.
 567 ///
 568 /// \headerfile <immintrin.h>
 569 ///
 570 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
 571 ///
 572 /// \param __A
 573 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 574 /// \param __B
 575 ///    A 256-bit vector of [4 x double] containing the multiplier.
 576 /// \param __C
 577 ///    A 256-bit vector of [4 x double] containing the subtrahend.
 578 /// \returns A 256-bit vector of [4 x double] containing the result.
  579 static __inline__ __m256d __DEFAULT_FN_ATTRS256
  580 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
  581 { /* (__A * __B) - __C is expressed as a multiply-add with __C negated. */
  582   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
  583 }
 584
 585 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
 586 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 587 ///
 588 /// \headerfile <immintrin.h>
 589 ///
 590 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
 591 ///
 592 /// \param __A
 593 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 594 /// \param __B
 595 ///    A 256-bit vector of [8 x float] containing the multiplier.
 596 /// \param __C
 597 ///    A 256-bit vector of [8 x float] containing the addend.
 598 /// \returns A 256-bit vector of [8 x float] containing the result.
  599 static __inline__ __m256 __DEFAULT_FN_ATTRS256
  600 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
  601 { /* -(__A * __B) + __C is expressed as a multiply-add with __A negated. */
  602   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
  603 }
 604
 605 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
 606 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 607 ///
 608 /// \headerfile <immintrin.h>
 609 ///
 610 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
 611 ///
 612 /// \param __A
 613 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 614 /// \param __B
 615 ///    A 256-bit vector of [4 x double] containing the multiplier.
 616 /// \param __C
 617 ///    A 256-bit vector of [4 x double] containing the addend.
 618 /// \returns A 256-bit vector of [4 x double] containing the result.
  619 static __inline__ __m256d __DEFAULT_FN_ATTRS256
  620 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
  621 { /* -(__A * __B) + __C is expressed as a multiply-add with __A negated. */
  622   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
  623 }
 624
 625 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
 626 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 627 ///
 628 /// \headerfile <immintrin.h>
 629 ///
 630 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
 631 ///
 632 /// \param __A
 633 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 634 /// \param __B
 635 ///    A 256-bit vector of [8 x float] containing the multiplier.
 636 /// \param __C
 637 ///    A 256-bit vector of [8 x float] containing the subtrahend.
 638 /// \returns A 256-bit vector of [8 x float] containing the result.
  639 static __inline__ __m256 __DEFAULT_FN_ATTRS256
  640 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
  641 { /* -(__A * __B) - __C is a multiply-add with both __A and __C negated. */
  642   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
  643 }
 644
 645 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
 646 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 647 ///
 648 /// \headerfile <immintrin.h>
 649 ///
 650 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
 651 ///
 652 /// \param __A
 653 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 654 /// \param __B
 655 ///    A 256-bit vector of [4 x double] containing the multiplier.
 656 /// \param __C
 657 ///    A 256-bit vector of [4 x double] containing the subtrahend.
 658 /// \returns A 256-bit vector of [4 x double] containing the result.
  659 static __inline__ __m256d __DEFAULT_FN_ATTRS256
  660 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
  661 { /* -(__A * __B) - __C is a multiply-add with both __A and __C negated. */
  662   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
  663 }
 664
 665 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 666 ///    [8 x float].
 667 /// \code
 668 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 669 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 670 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
 671 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
 672 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
 673 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
 674 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
 675 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
 676 /// \endcode
 677 ///
 678 /// \headerfile <immintrin.h>
 679 ///
 680 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
 681 ///
 682 /// \param __A
 683 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 684 /// \param __B
 685 ///    A 256-bit vector of [8 x float] containing the multiplier.
 686 /// \param __C
 687 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
 688 /// \returns A 256-bit vector of [8 x float] containing the result.
  689 static __inline__ __m256 __DEFAULT_FN_ATTRS256
  690 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
  691 { /* Operands are forwarded to the fmaddsub builtin unchanged. */
  692   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
  693 }
 694
 695 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 696 ///    [4 x double].
 697 /// \code
 698 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 699 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 700 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
 701 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
 702 /// \endcode
 703 ///
 704 /// \headerfile <immintrin.h>
 705 ///
 706 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
 707 ///
 708 /// \param __A
 709 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 710 /// \param __B
 711 ///    A 256-bit vector of [4 x double] containing the multiplier.
 712 /// \param __C
 713 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
 714 /// \returns A 256-bit vector of [4 x double] containing the result.
  715 static __inline__ __m256d __DEFAULT_FN_ATTRS256
  716 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
  717 { /* Operands are forwarded to the fmaddsub builtin unchanged. */
  718   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
  719 }
 720
 721 /// Computes a vector multiply with alternating add/subtract of 256-bit
 722 ///    vectors of [8 x float].
 723 /// \code
 724 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
 725 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 726 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
 727 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
 728 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
 729 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
 730 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
 731 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
 732 /// \endcode
 733 ///
 734 /// \headerfile <immintrin.h>
 735 ///
 736 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
 737 ///
 738 /// \param __A
 739 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 740 /// \param __B
 741 ///    A 256-bit vector of [8 x float] containing the multiplier.
 742 /// \param __C
 743 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
 744 /// \returns A 256-bit vector of [8 x float] containing the result.
  745 static __inline__ __m256 __DEFAULT_FN_ATTRS256
  746 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
  747 { /* Negating __C turns the fmaddsub subtract/add pattern into add/subtract. */
  748   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
  749 }
 750
 751 /// Computes a vector multiply with alternating add/subtract of 256-bit
 752 ///    vectors of [4 x double].
 753 /// \code
 754 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
 755 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 756 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
 757 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
 758 /// \endcode
 759 ///
 760 /// \headerfile <immintrin.h>
 761 ///
 762 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
 763 ///
 764 /// \param __A
 765 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 766 /// \param __B
 767 ///    A 256-bit vector of [4 x double] containing the multiplier.
 768 /// \param __C
 769 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
 770 /// \returns A 256-bit vector of [4 x double] containing the result.
  771 static __inline__ __m256d __DEFAULT_FN_ATTRS256
  772 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
  773 { /* Negating __C turns the fmaddsub subtract/add pattern into add/subtract. */
  774   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
  775 }
 776
  777 #undef __DEFAULT_FN_ATTRS128 /* The attribute macros are helpers local to this header. */
  778 #undef __DEFAULT_FN_ATTRS256
 779
 780 #endif /* __FMAINTRIN_H */