clang/lib/Headers/fmaintrin.h

   1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __IMMINTRIN_H
  11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
  12 #endif
  13
  14 #ifndef __FMAINTRIN_H
  15 #define __FMAINTRIN_H
  16
  17 /* Define the default attributes for the functions in this file. */
  18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
  19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
  20
  21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
  22 ///    For each element, computes <c> (__A * __B) + __C </c>.
  23 ///
  24 /// \headerfile <immintrin.h>
  25 ///
  26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
  27 ///
  28 /// \param __A
  29 ///    A 128-bit vector of [4 x float] containing the multiplicand.
  30 /// \param __B
  31 ///    A 128-bit vector of [4 x float] containing the multiplier.
  32 /// \param __C
  33 ///    A 128-bit vector of [4 x float] containing the addend.
  34 /// \returns A 128-bit vector of [4 x float] containing the result.
  35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
  37 {
  38   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
  39 }
  40
  41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
  42 ///    For each element, computes <c> (__A * __B) + __C </c>.
  43 ///
  44 /// \headerfile <immintrin.h>
  45 ///
  46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
  47 ///
  48 /// \param __A
  49 ///    A 128-bit vector of [2 x double] containing the multiplicand.
  50 /// \param __B
  51 ///    A 128-bit vector of [2 x double] containing the multiplier.
  52 /// \param __C
  53 ///    A 128-bit vector of [2 x double] containing the addend.
  54 /// \returns A 128-bit [2 x double] vector containing the result.
  55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
  56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
  57 {
  58   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
  59 }
  60
  61 /// Computes a scalar multiply-add of the single-precision values in the
  62 ///    low 32 bits of 128-bit vectors of [4 x float].
  63 ///
  64 /// \code{.operation}
  65 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
  66 /// result[127:32] = __A[127:32]
  67 /// \endcode
  68 ///
  69 /// \headerfile <immintrin.h>
  70 ///
  71 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
  72 ///
  73 /// \param __A
  74 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
  75 ///    32 bits.
  76 /// \param __B
  77 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
  78 ///    32 bits.
  79 /// \param __C
  80 ///    A 128-bit vector of [4 x float] containing the addend in the low
  81 ///    32 bits.
  82 /// \returns A 128-bit vector of [4 x float] containing the result in the low
  83 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
  84 static __inline__ __m128 __DEFAULT_FN_ATTRS128
  85 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
  86 {
  87   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
  88 }
  89
  90 /// Computes a scalar multiply-add of the double-precision values in the
  91 ///    low 64 bits of 128-bit vectors of [2 x double].
  92 ///
  93 /// \code{.operation}
  94 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
  95 /// result[127:64] = __A[127:64]
  96 /// \endcode
  97 ///
  98 /// \headerfile <immintrin.h>
  99 ///
 100 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
 101 ///
 102 /// \param __A
 103 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 104 ///    64 bits.
 105 /// \param __B
 106 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 107 ///    64 bits.
 108 /// \param __C
 109 ///    A 128-bit vector of [2 x double] containing the addend in the low
 110 ///    64 bits.
 111 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 112 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
 113 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 114 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 115 {
 116   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
 117 }
 118
 119 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
 120 ///    For each element, computes <c> (__A * __B) - __C </c>.
 121 ///
 122 /// \headerfile <immintrin.h>
 123 ///
 124 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
 125 ///
 126 /// \param __A
 127 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 128 /// \param __B
 129 ///    A 128-bit vector of [4 x float] containing the multiplier.
 130 /// \param __C
 131 ///    A 128-bit vector of [4 x float] containing the subtrahend.
 132 /// \returns A 128-bit vector of [4 x float] containing the result.
 133 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 134 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 135 {
 136   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 137 }
 138
 139 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
 140 ///    For each element, computes <c> (__A * __B) - __C </c>.
 141 ///
 142 /// \headerfile <immintrin.h>
 143 ///
 144 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
 145 ///
 146 /// \param __A
 147 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 148 /// \param __B
 149 ///    A 128-bit vector of [2 x double] containing the multiplier.
 150 /// \param __C
 151 ///    A 128-bit vector of [2 x double] containing the addend.
 152 /// \returns A 128-bit vector of [2 x double] containing the result.
 153 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 154 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 155 {
 156   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 157 }
 158
 159 /// Computes a scalar multiply-subtract of the single-precision values in
 160 ///    the low 32 bits of 128-bit vectors of [4 x float].
 161 ///
 162 /// \code{.operation}
 163 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 164 /// result[127:32] = __A[127:32]
 165 /// \endcode
 166 ///
 167 /// \headerfile <immintrin.h>
 168 ///
 169 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
 170 ///
 171 /// \param __A
 172 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
 173 ///    32 bits.
 174 /// \param __B
 175 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
 176 ///    32 bits.
 177 /// \param __C
 178 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
 179 ///   32 bits.
 180 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 181 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 182 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 183 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 184 {
 185   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 186 }
 187
 188 /// Computes a scalar multiply-subtract of the double-precision values in
 189 ///    the low 64 bits of 128-bit vectors of [2 x double].
 190 ///
 191 /// \code{.operation}
 192 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 193 /// result[127:64] = __A[127:64]
 194 /// \endcode
 195 ///
 196 /// \headerfile <immintrin.h>
 197 ///
 198 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
 199 ///
 200 /// \param __A
 201 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 202 ///    64 bits.
 203 /// \param __B
 204 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 205 ///    64 bits.
 206 /// \param __C
 207 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
 208 ///    64 bits.
 209 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 210 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 211 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 212 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 213 {
 214   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 215 }
 216
 217 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
 218 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 219 ///
 220 /// \headerfile <immintrin.h>
 221 ///
 222 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
 223 ///
 224 /// \param __A
 225 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 226 /// \param __B
 227 ///    A 128-bit vector of [4 x float] containing the multiplier.
 228 /// \param __C
 229 ///    A 128-bit vector of [4 x float] containing the addend.
 230 /// \returns A 128-bit [4 x float] vector containing the result.
 231 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 232 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 233 {
 234   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 235 }
 236
 237 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
 238 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 239 ///
 240 /// \headerfile <immintrin.h>
 241 ///
 242 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
 243 ///
 244 /// \param __A
 245 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 246 /// \param __B
 247 ///    A 128-bit vector of [2 x double] containing the multiplier.
 248 /// \param __C
 249 ///    A 128-bit vector of [2 x double] containing the addend.
 250 /// \returns A 128-bit vector of [2 x double] containing the result.
 251 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 252 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 253 {
 254   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 255 }
 256
 257 /// Computes a scalar negated multiply-add of the single-precision values in
 258 ///    the low 32 bits of 128-bit vectors of [4 x float].
 259 ///
 260 /// \code{.operation}
 261 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
 262 /// result[127:32] = __A[127:32]
 263 /// \endcode
 264 ///
 265 /// \headerfile <immintrin.h>
 266 ///
 267 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
 268 ///
 269 /// \param __A
 270 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
 271 ///    32 bits.
 272 /// \param __B
 273 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
 274 ///    32 bits.
 275 /// \param __C
 276 ///    A 128-bit vector of [4 x float] containing the addend in the low
 277 ///    32 bits.
 278 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 279 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 280 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 281 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 282 {
 283   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
 284 }
 285
 286 /// Computes a scalar negated multiply-add of the double-precision values
 287 ///    in the low 64 bits of 128-bit vectors of [2 x double].
 288 ///
 289 /// \code{.operation}
 290 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
 291 /// result[127:64] = __A[127:64]
 292 /// \endcode
 293 ///
 294 /// \headerfile <immintrin.h>
 295 ///
 296 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
 297 ///
 298 /// \param __A
 299 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 300 ///    64 bits.
 301 /// \param __B
 302 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 303 ///    64 bits.
 304 /// \param __C
 305 ///    A 128-bit vector of [2 x double] containing the addend in the low
 306 ///    64 bits.
 307 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 308 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 309 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 310 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 311 {
 312   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
 313 }
 314
 315 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
 316 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 317 ///
 318 /// \headerfile <immintrin.h>
 319 ///
 320 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
 321 ///
 322 /// \param __A
 323 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 324 /// \param __B
 325 ///    A 128-bit vector of [4 x float] containing the multiplier.
 326 /// \param __C
 327 ///    A 128-bit vector of [4 x float] containing the subtrahend.
 328 /// \returns A 128-bit vector of [4 x float] containing the result.
 329 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 330 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 331 {
 332   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 333 }
 334
 335 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
 336 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 337 ///
 338 /// \headerfile <immintrin.h>
 339 ///
 340 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
 341 ///
 342 /// \param __A
 343 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 344 /// \param __B
 345 ///    A 128-bit vector of [2 x double] containing the multiplier.
 346 /// \param __C
 347 ///    A 128-bit vector of [2 x double] containing the subtrahend.
 348 /// \returns A 128-bit vector of [2 x double] containing the result.
 349 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 350 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 351 {
 352   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 353 }
 354
 355 /// Computes a scalar negated multiply-subtract of the single-precision
 356 ///    values in the low 32 bits of 128-bit vectors of [4 x float].
 357 ///
 358 /// \code{.operation}
 359 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
 360 /// result[127:32] = __A[127:32]
 361 /// \endcode
 362 ///
 363 /// \headerfile <immintrin.h>
 364 ///
 365 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
 366 ///
 367 /// \param __A
 368 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
 369 ///    32 bits.
 370 /// \param __B
 371 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
 372 ///    32 bits.
 373 /// \param __C
 374 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
 375 ///    32 bits.
 376 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 377 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 378 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 379 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 380 {
 381   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
 382 }
 383
 384 /// Computes a scalar negated multiply-subtract of the double-precision
 385 ///    values in the low 64 bits of 128-bit vectors of [2 x double].
 386 ///
 387 /// \code{.operation}
 388 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
 389 /// result[127:64] = __A[127:64]
 390 /// \endcode
 391 ///
 392 /// \headerfile <immintrin.h>
 393 ///
 394 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
 395 ///
 396 /// \param __A
 397 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
 398 ///    64 bits.
 399 /// \param __B
 400 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
 401 ///    64 bits.
 402 /// \param __C
 403 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
 404 ///    64 bits.
 405 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 406 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 407 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 408 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 409 {
 410   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
 411 }
 412
 413 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 414 ///    [4 x float].
 415 ///
 416 /// \code{.operation}
 417 /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
 418 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 419 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
 420 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
 421 /// \endcode
 422 ///
 423 /// \headerfile <immintrin.h>
 424 ///
 425 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
 426 ///
 427 /// \param __A
 428 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 429 /// \param __B
 430 ///    A 128-bit vector of [4 x float] containing the multiplier.
 431 /// \param __C
 432 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
 433 /// \returns A 128-bit vector of [4 x float] containing the result.
 434 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 435 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 436 {
 437   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 438 }
 439
 440 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 441 ///    [2 x double].
 442 ///
 443 /// \code{.operation}
 444 /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
 445 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 446 /// \endcode
 447 ///
 448 /// \headerfile <immintrin.h>
 449 ///
 450 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
 451 ///
 452 /// \param __A
 453 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 454 /// \param __B
 455 ///    A 128-bit vector of [2 x double] containing the multiplier.
 456 /// \param __C
 457 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
 458 /// \returns A 128-bit vector of [2 x double] containing the result.
 459 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 460 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 461 {
 462   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 463 }
 464
 465 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 466 ///    [4 x float].
 467 ///
 468 /// \code{.operation}
 469 /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
 470 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 471 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
 472 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
 473 /// \endcode
 474 ///
 475 /// \headerfile <immintrin.h>
 476 ///
 477 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
 478 ///
 479 /// \param __A
 480 ///    A 128-bit vector of [4 x float] containing the multiplicand.
 481 /// \param __B
 482 ///    A 128-bit vector of [4 x float] containing the multiplier.
 483 /// \param __C
 484 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
 485 /// \returns A 128-bit vector of [4 x float] containing the result.
 486 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 487 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 488 {
 489   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 490 }
 491
 492 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 493 ///    [2 x double].
 494 ///
 495 /// \code{.operation}
 496 /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
 497 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 498 /// \endcode
 499 ///
 500 /// \headerfile <immintrin.h>
 501 ///
 502 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
 503 ///
 504 /// \param __A
 505 ///    A 128-bit vector of [2 x double] containing the multiplicand.
 506 /// \param __B
 507 ///    A 128-bit vector of [2 x double] containing the multiplier.
 508 /// \param __C
 509 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
 510 /// \returns A 128-bit vector of [2 x double] containing the result.
 511 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 512 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 513 {
 514   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 515 }
 516
 517 /// Computes a multiply-add of 256-bit vectors of [8 x float].
 518 ///    For each element, computes <c> (__A * __B) + __C </c>.
 519 ///
 520 /// \headerfile <immintrin.h>
 521 ///
 522 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
 523 ///
 524 /// \param __A
 525 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 526 /// \param __B
 527 ///    A 256-bit vector of [8 x float] containing the multiplier.
 528 /// \param __C
 529 ///    A 256-bit vector of [8 x float] containing the addend.
 530 /// \returns A 256-bit vector of [8 x float] containing the result.
 531 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 532 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 533 {
 534   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 535 }
 536
 537 /// Computes a multiply-add of 256-bit vectors of [4 x double].
 538 ///    For each element, computes <c> (__A * __B) + __C </c>.
 539 ///
 540 /// \headerfile <immintrin.h>
 541 ///
 542 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
 543 ///
 544 /// \param __A
 545 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 546 /// \param __B
 547 ///    A 256-bit vector of [4 x double] containing the multiplier.
 548 /// \param __C
 549 ///    A 256-bit vector of [4 x double] containing the addend.
 550 /// \returns A 256-bit vector of [4 x double] containing the result.
 551 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 552 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 553 {
 554   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 555 }
 556
 557 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
 558 ///    For each element, computes <c> (__A * __B) - __C </c>.
 559 ///
 560 /// \headerfile <immintrin.h>
 561 ///
 562 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
 563 ///
 564 /// \param __A
 565 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 566 /// \param __B
 567 ///    A 256-bit vector of [8 x float] containing the multiplier.
 568 /// \param __C
 569 ///    A 256-bit vector of [8 x float] containing the subtrahend.
 570 /// \returns A 256-bit vector of [8 x float] containing the result.
 571 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 572 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 573 {
 574   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 575 }
 576
 577 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
 578 ///    For each element, computes <c> (__A * __B) - __C </c>.
 579 ///
 580 /// \headerfile <immintrin.h>
 581 ///
 582 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
 583 ///
 584 /// \param __A
 585 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 586 /// \param __B
 587 ///    A 256-bit vector of [4 x double] containing the multiplier.
 588 /// \param __C
 589 ///    A 256-bit vector of [4 x double] containing the subtrahend.
 590 /// \returns A 256-bit vector of [4 x double] containing the result.
 591 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 592 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 593 {
 594   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 595 }
 596
 597 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
 598 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 599 ///
 600 /// \headerfile <immintrin.h>
 601 ///
 602 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
 603 ///
 604 /// \param __A
 605 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 606 /// \param __B
 607 ///    A 256-bit vector of [8 x float] containing the multiplier.
 608 /// \param __C
 609 ///    A 256-bit vector of [8 x float] containing the addend.
 610 /// \returns A 256-bit vector of [8 x float] containing the result.
 611 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 612 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 613 {
 614   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 615 }
 616
 617 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
 618 ///    For each element, computes <c> -(__A * __B) + __C </c>.
 619 ///
 620 /// \headerfile <immintrin.h>
 621 ///
 622 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
 623 ///
 624 /// \param __A
 625 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 626 /// \param __B
 627 ///    A 256-bit vector of [4 x double] containing the multiplier.
 628 /// \param __C
 629 ///    A 256-bit vector of [4 x double] containing the addend.
 630 /// \returns A 256-bit vector of [4 x double] containing the result.
 631 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 632 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 633 {
 634   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
 635 }
 636
 637 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
 638 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 639 ///
 640 /// \headerfile <immintrin.h>
 641 ///
 642 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
 643 ///
 644 /// \param __A
 645 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 646 /// \param __B
 647 ///    A 256-bit vector of [8 x float] containing the multiplier.
 648 /// \param __C
 649 ///    A 256-bit vector of [8 x float] containing the subtrahend.
 650 /// \returns A 256-bit vector of [8 x float] containing the result.
 651 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 652 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 653 {
 654   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 655 }
 656
 657 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
 658 ///    For each element, computes <c> -(__A * __B) - __C </c>.
 659 ///
 660 /// \headerfile <immintrin.h>
 661 ///
 662 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
 663 ///
 664 /// \param __A
 665 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 666 /// \param __B
 667 ///    A 256-bit vector of [4 x double] containing the multiplier.
 668 /// \param __C
 669 ///    A 256-bit vector of [4 x double] containing the subtrahend.
 670 /// \returns A 256-bit vector of [4 x double] containing the result.
 671 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 672 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 673 {
 674   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
 675 }
 676
 677 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 678 ///    [8 x float].
 679 ///
 680 /// \code{.operation}
 681 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 682 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 683 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
 684 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
 685 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
 686 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
 687 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
 688 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
 689 /// \endcode
 690 ///
 691 /// \headerfile <immintrin.h>
 692 ///
 693 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
 694 ///
 695 /// \param __A
 696 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 697 /// \param __B
 698 ///    A 256-bit vector of [8 x float] containing the multiplier.
 699 /// \param __C
 700 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
 701 /// \returns A 256-bit vector of [8 x float] containing the result.
 702 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 703 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 704 {
 705   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 706 }
 707
 708 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 709 ///    [4 x double].
 710 ///
 711 /// \code{.operation}
 712 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 713 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 714 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
 715 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
 716 /// \endcode
 717 ///
 718 /// \headerfile <immintrin.h>
 719 ///
 720 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
 721 ///
 722 /// \param __A
 723 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 724 /// \param __B
 725 ///    A 256-bit vector of [4 x double] containing the multiplier.
 726 /// \param __C
 727 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
 728 /// \returns A 256-bit vector of [4 x double] containing the result.
 729 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 730 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 731 {
 732   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 733 }
 734
 735 /// Computes a vector multiply with alternating add/subtract of 256-bit
 736 ///    vectors of [8 x float].
 737 ///
 738 /// \code{.operation}
 739 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
 740 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 741 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
 742 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
 743 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
 744 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
 745 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
 746 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
 747 /// \endcode
 748 ///
 749 /// \headerfile <immintrin.h>
 750 ///
 751 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
 752 ///
 753 /// \param __A
 754 ///    A 256-bit vector of [8 x float] containing the multiplicand.
 755 /// \param __B
 756 ///    A 256-bit vector of [8 x float] containing the multiplier.
 757 /// \param __C
 758 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
 759 /// \returns A 256-bit vector of [8 x float] containing the result.
 760 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 761 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 762 {
 763   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 764 }
 765
 766 /// Computes a vector multiply with alternating add/subtract of 256-bit
 767 ///    vectors of [4 x double].
 768 ///
 769 /// \code{.operation}
 770 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
 771 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 772 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
 773 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
 774 /// \endcode
 775 ///
 776 /// \headerfile <immintrin.h>
 777 ///
 778 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
 779 ///
 780 /// \param __A
 781 ///    A 256-bit vector of [4 x double] containing the multiplicand.
 782 /// \param __B
 783 ///    A 256-bit vector of [4 x double] containing the multiplier.
 784 /// \param __C
 785 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
 786 /// \returns A 256-bit vector of [4 x double] containing the result.
 787 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 788 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 789 {
 790   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 791 }
 792
 793 #undef __DEFAULT_FN_ATTRS128
 794 #undef __DEFAULT_FN_ATTRS256
 795
 796 #endif /* __FMAINTRIN_H */