/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
/* Define the default attributes for the functions in this file.
 *
 * Both macros request forced inlining, no debug info, and the "fma" target
 * feature; they differ only in the minimum vector width they declare
 * (128 bits for the __m128/__m128d intrinsics, 256 bits for the
 * __m256/__m256d intrinsics). Both are #undef'ed at the end of this header.
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(256)))
21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
22 /// For each element, computes <c> (__A * __B) + __C </c>.
24 /// \headerfile <immintrin.h>
26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
29 /// A 128-bit vector of [4 x float] containing the multiplicand.
31 /// A 128-bit vector of [4 x float] containing the multiplier.
33 /// A 128-bit vector of [4 x float] containing the addend.
34 /// \returns A 128-bit vector of [4 x float] containing the result.
35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
36 _mm_fmadd_ps(__m128 __A
, __m128 __B
, __m128 __C
)
38 return (__m128
)__builtin_ia32_vfmaddps((__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
42 /// For each element, computes <c> (__A * __B) + __C </c>.
44 /// \headerfile <immintrin.h>
46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
49 /// A 128-bit vector of [2 x double] containing the multiplicand.
51 /// A 128-bit vector of [2 x double] containing the multiplier.
53 /// A 128-bit vector of [2 x double] containing the addend.
54 /// \returns A 128-bit [2 x double] vector containing the result.
55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
56 _mm_fmadd_pd(__m128d __A
, __m128d __B
, __m128d __C
)
58 return (__m128d
)__builtin_ia32_vfmaddpd((__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
61 /// Computes a scalar multiply-add of the single-precision values in the
62 /// low 32 bits of 128-bit vectors of [4 x float].
65 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
66 /// result[127:32] = __A[127:32]
69 /// \headerfile <immintrin.h>
71 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
74 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
77 /// A 128-bit vector of [4 x float] containing the multiplier in the low
80 /// A 128-bit vector of [4 x float] containing the addend in the low
82 /// \returns A 128-bit vector of [4 x float] containing the result in the low
83 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
84 static __inline__ __m128 __DEFAULT_FN_ATTRS128
85 _mm_fmadd_ss(__m128 __A
, __m128 __B
, __m128 __C
)
87 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
90 /// Computes a scalar multiply-add of the double-precision values in the
91 /// low 64 bits of 128-bit vectors of [2 x double].
94 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
95 /// result[127:64] = __A[127:64]
98 /// \headerfile <immintrin.h>
100 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
103 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
106 /// A 128-bit vector of [2 x double] containing the multiplier in the low
109 /// A 128-bit vector of [2 x double] containing the addend in the low
111 /// \returns A 128-bit vector of [2 x double] containing the result in the low
112 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
113 static __inline__ __m128d __DEFAULT_FN_ATTRS128
114 _mm_fmadd_sd(__m128d __A
, __m128d __B
, __m128d __C
)
116 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
119 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
120 /// For each element, computes <c> (__A * __B) - __C </c>.
122 /// \headerfile <immintrin.h>
124 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
127 /// A 128-bit vector of [4 x float] containing the multiplicand.
129 /// A 128-bit vector of [4 x float] containing the multiplier.
131 /// A 128-bit vector of [4 x float] containing the subtrahend.
132 /// \returns A 128-bit vector of [4 x float] containing the result.
133 static __inline__ __m128 __DEFAULT_FN_ATTRS128
134 _mm_fmsub_ps(__m128 __A
, __m128 __B
, __m128 __C
)
136 return (__m128
)__builtin_ia32_vfmaddps((__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
139 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
140 /// For each element, computes <c> (__A * __B) - __C </c>.
142 /// \headerfile <immintrin.h>
144 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
147 /// A 128-bit vector of [2 x double] containing the multiplicand.
149 /// A 128-bit vector of [2 x double] containing the multiplier.
151 /// A 128-bit vector of [2 x double] containing the addend.
152 /// \returns A 128-bit vector of [2 x double] containing the result.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS128
154 _mm_fmsub_pd(__m128d __A
, __m128d __B
, __m128d __C
)
156 return (__m128d
)__builtin_ia32_vfmaddpd((__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
159 /// Computes a scalar multiply-subtract of the single-precision values in
160 /// the low 32 bits of 128-bit vectors of [4 x float].
162 /// \code{.operation}
163 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
164 /// result[127:32] = __A[127:32]
167 /// \headerfile <immintrin.h>
169 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
172 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
175 /// A 128-bit vector of [4 x float] containing the multiplier in the low
178 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
180 /// \returns A 128-bit vector of [4 x float] containing the result in the low
181 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
182 static __inline__ __m128 __DEFAULT_FN_ATTRS128
183 _mm_fmsub_ss(__m128 __A
, __m128 __B
, __m128 __C
)
185 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
188 /// Computes a scalar multiply-subtract of the double-precision values in
189 /// the low 64 bits of 128-bit vectors of [2 x double].
191 /// \code{.operation}
192 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
193 /// result[127:64] = __A[127:64]
196 /// \headerfile <immintrin.h>
198 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
201 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
204 /// A 128-bit vector of [2 x double] containing the multiplier in the low
207 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
209 /// \returns A 128-bit vector of [2 x double] containing the result in the low
210 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
211 static __inline__ __m128d __DEFAULT_FN_ATTRS128
212 _mm_fmsub_sd(__m128d __A
, __m128d __B
, __m128d __C
)
214 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
217 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
218 /// For each element, computes <c> -(__A * __B) + __C </c>.
220 /// \headerfile <immintrin.h>
222 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
225 /// A 128-bit vector of [4 x float] containing the multiplicand.
227 /// A 128-bit vector of [4 x float] containing the multiplier.
229 /// A 128-bit vector of [4 x float] containing the addend.
230 /// \returns A 128-bit [4 x float] vector containing the result.
231 static __inline__ __m128 __DEFAULT_FN_ATTRS128
232 _mm_fnmadd_ps(__m128 __A
, __m128 __B
, __m128 __C
)
234 return (__m128
)__builtin_ia32_vfmaddps(-(__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
237 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
238 /// For each element, computes <c> -(__A * __B) + __C </c>.
240 /// \headerfile <immintrin.h>
242 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
245 /// A 128-bit vector of [2 x double] containing the multiplicand.
247 /// A 128-bit vector of [2 x double] containing the multiplier.
249 /// A 128-bit vector of [2 x double] containing the addend.
250 /// \returns A 128-bit vector of [2 x double] containing the result.
251 static __inline__ __m128d __DEFAULT_FN_ATTRS128
252 _mm_fnmadd_pd(__m128d __A
, __m128d __B
, __m128d __C
)
254 return (__m128d
)__builtin_ia32_vfmaddpd(-(__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
257 /// Computes a scalar negated multiply-add of the single-precision values in
258 /// the low 32 bits of 128-bit vectors of [4 x float].
260 /// \code{.operation}
261 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
262 /// result[127:32] = __A[127:32]
265 /// \headerfile <immintrin.h>
267 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
270 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
273 /// A 128-bit vector of [4 x float] containing the multiplier in the low
276 /// A 128-bit vector of [4 x float] containing the addend in the low
278 /// \returns A 128-bit vector of [4 x float] containing the result in the low
279 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
280 static __inline__ __m128 __DEFAULT_FN_ATTRS128
281 _mm_fnmadd_ss(__m128 __A
, __m128 __B
, __m128 __C
)
283 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, -(__v4sf
)__B
, (__v4sf
)__C
);
286 /// Computes a scalar negated multiply-add of the double-precision values
287 /// in the low 64 bits of 128-bit vectors of [2 x double].
289 /// \code{.operation}
290 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
291 /// result[127:64] = __A[127:64]
294 /// \headerfile <immintrin.h>
296 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
299 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
302 /// A 128-bit vector of [2 x double] containing the multiplier in the low
305 /// A 128-bit vector of [2 x double] containing the addend in the low
307 /// \returns A 128-bit vector of [2 x double] containing the result in the low
308 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
309 static __inline__ __m128d __DEFAULT_FN_ATTRS128
310 _mm_fnmadd_sd(__m128d __A
, __m128d __B
, __m128d __C
)
312 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, -(__v2df
)__B
, (__v2df
)__C
);
315 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
316 /// For each element, computes <c> -(__A * __B) - __C </c>.
318 /// \headerfile <immintrin.h>
320 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
323 /// A 128-bit vector of [4 x float] containing the multiplicand.
325 /// A 128-bit vector of [4 x float] containing the multiplier.
327 /// A 128-bit vector of [4 x float] containing the subtrahend.
328 /// \returns A 128-bit vector of [4 x float] containing the result.
329 static __inline__ __m128 __DEFAULT_FN_ATTRS128
330 _mm_fnmsub_ps(__m128 __A
, __m128 __B
, __m128 __C
)
332 return (__m128
)__builtin_ia32_vfmaddps(-(__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
335 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
336 /// For each element, computes <c> -(__A * __B) - __C </c>.
338 /// \headerfile <immintrin.h>
340 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
343 /// A 128-bit vector of [2 x double] containing the multiplicand.
345 /// A 128-bit vector of [2 x double] containing the multiplier.
347 /// A 128-bit vector of [2 x double] containing the subtrahend.
348 /// \returns A 128-bit vector of [2 x double] containing the result.
349 static __inline__ __m128d __DEFAULT_FN_ATTRS128
350 _mm_fnmsub_pd(__m128d __A
, __m128d __B
, __m128d __C
)
352 return (__m128d
)__builtin_ia32_vfmaddpd(-(__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
355 /// Computes a scalar negated multiply-subtract of the single-precision
356 /// values in the low 32 bits of 128-bit vectors of [4 x float].
358 /// \code{.operation}
359 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
360 /// result[127:32] = __A[127:32]
363 /// \headerfile <immintrin.h>
365 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
368 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
371 /// A 128-bit vector of [4 x float] containing the multiplier in the low
374 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
376 /// \returns A 128-bit vector of [4 x float] containing the result in the low
377 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
378 static __inline__ __m128 __DEFAULT_FN_ATTRS128
379 _mm_fnmsub_ss(__m128 __A
, __m128 __B
, __m128 __C
)
381 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, -(__v4sf
)__B
, -(__v4sf
)__C
);
384 /// Computes a scalar negated multiply-subtract of the double-precision
385 /// values in the low 64 bits of 128-bit vectors of [2 x double].
387 /// \code{.operation}
388 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
389 /// result[127:64] = __A[127:64]
392 /// \headerfile <immintrin.h>
394 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
397 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
400 /// A 128-bit vector of [2 x double] containing the multiplier in the low
403 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
405 /// \returns A 128-bit vector of [2 x double] containing the result in the low
406 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
407 static __inline__ __m128d __DEFAULT_FN_ATTRS128
408 _mm_fnmsub_sd(__m128d __A
, __m128d __B
, __m128d __C
)
410 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, -(__v2df
)__B
, -(__v2df
)__C
);
413 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
416 /// \code{.operation}
417 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
418 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
419 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
420 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
423 /// \headerfile <immintrin.h>
425 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
428 /// A 128-bit vector of [4 x float] containing the multiplicand.
430 /// A 128-bit vector of [4 x float] containing the multiplier.
432 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
433 /// \returns A 128-bit vector of [4 x float] containing the result.
434 static __inline__ __m128 __DEFAULT_FN_ATTRS128
435 _mm_fmaddsub_ps(__m128 __A
, __m128 __B
, __m128 __C
)
437 return (__m128
)__builtin_ia32_vfmaddsubps((__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
440 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
443 /// \code{.operation}
444 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
445 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
448 /// \headerfile <immintrin.h>
450 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
453 /// A 128-bit vector of [2 x double] containing the multiplicand.
455 /// A 128-bit vector of [2 x double] containing the multiplier.
457 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
458 /// \returns A 128-bit vector of [2 x double] containing the result.
459 static __inline__ __m128d __DEFAULT_FN_ATTRS128
460 _mm_fmaddsub_pd(__m128d __A
, __m128d __B
, __m128d __C
)
462 return (__m128d
)__builtin_ia32_vfmaddsubpd((__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
465 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
468 /// \code{.operation}
469 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
470 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
471 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
472 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
475 /// \headerfile <immintrin.h>
477 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
480 /// A 128-bit vector of [4 x float] containing the multiplicand.
482 /// A 128-bit vector of [4 x float] containing the multiplier.
484 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
485 /// \returns A 128-bit vector of [4 x float] containing the result.
486 static __inline__ __m128 __DEFAULT_FN_ATTRS128
487 _mm_fmsubadd_ps(__m128 __A
, __m128 __B
, __m128 __C
)
489 return (__m128
)__builtin_ia32_vfmaddsubps((__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
492 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
495 /// \code{.operation}
496 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
497 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
500 /// \headerfile <immintrin.h>
502 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
505 /// A 128-bit vector of [2 x double] containing the multiplicand.
507 /// A 128-bit vector of [2 x double] containing the multiplier.
509 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
510 /// \returns A 128-bit vector of [2 x double] containing the result.
511 static __inline__ __m128d __DEFAULT_FN_ATTRS128
512 _mm_fmsubadd_pd(__m128d __A
, __m128d __B
, __m128d __C
)
514 return (__m128d
)__builtin_ia32_vfmaddsubpd((__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
517 /// Computes a multiply-add of 256-bit vectors of [8 x float].
518 /// For each element, computes <c> (__A * __B) + __C </c>.
520 /// \headerfile <immintrin.h>
522 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
525 /// A 256-bit vector of [8 x float] containing the multiplicand.
527 /// A 256-bit vector of [8 x float] containing the multiplier.
529 /// A 256-bit vector of [8 x float] containing the addend.
530 /// \returns A 256-bit vector of [8 x float] containing the result.
531 static __inline__ __m256 __DEFAULT_FN_ATTRS256
532 _mm256_fmadd_ps(__m256 __A
, __m256 __B
, __m256 __C
)
534 return (__m256
)__builtin_ia32_vfmaddps256((__v8sf
)__A
, (__v8sf
)__B
, (__v8sf
)__C
);
537 /// Computes a multiply-add of 256-bit vectors of [4 x double].
538 /// For each element, computes <c> (__A * __B) + __C </c>.
540 /// \headerfile <immintrin.h>
542 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
545 /// A 256-bit vector of [4 x double] containing the multiplicand.
547 /// A 256-bit vector of [4 x double] containing the multiplier.
549 /// A 256-bit vector of [4 x double] containing the addend.
550 /// \returns A 256-bit vector of [4 x double] containing the result.
551 static __inline__ __m256d __DEFAULT_FN_ATTRS256
552 _mm256_fmadd_pd(__m256d __A
, __m256d __B
, __m256d __C
)
554 return (__m256d
)__builtin_ia32_vfmaddpd256((__v4df
)__A
, (__v4df
)__B
, (__v4df
)__C
);
557 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
558 /// For each element, computes <c> (__A * __B) - __C </c>.
560 /// \headerfile <immintrin.h>
562 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
565 /// A 256-bit vector of [8 x float] containing the multiplicand.
567 /// A 256-bit vector of [8 x float] containing the multiplier.
569 /// A 256-bit vector of [8 x float] containing the subtrahend.
570 /// \returns A 256-bit vector of [8 x float] containing the result.
571 static __inline__ __m256 __DEFAULT_FN_ATTRS256
572 _mm256_fmsub_ps(__m256 __A
, __m256 __B
, __m256 __C
)
574 return (__m256
)__builtin_ia32_vfmaddps256((__v8sf
)__A
, (__v8sf
)__B
, -(__v8sf
)__C
);
577 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
578 /// For each element, computes <c> (__A * __B) - __C </c>.
580 /// \headerfile <immintrin.h>
582 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
585 /// A 256-bit vector of [4 x double] containing the multiplicand.
587 /// A 256-bit vector of [4 x double] containing the multiplier.
589 /// A 256-bit vector of [4 x double] containing the subtrahend.
590 /// \returns A 256-bit vector of [4 x double] containing the result.
591 static __inline__ __m256d __DEFAULT_FN_ATTRS256
592 _mm256_fmsub_pd(__m256d __A
, __m256d __B
, __m256d __C
)
594 return (__m256d
)__builtin_ia32_vfmaddpd256((__v4df
)__A
, (__v4df
)__B
, -(__v4df
)__C
);
597 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
598 /// For each element, computes <c> -(__A * __B) + __C </c>.
600 /// \headerfile <immintrin.h>
602 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
605 /// A 256-bit vector of [8 x float] containing the multiplicand.
607 /// A 256-bit vector of [8 x float] containing the multiplier.
609 /// A 256-bit vector of [8 x float] containing the addend.
610 /// \returns A 256-bit vector of [8 x float] containing the result.
611 static __inline__ __m256 __DEFAULT_FN_ATTRS256
612 _mm256_fnmadd_ps(__m256 __A
, __m256 __B
, __m256 __C
)
614 return (__m256
)__builtin_ia32_vfmaddps256(-(__v8sf
)__A
, (__v8sf
)__B
, (__v8sf
)__C
);
617 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
618 /// For each element, computes <c> -(__A * __B) + __C </c>.
620 /// \headerfile <immintrin.h>
622 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
625 /// A 256-bit vector of [4 x double] containing the multiplicand.
627 /// A 256-bit vector of [4 x double] containing the multiplier.
629 /// A 256-bit vector of [4 x double] containing the addend.
630 /// \returns A 256-bit vector of [4 x double] containing the result.
631 static __inline__ __m256d __DEFAULT_FN_ATTRS256
632 _mm256_fnmadd_pd(__m256d __A
, __m256d __B
, __m256d __C
)
634 return (__m256d
)__builtin_ia32_vfmaddpd256(-(__v4df
)__A
, (__v4df
)__B
, (__v4df
)__C
);
637 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
638 /// For each element, computes <c> -(__A * __B) - __C </c>.
640 /// \headerfile <immintrin.h>
642 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
645 /// A 256-bit vector of [8 x float] containing the multiplicand.
647 /// A 256-bit vector of [8 x float] containing the multiplier.
649 /// A 256-bit vector of [8 x float] containing the subtrahend.
650 /// \returns A 256-bit vector of [8 x float] containing the result.
651 static __inline__ __m256 __DEFAULT_FN_ATTRS256
652 _mm256_fnmsub_ps(__m256 __A
, __m256 __B
, __m256 __C
)
654 return (__m256
)__builtin_ia32_vfmaddps256(-(__v8sf
)__A
, (__v8sf
)__B
, -(__v8sf
)__C
);
657 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
658 /// For each element, computes <c> -(__A * __B) - __C </c>.
660 /// \headerfile <immintrin.h>
662 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
665 /// A 256-bit vector of [4 x double] containing the multiplicand.
667 /// A 256-bit vector of [4 x double] containing the multiplier.
669 /// A 256-bit vector of [4 x double] containing the subtrahend.
670 /// \returns A 256-bit vector of [4 x double] containing the result.
671 static __inline__ __m256d __DEFAULT_FN_ATTRS256
672 _mm256_fnmsub_pd(__m256d __A
, __m256d __B
, __m256d __C
)
674 return (__m256d
)__builtin_ia32_vfmaddpd256(-(__v4df
)__A
, (__v4df
)__B
, -(__v4df
)__C
);
677 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
680 /// \code{.operation}
681 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
682 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
683 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
684 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
685 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
686 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
687 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
688 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
691 /// \headerfile <immintrin.h>
693 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
696 /// A 256-bit vector of [8 x float] containing the multiplicand.
698 /// A 256-bit vector of [8 x float] containing the multiplier.
700 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
701 /// \returns A 256-bit vector of [8 x float] containing the result.
702 static __inline__ __m256 __DEFAULT_FN_ATTRS256
703 _mm256_fmaddsub_ps(__m256 __A
, __m256 __B
, __m256 __C
)
705 return (__m256
)__builtin_ia32_vfmaddsubps256((__v8sf
)__A
, (__v8sf
)__B
, (__v8sf
)__C
);
708 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
711 /// \code{.operation}
712 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
713 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
714 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
715 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
718 /// \headerfile <immintrin.h>
720 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
723 /// A 256-bit vector of [4 x double] containing the multiplicand.
725 /// A 256-bit vector of [4 x double] containing the multiplier.
727 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
728 /// \returns A 256-bit vector of [4 x double] containing the result.
729 static __inline__ __m256d __DEFAULT_FN_ATTRS256
730 _mm256_fmaddsub_pd(__m256d __A
, __m256d __B
, __m256d __C
)
732 return (__m256d
)__builtin_ia32_vfmaddsubpd256((__v4df
)__A
, (__v4df
)__B
, (__v4df
)__C
);
735 /// Computes a vector multiply with alternating add/subtract of 256-bit
736 /// vectors of [8 x float].
738 /// \code{.operation}
739 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
740 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
741 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
742 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
743 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
744 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
745 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
746 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
749 /// \headerfile <immintrin.h>
751 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
754 /// A 256-bit vector of [8 x float] containing the multiplicand.
756 /// A 256-bit vector of [8 x float] containing the multiplier.
758 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
759 /// \returns A 256-bit vector of [8 x float] containing the result.
760 static __inline__ __m256 __DEFAULT_FN_ATTRS256
761 _mm256_fmsubadd_ps(__m256 __A
, __m256 __B
, __m256 __C
)
763 return (__m256
)__builtin_ia32_vfmaddsubps256((__v8sf
)__A
, (__v8sf
)__B
, -(__v8sf
)__C
);
766 /// Computes a vector multiply with alternating add/subtract of 256-bit
767 /// vectors of [4 x double].
769 /// \code{.operation}
770 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
771 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
772 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
773 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
776 /// \headerfile <immintrin.h>
778 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
781 /// A 256-bit vector of [4 x double] containing the multiplicand.
783 /// A 256-bit vector of [4 x double] containing the multiplier.
785 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
786 /// \returns A 256-bit vector of [4 x double] containing the result.
787 static __inline__ __m256d __DEFAULT_FN_ATTRS256
788 _mm256_fmsubadd_pd(__m256d __A
, __m256d __B
, __m256d __C
)
790 return (__m256d
)__builtin_ia32_vfmaddsubpd256((__v4df
)__A
, (__v4df
)__B
, -(__v4df
)__C
);
793 #undef __DEFAULT_FN_ATTRS128
794 #undef __DEFAULT_FN_ATTRS256
796 #endif /* __FMAINTRIN_H */