1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
17 /* Define the default attributes for the functions in this file. */
18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
22 /// For each element, computes <c> (__A * __B) + __C </c>.
24 /// \headerfile <immintrin.h>
26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
29 /// A 128-bit vector of [4 x float] containing the multiplicand.
31 /// A 128-bit vector of [4 x float] containing the multiplier.
33 /// A 128-bit vector of [4 x float] containing the addend.
34 /// \returns A 128-bit vector of [4 x float] containing the result.
35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
36 _mm_fmadd_ps(__m128 __A
, __m128 __B
, __m128 __C
)
38 return (__m128
)__builtin_ia32_vfmaddps((__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
42 /// For each element, computes <c> (__A * __B) + __C </c>.
44 /// \headerfile <immintrin.h>
46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
49 /// A 128-bit vector of [2 x double] containing the multiplicand.
51 /// A 128-bit vector of [2 x double] containing the multiplier.
53 /// A 128-bit vector of [2 x double] containing the addend.
54 /// \returns A 128-bit [2 x double] vector containing the result.
55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
56 _mm_fmadd_pd(__m128d __A
, __m128d __B
, __m128d __C
)
58 return (__m128d
)__builtin_ia32_vfmaddpd((__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
61 /// Computes a scalar multiply-add of the single-precision values in the
62 /// low 32 bits of 128-bit vectors of [4 x float].
64 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
65 /// result[127:32] = __A[127:32]
68 /// \headerfile <immintrin.h>
70 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
73 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
76 /// A 128-bit vector of [4 x float] containing the multiplier in the low
79 /// A 128-bit vector of [4 x float] containing the addend in the low
81 /// \returns A 128-bit vector of [4 x float] containing the result in the low
82 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
83 static __inline__ __m128 __DEFAULT_FN_ATTRS128
84 _mm_fmadd_ss(__m128 __A
, __m128 __B
, __m128 __C
)
86 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
89 /// Computes a scalar multiply-add of the double-precision values in the
90 /// low 64 bits of 128-bit vectors of [2 x double].
92 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
93 /// result[127:64] = __A[127:64]
96 /// \headerfile <immintrin.h>
98 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
101 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
104 /// A 128-bit vector of [2 x double] containing the multiplier in the low
107 /// A 128-bit vector of [2 x double] containing the addend in the low
109 /// \returns A 128-bit vector of [2 x double] containing the result in the low
110 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
111 static __inline__ __m128d __DEFAULT_FN_ATTRS128
112 _mm_fmadd_sd(__m128d __A
, __m128d __B
, __m128d __C
)
114 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
117 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
118 /// For each element, computes <c> (__A * __B) - __C </c>.
120 /// \headerfile <immintrin.h>
122 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
125 /// A 128-bit vector of [4 x float] containing the multiplicand.
127 /// A 128-bit vector of [4 x float] containing the multiplier.
129 /// A 128-bit vector of [4 x float] containing the subtrahend.
130 /// \returns A 128-bit vector of [4 x float] containing the result.
131 static __inline__ __m128 __DEFAULT_FN_ATTRS128
132 _mm_fmsub_ps(__m128 __A
, __m128 __B
, __m128 __C
)
134 return (__m128
)__builtin_ia32_vfmaddps((__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
137 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
138 /// For each element, computes <c> (__A * __B) - __C </c>.
140 /// \headerfile <immintrin.h>
142 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
145 /// A 128-bit vector of [2 x double] containing the multiplicand.
147 /// A 128-bit vector of [2 x double] containing the multiplier.
149 /// A 128-bit vector of [2 x double] containing the addend.
150 /// \returns A 128-bit vector of [2 x double] containing the result.
151 static __inline__ __m128d __DEFAULT_FN_ATTRS128
152 _mm_fmsub_pd(__m128d __A
, __m128d __B
, __m128d __C
)
154 return (__m128d
)__builtin_ia32_vfmaddpd((__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
157 /// Computes a scalar multiply-subtract of the single-precision values in
158 /// the low 32 bits of 128-bit vectors of [4 x float].
160 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
161 /// result[127:32] = __A[127:32]
164 /// \headerfile <immintrin.h>
166 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
169 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
172 /// A 128-bit vector of [4 x float] containing the multiplier in the low
175 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
177 /// \returns A 128-bit vector of [4 x float] containing the result in the low
178 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
179 static __inline__ __m128 __DEFAULT_FN_ATTRS128
180 _mm_fmsub_ss(__m128 __A
, __m128 __B
, __m128 __C
)
182 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
185 /// Computes a scalar multiply-subtract of the double-precision values in
186 /// the low 64 bits of 128-bit vectors of [2 x double].
188 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
189 /// result[127:64] = __A[127:64]
192 /// \headerfile <immintrin.h>
194 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
197 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
200 /// A 128-bit vector of [2 x double] containing the multiplier in the low
203 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
205 /// \returns A 128-bit vector of [2 x double] containing the result in the low
206 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
207 static __inline__ __m128d __DEFAULT_FN_ATTRS128
208 _mm_fmsub_sd(__m128d __A
, __m128d __B
, __m128d __C
)
210 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
213 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
214 /// For each element, computes <c> -(__A * __B) + __C </c>.
216 /// \headerfile <immintrin.h>
218 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
221 /// A 128-bit vector of [4 x float] containing the multiplicand.
223 /// A 128-bit vector of [4 x float] containing the multiplier.
225 /// A 128-bit vector of [4 x float] containing the addend.
226 /// \returns A 128-bit [4 x float] vector containing the result.
227 static __inline__ __m128 __DEFAULT_FN_ATTRS128
228 _mm_fnmadd_ps(__m128 __A
, __m128 __B
, __m128 __C
)
230 return (__m128
)__builtin_ia32_vfmaddps(-(__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
233 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
234 /// For each element, computes <c> -(__A * __B) + __C </c>.
236 /// \headerfile <immintrin.h>
238 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
241 /// A 128-bit vector of [2 x double] containing the multiplicand.
243 /// A 128-bit vector of [2 x double] containing the multiplier.
245 /// A 128-bit vector of [2 x double] containing the addend.
246 /// \returns A 128-bit vector of [2 x double] containing the result.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS128
248 _mm_fnmadd_pd(__m128d __A
, __m128d __B
, __m128d __C
)
250 return (__m128d
)__builtin_ia32_vfmaddpd(-(__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
253 /// Computes a scalar negated multiply-add of the single-precision values in
254 /// the low 32 bits of 128-bit vectors of [4 x float].
256 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
257 /// result[127:32] = __A[127:32]
260 /// \headerfile <immintrin.h>
262 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
265 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
268 /// A 128-bit vector of [4 x float] containing the multiplier in the low
271 /// A 128-bit vector of [4 x float] containing the addend in the low
273 /// \returns A 128-bit vector of [4 x float] containing the result in the low
274 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
275 static __inline__ __m128 __DEFAULT_FN_ATTRS128
276 _mm_fnmadd_ss(__m128 __A
, __m128 __B
, __m128 __C
)
278 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, -(__v4sf
)__B
, (__v4sf
)__C
);
281 /// Computes a scalar negated multiply-add of the double-precision values
282 /// in the low 64 bits of 128-bit vectors of [2 x double].
284 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
285 /// result[127:64] = __A[127:64]
288 /// \headerfile <immintrin.h>
290 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
293 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
296 /// A 128-bit vector of [2 x double] containing the multiplier in the low
299 /// A 128-bit vector of [2 x double] containing the addend in the low
301 /// \returns A 128-bit vector of [2 x double] containing the result in the low
302 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
303 static __inline__ __m128d __DEFAULT_FN_ATTRS128
304 _mm_fnmadd_sd(__m128d __A
, __m128d __B
, __m128d __C
)
306 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, -(__v2df
)__B
, (__v2df
)__C
);
309 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
310 /// For each element, computes <c> -(__A * __B) - __C </c>.
312 /// \headerfile <immintrin.h>
314 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
317 /// A 128-bit vector of [4 x float] containing the multiplicand.
319 /// A 128-bit vector of [4 x float] containing the multiplier.
321 /// A 128-bit vector of [4 x float] containing the subtrahend.
322 /// \returns A 128-bit vector of [4 x float] containing the result.
323 static __inline__ __m128 __DEFAULT_FN_ATTRS128
324 _mm_fnmsub_ps(__m128 __A
, __m128 __B
, __m128 __C
)
326 return (__m128
)__builtin_ia32_vfmaddps(-(__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
329 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
330 /// For each element, computes <c> -(__A * __B) - __C </c>.
332 /// \headerfile <immintrin.h>
334 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
337 /// A 128-bit vector of [2 x double] containing the multiplicand.
339 /// A 128-bit vector of [2 x double] containing the multiplier.
341 /// A 128-bit vector of [2 x double] containing the subtrahend.
342 /// \returns A 128-bit vector of [2 x double] containing the result.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS128
344 _mm_fnmsub_pd(__m128d __A
, __m128d __B
, __m128d __C
)
346 return (__m128d
)__builtin_ia32_vfmaddpd(-(__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
349 /// Computes a scalar negated multiply-subtract of the single-precision
350 /// values in the low 32 bits of 128-bit vectors of [4 x float].
352 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
353 /// result[127:32] = __A[127:32]
356 /// \headerfile <immintrin.h>
358 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
361 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
364 /// A 128-bit vector of [4 x float] containing the multiplier in the low
367 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
369 /// \returns A 128-bit vector of [4 x float] containing the result in the low
370 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
371 static __inline__ __m128 __DEFAULT_FN_ATTRS128
372 _mm_fnmsub_ss(__m128 __A
, __m128 __B
, __m128 __C
)
374 return (__m128
)__builtin_ia32_vfmaddss3((__v4sf
)__A
, -(__v4sf
)__B
, -(__v4sf
)__C
);
377 /// Computes a scalar negated multiply-subtract of the double-precision
378 /// values in the low 64 bits of 128-bit vectors of [2 x double].
380 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
381 /// result[127:64] = __A[127:64]
384 /// \headerfile <immintrin.h>
386 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
389 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
392 /// A 128-bit vector of [2 x double] containing the multiplier in the low
395 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
397 /// \returns A 128-bit vector of [2 x double] containing the result in the low
398 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
399 static __inline__ __m128d __DEFAULT_FN_ATTRS128
400 _mm_fnmsub_sd(__m128d __A
, __m128d __B
, __m128d __C
)
402 return (__m128d
)__builtin_ia32_vfmaddsd3((__v2df
)__A
, -(__v2df
)__B
, -(__v2df
)__C
);
405 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
408 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
409 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
410 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
411 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
414 /// \headerfile <immintrin.h>
416 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
419 /// A 128-bit vector of [4 x float] containing the multiplicand.
421 /// A 128-bit vector of [4 x float] containing the multiplier.
423 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
424 /// \returns A 128-bit vector of [4 x float] containing the result.
425 static __inline__ __m128 __DEFAULT_FN_ATTRS128
426 _mm_fmaddsub_ps(__m128 __A
, __m128 __B
, __m128 __C
)
428 return (__m128
)__builtin_ia32_vfmaddsubps((__v4sf
)__A
, (__v4sf
)__B
, (__v4sf
)__C
);
431 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
434 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
435 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
438 /// \headerfile <immintrin.h>
440 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
443 /// A 128-bit vector of [2 x double] containing the multiplicand.
445 /// A 128-bit vector of [2 x double] containing the multiplier.
447 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
448 /// \returns A 128-bit vector of [2 x double] containing the result.
449 static __inline__ __m128d __DEFAULT_FN_ATTRS128
450 _mm_fmaddsub_pd(__m128d __A
, __m128d __B
, __m128d __C
)
452 return (__m128d
)__builtin_ia32_vfmaddsubpd((__v2df
)__A
, (__v2df
)__B
, (__v2df
)__C
);
455 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
458 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
459 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
460 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
461 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
464 /// \headerfile <immintrin.h>
466 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
469 /// A 128-bit vector of [4 x float] containing the multiplicand.
471 /// A 128-bit vector of [4 x float] containing the multiplier.
473 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
474 /// \returns A 128-bit vector of [4 x float] containing the result.
475 static __inline__ __m128 __DEFAULT_FN_ATTRS128
476 _mm_fmsubadd_ps(__m128 __A
, __m128 __B
, __m128 __C
)
478 return (__m128
)__builtin_ia32_vfmaddsubps((__v4sf
)__A
, (__v4sf
)__B
, -(__v4sf
)__C
);
481 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
484 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
485 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
488 /// \headerfile <immintrin.h>
490 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
493 /// A 128-bit vector of [2 x double] containing the multiplicand.
495 /// A 128-bit vector of [2 x double] containing the multiplier.
497 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
498 /// \returns A 128-bit vector of [2 x double] containing the result.
499 static __inline__ __m128d __DEFAULT_FN_ATTRS128
500 _mm_fmsubadd_pd(__m128d __A
, __m128d __B
, __m128d __C
)
502 return (__m128d
)__builtin_ia32_vfmaddsubpd((__v2df
)__A
, (__v2df
)__B
, -(__v2df
)__C
);
505 /// Computes a multiply-add of 256-bit vectors of [8 x float].
506 /// For each element, computes <c> (__A * __B) + __C </c>.
508 /// \headerfile <immintrin.h>
510 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
513 /// A 256-bit vector of [8 x float] containing the multiplicand.
515 /// A 256-bit vector of [8 x float] containing the multiplier.
517 /// A 256-bit vector of [8 x float] containing the addend.
518 /// \returns A 256-bit vector of [8 x float] containing the result.
519 static __inline__ __m256 __DEFAULT_FN_ATTRS256
520 _mm256_fmadd_ps(__m256 __A
, __m256 __B
, __m256 __C
)
522 return (__m256
)__builtin_ia32_vfmaddps256((__v8sf
)__A
, (__v8sf
)__B
, (__v8sf
)__C
);
525 /// Computes a multiply-add of 256-bit vectors of [4 x double].
526 /// For each element, computes <c> (__A * __B) + __C </c>.
528 /// \headerfile <immintrin.h>
530 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
533 /// A 256-bit vector of [4 x double] containing the multiplicand.
535 /// A 256-bit vector of [4 x double] containing the multiplier.
537 /// A 256-bit vector of [4 x double] containing the addend.
538 /// \returns A 256-bit vector of [4 x double] containing the result.
539 static __inline__ __m256d __DEFAULT_FN_ATTRS256
540 _mm256_fmadd_pd(__m256d __A
, __m256d __B
, __m256d __C
)
542 return (__m256d
)__builtin_ia32_vfmaddpd256((__v4df
)__A
, (__v4df
)__B
, (__v4df
)__C
);
545 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
546 /// For each element, computes <c> (__A * __B) - __C </c>.
548 /// \headerfile <immintrin.h>
550 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
553 /// A 256-bit vector of [8 x float] containing the multiplicand.
555 /// A 256-bit vector of [8 x float] containing the multiplier.
557 /// A 256-bit vector of [8 x float] containing the subtrahend.
558 /// \returns A 256-bit vector of [8 x float] containing the result.
559 static __inline__ __m256 __DEFAULT_FN_ATTRS256
560 _mm256_fmsub_ps(__m256 __A
, __m256 __B
, __m256 __C
)
562 return (__m256
)__builtin_ia32_vfmaddps256((__v8sf
)__A
, (__v8sf
)__B
, -(__v8sf
)__C
);
565 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
566 /// For each element, computes <c> (__A * __B) - __C </c>.
568 /// \headerfile <immintrin.h>
570 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
573 /// A 256-bit vector of [4 x double] containing the multiplicand.
575 /// A 256-bit vector of [4 x double] containing the multiplier.
577 /// A 256-bit vector of [4 x double] containing the subtrahend.
578 /// \returns A 256-bit vector of [4 x double] containing the result.
579 static __inline__ __m256d __DEFAULT_FN_ATTRS256
580 _mm256_fmsub_pd(__m256d __A
, __m256d __B
, __m256d __C
)
582 return (__m256d
)__builtin_ia32_vfmaddpd256((__v4df
)__A
, (__v4df
)__B
, -(__v4df
)__C
);
585 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
586 /// For each element, computes <c> -(__A * __B) + __C </c>.
588 /// \headerfile <immintrin.h>
590 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
593 /// A 256-bit vector of [8 x float] containing the multiplicand.
595 /// A 256-bit vector of [8 x float] containing the multiplier.
597 /// A 256-bit vector of [8 x float] containing the addend.
598 /// \returns A 256-bit vector of [8 x float] containing the result.
599 static __inline__ __m256 __DEFAULT_FN_ATTRS256
600 _mm256_fnmadd_ps(__m256 __A
, __m256 __B
, __m256 __C
)
602 return (__m256
)__builtin_ia32_vfmaddps256(-(__v8sf
)__A
, (__v8sf
)__B
, (__v8sf
)__C
);
605 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
606 /// For each element, computes <c> -(__A * __B) + __C </c>.
608 /// \headerfile <immintrin.h>
610 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
613 /// A 256-bit vector of [4 x double] containing the multiplicand.
615 /// A 256-bit vector of [4 x double] containing the multiplier.
617 /// A 256-bit vector of [4 x double] containing the addend.
618 /// \returns A 256-bit vector of [4 x double] containing the result.
619 static __inline__ __m256d __DEFAULT_FN_ATTRS256
620 _mm256_fnmadd_pd(__m256d __A
, __m256d __B
, __m256d __C
)
622 return (__m256d
)__builtin_ia32_vfmaddpd256(-(__v4df
)__A
, (__v4df
)__B
, (__v4df
)__C
);
625 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
626 /// For each element, computes <c> -(__A * __B) - __C </c>.
628 /// \headerfile <immintrin.h>
630 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
633 /// A 256-bit vector of [8 x float] containing the multiplicand.
635 /// A 256-bit vector of [8 x float] containing the multiplier.
637 /// A 256-bit vector of [8 x float] containing the subtrahend.
638 /// \returns A 256-bit vector of [8 x float] containing the result.
639 static __inline__ __m256 __DEFAULT_FN_ATTRS256
640 _mm256_fnmsub_ps(__m256 __A
, __m256 __B
, __m256 __C
)
642 return (__m256
)__builtin_ia32_vfmaddps256(-(__v8sf
)__A
, (__v8sf
)__B
, -(__v8sf
)__C
);
645 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
646 /// For each element, computes <c> -(__A * __B) - __C </c>.
648 /// \headerfile <immintrin.h>
650 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
653 /// A 256-bit vector of [4 x double] containing the multiplicand.
655 /// A 256-bit vector of [4 x double] containing the multiplier.
657 /// A 256-bit vector of [4 x double] containing the subtrahend.
658 /// \returns A 256-bit vector of [4 x double] containing the result.
659 static __inline__ __m256d __DEFAULT_FN_ATTRS256
660 _mm256_fnmsub_pd(__m256d __A
, __m256d __B
, __m256d __C
)
662 return (__m256d
)__builtin_ia32_vfmaddpd256(-(__v4df
)__A
, (__v4df
)__B
, -(__v4df
)__C
);
665 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
668 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
669 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
670 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
671 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
672 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
673 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
674 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
675 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
678 /// \headerfile <immintrin.h>
680 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
683 /// A 256-bit vector of [8 x float] containing the multiplicand.
685 /// A 256-bit vector of [8 x float] containing the multiplier.
687 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
688 /// \returns A 256-bit vector of [8 x float] containing the result.
689 static __inline__ __m256 __DEFAULT_FN_ATTRS256
690 _mm256_fmaddsub_ps(__m256 __A
, __m256 __B
, __m256 __C
)
692 return (__m256
)__builtin_ia32_vfmaddsubps256((__v8sf
)__A
, (__v8sf
)__B
, (__v8sf
)__C
);
695 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
698 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
699 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
700 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
701 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
704 /// \headerfile <immintrin.h>
706 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
709 /// A 256-bit vector of [4 x double] containing the multiplicand.
711 /// A 256-bit vector of [4 x double] containing the multiplier.
713 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
714 /// \returns A 256-bit vector of [4 x double] containing the result.
715 static __inline__ __m256d __DEFAULT_FN_ATTRS256
716 _mm256_fmaddsub_pd(__m256d __A
, __m256d __B
, __m256d __C
)
718 return (__m256d
)__builtin_ia32_vfmaddsubpd256((__v4df
)__A
, (__v4df
)__B
, (__v4df
)__C
);
721 /// Computes a vector multiply with alternating add/subtract of 256-bit
722 /// vectors of [8 x float].
724 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
725 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
726 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
727 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
728 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
729 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
730 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
731 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
734 /// \headerfile <immintrin.h>
736 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
739 /// A 256-bit vector of [8 x float] containing the multiplicand.
741 /// A 256-bit vector of [8 x float] containing the multiplier.
743 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
744 /// \returns A 256-bit vector of [8 x float] containing the result.
745 static __inline__ __m256 __DEFAULT_FN_ATTRS256
746 _mm256_fmsubadd_ps(__m256 __A
, __m256 __B
, __m256 __C
)
748 return (__m256
)__builtin_ia32_vfmaddsubps256((__v8sf
)__A
, (__v8sf
)__B
, -(__v8sf
)__C
);
751 /// Computes a vector multiply with alternating add/subtract of 256-bit
752 /// vectors of [4 x double].
754 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
755 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
756 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
757 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
760 /// \headerfile <immintrin.h>
762 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
765 /// A 256-bit vector of [4 x double] containing the multiplicand.
767 /// A 256-bit vector of [4 x double] containing the multiplier.
769 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
770 /// \returns A 256-bit vector of [4 x double] containing the result.
771 static __inline__ __m256d __DEFAULT_FN_ATTRS256
772 _mm256_fmsubadd_pd(__m256d __A
, __m256d __B
, __m256d __C
)
774 return (__m256d
)__builtin_ia32_vfmaddsubpd256((__v4df
)__A
, (__v4df
)__B
, -(__v4df
)__C
);
777 #undef __DEFAULT_FN_ATTRS128
778 #undef __DEFAULT_FN_ATTRS256
780 #endif /* __FMAINTRIN_H */