[AMDGPU][AsmParser][NFC] Get rid of custom default operand handlers.
[llvm-project.git] / clang / lib / Headers / fmaintrin.h
blobea832fac4f99226475d5b290e1c10ef6aa592b64
1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __IMMINTRIN_H
11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
12 #endif
14 #ifndef __FMAINTRIN_H
15 #define __FMAINTRIN_H
17 /* Define the default attributes for the functions in this file. */
/* Both variants apply __always_inline__, __nodebug__ and __target__("fma");
 * they differ only in __min_vector_width__ (128 for the __m128/__m128d
 * functions, 256 for the __m256/__m256d functions). */
18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
22 /// For each element, computes <c> (__A * __B) + __C </c>.
23 ///
24 /// \headerfile <immintrin.h>
25 ///
26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
27 ///
28 /// \param __A
29 /// A 128-bit vector of [4 x float] containing the multiplicand.
30 /// \param __B
31 /// A 128-bit vector of [4 x float] containing the multiplier.
32 /// \param __C
33 /// A 128-bit vector of [4 x float] containing the addend.
34 /// \returns A 128-bit vector of [4 x float] containing the result.
35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
  /* All three operands are forwarded unchanged (cast to __v4sf) to the packed
   * multiply-add builtin: (__A * __B) + __C per element. */
38 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
42 /// For each element, computes <c> (__A * __B) + __C </c>.
43 ///
44 /// \headerfile <immintrin.h>
45 ///
46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
47 ///
48 /// \param __A
49 /// A 128-bit vector of [2 x double] containing the multiplicand.
50 /// \param __B
51 /// A 128-bit vector of [2 x double] containing the multiplier.
52 /// \param __C
53 /// A 128-bit vector of [2 x double] containing the addend.
54 /// \returns A 128-bit vector of [2 x double] containing the result.
55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
  /* All three operands are forwarded unchanged (cast to __v2df) to the packed
   * multiply-add builtin: (__A * __B) + __C per element. */
58 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
61 /// Computes a scalar multiply-add of the single-precision values in the
62 /// low 32 bits of 128-bit vectors of [4 x float].
63 /// \code
64 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
65 /// result[127:32] = __A[127:32]
66 /// \endcode
67 ///
68 /// \headerfile <immintrin.h>
69 ///
70 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
71 ///
72 /// \param __A
73 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
74 /// 32 bits.
75 /// \param __B
76 /// A 128-bit vector of [4 x float] containing the multiplier in the low
77 /// 32 bits.
78 /// \param __C
79 /// A 128-bit vector of [4 x float] containing the addend in the low
80 /// 32 bits.
81 /// \returns A 128-bit vector of [4 x float] containing the result in the low
82 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
83 static __inline__ __m128 __DEFAULT_FN_ATTRS128
84 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
  /* Scalar variant: operands are forwarded unchanged to the "ss3" builtin.
   * Per the description above, only bits [31:0] are computed and bits
   * [127:32] of the result come from __A. */
86 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
89 /// Computes a scalar multiply-add of the double-precision values in the
90 /// low 64 bits of 128-bit vectors of [2 x double].
91 /// \code
92 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
93 /// result[127:64] = __A[127:64]
94 /// \endcode
95 ///
96 /// \headerfile <immintrin.h>
97 ///
98 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
99 ///
100 /// \param __A
101 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
102 /// 64 bits.
103 /// \param __B
104 /// A 128-bit vector of [2 x double] containing the multiplier in the low
105 /// 64 bits.
106 /// \param __C
107 /// A 128-bit vector of [2 x double] containing the addend in the low
108 /// 64 bits.
109 /// \returns A 128-bit vector of [2 x double] containing the result in the low
110 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
111 static __inline__ __m128d __DEFAULT_FN_ATTRS128
112 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
  /* Scalar variant: operands are forwarded unchanged to the "sd3" builtin.
   * Per the description above, only bits [63:0] are computed and bits
   * [127:64] of the result come from __A. */
114 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
117 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
118 /// For each element, computes <c> (__A * __B) - __C </c>.
120 /// \headerfile <immintrin.h>
122 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
124 /// \param __A
125 /// A 128-bit vector of [4 x float] containing the multiplicand.
126 /// \param __B
127 /// A 128-bit vector of [4 x float] containing the multiplier.
128 /// \param __C
129 /// A 128-bit vector of [4 x float] containing the subtrahend.
130 /// \returns A 128-bit vector of [4 x float] containing the result.
131 static __inline__ __m128 __DEFAULT_FN_ATTRS128
132 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
  /* There is no separate subtract builtin here: the subtraction is expressed
   * as a multiply-add with the third operand negated, (__A * __B) + (-__C). */
134 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
137 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
138 /// For each element, computes <c> (__A * __B) - __C </c>.
140 /// \headerfile <immintrin.h>
142 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
144 /// \param __A
145 /// A 128-bit vector of [2 x double] containing the multiplicand.
146 /// \param __B
147 /// A 128-bit vector of [2 x double] containing the multiplier.
148 /// \param __C
149 /// A 128-bit vector of [2 x double] containing the subtrahend.
150 /// \returns A 128-bit vector of [2 x double] containing the result.
151 static __inline__ __m128d __DEFAULT_FN_ATTRS128
152 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
  /* The subtraction is expressed as a multiply-add with the third operand
   * negated: (__A * __B) + (-__C). */
154 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
157 /// Computes a scalar multiply-subtract of the single-precision values in
158 /// the low 32 bits of 128-bit vectors of [4 x float].
159 /// \code
160 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
161 /// result[127:32] = __A[127:32]
162 /// \endcode
164 /// \headerfile <immintrin.h>
166 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
168 /// \param __A
169 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
170 /// 32 bits.
171 /// \param __B
172 /// A 128-bit vector of [4 x float] containing the multiplier in the low
173 /// 32 bits.
174 /// \param __C
175 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
176 /// 32 bits.
177 /// \returns A 128-bit vector of [4 x float] containing the result in the low
178 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
179 static __inline__ __m128 __DEFAULT_FN_ATTRS128
180 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
  /* Scalar multiply-subtract via the scalar multiply-add builtin with __C
   * negated. __A is passed unmodified; the documented result copies
   * __A[127:32] into its upper bits. */
182 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
185 /// Computes a scalar multiply-subtract of the double-precision values in
186 /// the low 64 bits of 128-bit vectors of [2 x double].
187 /// \code
188 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
189 /// result[127:64] = __A[127:64]
190 /// \endcode
192 /// \headerfile <immintrin.h>
194 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
196 /// \param __A
197 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
198 /// 64 bits.
199 /// \param __B
200 /// A 128-bit vector of [2 x double] containing the multiplier in the low
201 /// 64 bits.
202 /// \param __C
203 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
204 /// 64 bits.
205 /// \returns A 128-bit vector of [2 x double] containing the result in the low
206 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
207 static __inline__ __m128d __DEFAULT_FN_ATTRS128
208 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
  /* Scalar multiply-subtract via the scalar multiply-add builtin with __C
   * negated. __A is passed unmodified; the documented result copies
   * __A[127:64] into its upper bits. */
210 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
213 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
214 /// For each element, computes <c> -(__A * __B) + __C </c>.
216 /// \headerfile <immintrin.h>
218 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
220 /// \param __A
221 /// A 128-bit vector of [4 x float] containing the multiplicand.
222 /// \param __B
223 /// A 128-bit vector of [4 x float] containing the multiplier.
224 /// \param __C
225 /// A 128-bit vector of [4 x float] containing the addend.
226 /// \returns A 128-bit vector of [4 x float] containing the result.
227 static __inline__ __m128 __DEFAULT_FN_ATTRS128
228 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
  /* The product is negated by negating the first multiplication operand:
   * ((-__A) * __B) + __C, i.e. -(__A * __B) + __C. */
230 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
233 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
234 /// For each element, computes <c> -(__A * __B) + __C </c>.
236 /// \headerfile <immintrin.h>
238 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
240 /// \param __A
241 /// A 128-bit vector of [2 x double] containing the multiplicand.
242 /// \param __B
243 /// A 128-bit vector of [2 x double] containing the multiplier.
244 /// \param __C
245 /// A 128-bit vector of [2 x double] containing the addend.
246 /// \returns A 128-bit vector of [2 x double] containing the result.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS128
248 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
  /* The product is negated by negating the first multiplication operand:
   * ((-__A) * __B) + __C, i.e. -(__A * __B) + __C. */
250 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
253 /// Computes a scalar negated multiply-add of the single-precision values in
254 /// the low 32 bits of 128-bit vectors of [4 x float].
255 /// \code
256 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
257 /// result[127:32] = __A[127:32]
258 /// \endcode
260 /// \headerfile <immintrin.h>
262 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
264 /// \param __A
265 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
266 /// 32 bits.
267 /// \param __B
268 /// A 128-bit vector of [4 x float] containing the multiplier in the low
269 /// 32 bits.
270 /// \param __C
271 /// A 128-bit vector of [4 x float] containing the addend in the low
272 /// 32 bits.
273 /// \returns A 128-bit vector of [4 x float] containing the result in the low
274 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
275 static __inline__ __m128 __DEFAULT_FN_ATTRS128
276 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
  /* Unlike the packed form, the negation is applied to __B and __A is left
   * untouched: the documented result copies __A[127:32] verbatim, which
   * negating __A would presumably disturb. */
278 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
281 /// Computes a scalar negated multiply-add of the double-precision values
282 /// in the low 64 bits of 128-bit vectors of [2 x double].
283 /// \code
284 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
285 /// result[127:64] = __A[127:64]
286 /// \endcode
288 /// \headerfile <immintrin.h>
290 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
292 /// \param __A
293 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
294 /// 64 bits.
295 /// \param __B
296 /// A 128-bit vector of [2 x double] containing the multiplier in the low
297 /// 64 bits.
298 /// \param __C
299 /// A 128-bit vector of [2 x double] containing the addend in the low
300 /// 64 bits.
301 /// \returns A 128-bit vector of [2 x double] containing the result in the low
302 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
303 static __inline__ __m128d __DEFAULT_FN_ATTRS128
304 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
  /* Unlike the packed form, the negation is applied to __B and __A is left
   * untouched: the documented result copies __A[127:64] verbatim, which
   * negating __A would presumably disturb. */
306 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
309 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
310 /// For each element, computes <c> -(__A * __B) - __C </c>.
312 /// \headerfile <immintrin.h>
314 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
316 /// \param __A
317 /// A 128-bit vector of [4 x float] containing the multiplicand.
318 /// \param __B
319 /// A 128-bit vector of [4 x float] containing the multiplier.
320 /// \param __C
321 /// A 128-bit vector of [4 x float] containing the subtrahend.
322 /// \returns A 128-bit vector of [4 x float] containing the result.
323 static __inline__ __m128 __DEFAULT_FN_ATTRS128
324 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
  /* Both the product and the third operand are negated:
   * ((-__A) * __B) + (-__C), i.e. -(__A * __B) - __C. */
326 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
329 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
330 /// For each element, computes <c> -(__A * __B) - __C </c>.
332 /// \headerfile <immintrin.h>
334 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
336 /// \param __A
337 /// A 128-bit vector of [2 x double] containing the multiplicand.
338 /// \param __B
339 /// A 128-bit vector of [2 x double] containing the multiplier.
340 /// \param __C
341 /// A 128-bit vector of [2 x double] containing the subtrahend.
342 /// \returns A 128-bit vector of [2 x double] containing the result.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS128
344 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
  /* Both the product and the third operand are negated:
   * ((-__A) * __B) + (-__C), i.e. -(__A * __B) - __C. */
346 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
349 /// Computes a scalar negated multiply-subtract of the single-precision
350 /// values in the low 32 bits of 128-bit vectors of [4 x float].
351 /// \code
352 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
353 /// result[127:32] = __A[127:32]
354 /// \endcode
356 /// \headerfile <immintrin.h>
358 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
360 /// \param __A
361 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
362 /// 32 bits.
363 /// \param __B
364 /// A 128-bit vector of [4 x float] containing the multiplier in the low
365 /// 32 bits.
366 /// \param __C
367 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
368 /// 32 bits.
369 /// \returns A 128-bit vector of [4 x float] containing the result in the low
370 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
371 static __inline__ __m128 __DEFAULT_FN_ATTRS128
372 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
  /* (__A * (-__B)) + (-__C). The product's negation is applied to __B, not
   * __A: the documented result copies __A[127:32] verbatim, which negating
   * __A would presumably disturb. */
374 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
377 /// Computes a scalar negated multiply-subtract of the double-precision
378 /// values in the low 64 bits of 128-bit vectors of [2 x double].
379 /// \code
380 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
381 /// result[127:64] = __A[127:64]
382 /// \endcode
384 /// \headerfile <immintrin.h>
386 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
388 /// \param __A
389 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
390 /// 64 bits.
391 /// \param __B
392 /// A 128-bit vector of [2 x double] containing the multiplier in the low
393 /// 64 bits.
394 /// \param __C
395 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
396 /// 64 bits.
397 /// \returns A 128-bit vector of [2 x double] containing the result in the low
398 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
399 static __inline__ __m128d __DEFAULT_FN_ATTRS128
400 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
  /* (__A * (-__B)) + (-__C). The product's negation is applied to __B, not
   * __A: the documented result copies __A[127:64] verbatim, which negating
   * __A would presumably disturb. */
402 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
405 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
406 /// [4 x float].
407 /// \code
408 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
409 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
410 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
411 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
412 /// \endcode
414 /// \headerfile <immintrin.h>
416 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
418 /// \param __A
419 /// A 128-bit vector of [4 x float] containing the multiplicand.
420 /// \param __B
421 /// A 128-bit vector of [4 x float] containing the multiplier.
422 /// \param __C
423 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
424 /// \returns A 128-bit vector of [4 x float] containing the result.
425 static __inline__ __m128 __DEFAULT_FN_ATTRS128
426 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
  /* Operands are forwarded unchanged; the per-element alternation (subtract
   * in element 0, add in element 1, ...) is done by the add/sub builtin. */
428 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
431 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
432 /// [2 x double].
433 /// \code
434 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
435 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
436 /// \endcode
438 /// \headerfile <immintrin.h>
440 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
442 /// \param __A
443 /// A 128-bit vector of [2 x double] containing the multiplicand.
444 /// \param __B
445 /// A 128-bit vector of [2 x double] containing the multiplier.
446 /// \param __C
447 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
448 /// \returns A 128-bit vector of [2 x double] containing the result.
449 static __inline__ __m128d __DEFAULT_FN_ATTRS128
450 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
  /* Operands are forwarded unchanged; the per-element alternation (subtract
   * in element 0, add in element 1) is done by the add/sub builtin. */
452 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
455 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
456 /// [4 x float].
457 /// \code
458 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
459 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
460 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
461 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
462 /// \endcode
464 /// \headerfile <immintrin.h>
466 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
468 /// \param __A
469 /// A 128-bit vector of [4 x float] containing the multiplicand.
470 /// \param __B
471 /// A 128-bit vector of [4 x float] containing the multiplier.
472 /// \param __C
473 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
474 /// \returns A 128-bit vector of [4 x float] containing the result.
475 static __inline__ __m128 __DEFAULT_FN_ATTRS128
476 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
  /* Reuses the add/sub builtin with __C negated. Negating __C swaps which
   * elements effectively add and which subtract, giving add in element 0,
   * subtract in element 1, and so on. */
478 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
481 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
482 /// [2 x double].
483 /// \code
484 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
485 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
486 /// \endcode
488 /// \headerfile <immintrin.h>
490 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
492 /// \param __A
493 /// A 128-bit vector of [2 x double] containing the multiplicand.
494 /// \param __B
495 /// A 128-bit vector of [2 x double] containing the multiplier.
496 /// \param __C
497 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
498 /// \returns A 128-bit vector of [2 x double] containing the result.
499 static __inline__ __m128d __DEFAULT_FN_ATTRS128
500 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
  /* Reuses the add/sub builtin with __C negated. Negating __C swaps which
   * elements effectively add and which subtract, giving add in element 0
   * and subtract in element 1. */
502 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
505 /// Computes a multiply-add of 256-bit vectors of [8 x float].
506 /// For each element, computes <c> (__A * __B) + __C </c>.
508 /// \headerfile <immintrin.h>
510 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
512 /// \param __A
513 /// A 256-bit vector of [8 x float] containing the multiplicand.
514 /// \param __B
515 /// A 256-bit vector of [8 x float] containing the multiplier.
516 /// \param __C
517 /// A 256-bit vector of [8 x float] containing the addend.
518 /// \returns A 256-bit vector of [8 x float] containing the result.
519 static __inline__ __m256 __DEFAULT_FN_ATTRS256
520 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
  /* All three operands are forwarded unchanged (cast to __v8sf) to the 256-bit
   * packed multiply-add builtin: (__A * __B) + __C per element. */
522 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
525 /// Computes a multiply-add of 256-bit vectors of [4 x double].
526 /// For each element, computes <c> (__A * __B) + __C </c>.
528 /// \headerfile <immintrin.h>
530 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
532 /// \param __A
533 /// A 256-bit vector of [4 x double] containing the multiplicand.
534 /// \param __B
535 /// A 256-bit vector of [4 x double] containing the multiplier.
536 /// \param __C
537 /// A 256-bit vector of [4 x double] containing the addend.
538 /// \returns A 256-bit vector of [4 x double] containing the result.
539 static __inline__ __m256d __DEFAULT_FN_ATTRS256
540 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
  /* All three operands are forwarded unchanged (cast to __v4df) to the 256-bit
   * packed multiply-add builtin: (__A * __B) + __C per element. */
542 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
545 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
546 /// For each element, computes <c> (__A * __B) - __C </c>.
548 /// \headerfile <immintrin.h>
550 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
552 /// \param __A
553 /// A 256-bit vector of [8 x float] containing the multiplicand.
554 /// \param __B
555 /// A 256-bit vector of [8 x float] containing the multiplier.
556 /// \param __C
557 /// A 256-bit vector of [8 x float] containing the subtrahend.
558 /// \returns A 256-bit vector of [8 x float] containing the result.
559 static __inline__ __m256 __DEFAULT_FN_ATTRS256
560 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
  /* The subtraction is expressed as a multiply-add with the third operand
   * negated: (__A * __B) + (-__C). */
562 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
565 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
566 /// For each element, computes <c> (__A * __B) - __C </c>.
568 /// \headerfile <immintrin.h>
570 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
572 /// \param __A
573 /// A 256-bit vector of [4 x double] containing the multiplicand.
574 /// \param __B
575 /// A 256-bit vector of [4 x double] containing the multiplier.
576 /// \param __C
577 /// A 256-bit vector of [4 x double] containing the subtrahend.
578 /// \returns A 256-bit vector of [4 x double] containing the result.
579 static __inline__ __m256d __DEFAULT_FN_ATTRS256
580 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
  /* The subtraction is expressed as a multiply-add with the third operand
   * negated: (__A * __B) + (-__C). */
582 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
585 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
586 /// For each element, computes <c> -(__A * __B) + __C </c>.
588 /// \headerfile <immintrin.h>
590 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
592 /// \param __A
593 /// A 256-bit vector of [8 x float] containing the multiplicand.
594 /// \param __B
595 /// A 256-bit vector of [8 x float] containing the multiplier.
596 /// \param __C
597 /// A 256-bit vector of [8 x float] containing the addend.
598 /// \returns A 256-bit vector of [8 x float] containing the result.
599 static __inline__ __m256 __DEFAULT_FN_ATTRS256
600 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
  /* The product is negated by negating the first multiplication operand:
   * ((-__A) * __B) + __C, i.e. -(__A * __B) + __C. */
602 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
605 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
606 /// For each element, computes <c> -(__A * __B) + __C </c>.
608 /// \headerfile <immintrin.h>
610 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
612 /// \param __A
613 /// A 256-bit vector of [4 x double] containing the multiplicand.
614 /// \param __B
615 /// A 256-bit vector of [4 x double] containing the multiplier.
616 /// \param __C
617 /// A 256-bit vector of [4 x double] containing the addend.
618 /// \returns A 256-bit vector of [4 x double] containing the result.
619 static __inline__ __m256d __DEFAULT_FN_ATTRS256
620 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
  /* The product is negated by negating the first multiplication operand:
   * ((-__A) * __B) + __C, i.e. -(__A * __B) + __C. */
622 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
625 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
626 /// For each element, computes <c> -(__A * __B) - __C </c>.
628 /// \headerfile <immintrin.h>
630 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
632 /// \param __A
633 /// A 256-bit vector of [8 x float] containing the multiplicand.
634 /// \param __B
635 /// A 256-bit vector of [8 x float] containing the multiplier.
636 /// \param __C
637 /// A 256-bit vector of [8 x float] containing the subtrahend.
638 /// \returns A 256-bit vector of [8 x float] containing the result.
639 static __inline__ __m256 __DEFAULT_FN_ATTRS256
640 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
  /* Both the product and the third operand are negated:
   * ((-__A) * __B) + (-__C), i.e. -(__A * __B) - __C. */
642 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
645 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
646 /// For each element, computes <c> -(__A * __B) - __C </c>.
648 /// \headerfile <immintrin.h>
650 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
652 /// \param __A
653 /// A 256-bit vector of [4 x double] containing the multiplicand.
654 /// \param __B
655 /// A 256-bit vector of [4 x double] containing the multiplier.
656 /// \param __C
657 /// A 256-bit vector of [4 x double] containing the subtrahend.
658 /// \returns A 256-bit vector of [4 x double] containing the result.
659 static __inline__ __m256d __DEFAULT_FN_ATTRS256
660 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
  /* Both the product and the third operand are negated:
   * ((-__A) * __B) + (-__C), i.e. -(__A * __B) - __C. */
662 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
665 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
666 /// [8 x float].
667 /// \code
668 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
669 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
670 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
671 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
672 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
673 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
674 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
675 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
676 /// \endcode
678 /// \headerfile <immintrin.h>
680 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
682 /// \param __A
683 /// A 256-bit vector of [8 x float] containing the multiplicand.
684 /// \param __B
685 /// A 256-bit vector of [8 x float] containing the multiplier.
686 /// \param __C
687 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
688 /// \returns A 256-bit vector of [8 x float] containing the result.
689 static __inline__ __m256 __DEFAULT_FN_ATTRS256
690 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
  /* Operands are forwarded unchanged; the per-element alternation (subtract
   * in element 0, add in element 1, ...) is done by the add/sub builtin. */
692 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
695 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
696 /// [4 x double].
697 /// \code
698 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
699 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
700 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
701 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
702 /// \endcode
704 /// \headerfile <immintrin.h>
706 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
708 /// \param __A
709 /// A 256-bit vector of [4 x double] containing the multiplicand.
710 /// \param __B
711 /// A 256-bit vector of [4 x double] containing the multiplier.
712 /// \param __C
713 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
714 /// \returns A 256-bit vector of [4 x double] containing the result.
715 static __inline__ __m256d __DEFAULT_FN_ATTRS256
716 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
  /* Operands are forwarded unchanged; the per-element alternation (subtract
   * in element 0, add in element 1, ...) is done by the add/sub builtin. */
718 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
721 /// Computes a vector multiply with alternating add/subtract of 256-bit
722 /// vectors of [8 x float].
723 /// \code
724 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
725 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
726 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
727 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
728 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
729 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
730 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
731 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
732 /// \endcode
734 /// \headerfile <immintrin.h>
736 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
738 /// \param __A
739 /// A 256-bit vector of [8 x float] containing the multiplicand.
740 /// \param __B
741 /// A 256-bit vector of [8 x float] containing the multiplier.
742 /// \param __C
743 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
744 /// \returns A 256-bit vector of [8 x float] containing the result.
745 static __inline__ __m256 __DEFAULT_FN_ATTRS256
746 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
  /* Reuses the add/sub builtin with __C negated. Negating __C swaps which
   * elements effectively add and which subtract, giving add in element 0,
   * subtract in element 1, and so on. */
748 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
751 /// Computes a vector multiply with alternating add/subtract of 256-bit
752 /// vectors of [4 x double].
753 /// \code
754 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
755 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
756 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
757 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
758 /// \endcode
760 /// \headerfile <immintrin.h>
762 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
764 /// \param __A
765 /// A 256-bit vector of [4 x double] containing the multiplicand.
766 /// \param __B
767 /// A 256-bit vector of [4 x double] containing the multiplier.
768 /// \param __C
769 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
770 /// \returns A 256-bit vector of [4 x double] containing the result.
771 static __inline__ __m256d __DEFAULT_FN_ATTRS256
772 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
  /* Reuses the add/sub builtin with __C negated. Negating __C swaps which
   * elements effectively add and which subtract, giving add in element 0,
   * subtract in element 1, and so on. */
774 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
/* The attribute macros are helpers for this header only; undefine them so
 * they are not visible after the header has been included. */
777 #undef __DEFAULT_FN_ATTRS128
778 #undef __DEFAULT_FN_ATTRS256
780 #endif /* __FMAINTRIN_H */