[mlir][tensor] fix typo in pad tiling comment
[llvm-project.git] / clang / lib / Headers / fmaintrin.h
blob22d1a780bbfd4eb2885640eeffae74d21b7f2b76
1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
8 */
10 #ifndef __IMMINTRIN_H
11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
12 #endif
14 #ifndef __FMAINTRIN_H
15 #define __FMAINTRIN_H
17 /* Define the default attributes for the functions in this file. */
18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
22 /// For each element, computes <c> (__A * __B) + __C </c>.
23 ///
24 /// \headerfile <immintrin.h>
25 ///
26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
27 ///
28 /// \param __A
29 /// A 128-bit vector of [4 x float] containing the multiplicand.
30 /// \param __B
31 /// A 128-bit vector of [4 x float] containing the multiplier.
32 /// \param __C
33 /// A 128-bit vector of [4 x float] containing the addend.
34 /// \returns A 128-bit vector of [4 x float] containing the result.
35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
38 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
42 /// For each element, computes <c> (__A * __B) + __C </c>.
43 ///
44 /// \headerfile <immintrin.h>
45 ///
46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
47 ///
48 /// \param __A
49 /// A 128-bit vector of [2 x double] containing the multiplicand.
50 /// \param __B
51 /// A 128-bit vector of [2 x double] containing the multiplier.
52 /// \param __C
53 /// A 128-bit vector of [2 x double] containing the addend.
54 /// \returns A 128-bit vector of [2 x double] containing the result.
55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
58 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
61 /// Computes a scalar multiply-add of the single-precision values in the
62 /// low 32 bits of 128-bit vectors of [4 x float].
63 ///
64 /// \code{.operation}
65 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
66 /// result[127:32] = __A[127:32]
67 /// \endcode
68 ///
69 /// \headerfile <immintrin.h>
70 ///
71 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
72 ///
73 /// \param __A
74 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
75 /// 32 bits.
76 /// \param __B
77 /// A 128-bit vector of [4 x float] containing the multiplier in the low
78 /// 32 bits.
79 /// \param __C
80 /// A 128-bit vector of [4 x float] containing the addend in the low
81 /// 32 bits.
82 /// \returns A 128-bit vector of [4 x float] containing the result in the low
83 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
84 static __inline__ __m128 __DEFAULT_FN_ATTRS128
85 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
87 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
90 /// Computes a scalar multiply-add of the double-precision values in the
91 /// low 64 bits of 128-bit vectors of [2 x double].
92 ///
93 /// \code{.operation}
94 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
95 /// result[127:64] = __A[127:64]
96 /// \endcode
97 ///
98 /// \headerfile <immintrin.h>
99 ///
100 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
102 /// \param __A
103 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
104 /// 64 bits.
105 /// \param __B
106 /// A 128-bit vector of [2 x double] containing the multiplier in the low
107 /// 64 bits.
108 /// \param __C
109 /// A 128-bit vector of [2 x double] containing the addend in the low
110 /// 64 bits.
111 /// \returns A 128-bit vector of [2 x double] containing the result in the low
112 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
113 static __inline__ __m128d __DEFAULT_FN_ATTRS128
114 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
116 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
119 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
120 /// For each element, computes <c> (__A * __B) - __C </c>.
122 /// \headerfile <immintrin.h>
124 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
126 /// \param __A
127 /// A 128-bit vector of [4 x float] containing the multiplicand.
128 /// \param __B
129 /// A 128-bit vector of [4 x float] containing the multiplier.
130 /// \param __C
131 /// A 128-bit vector of [4 x float] containing the subtrahend.
132 /// \returns A 128-bit vector of [4 x float] containing the result.
133 static __inline__ __m128 __DEFAULT_FN_ATTRS128
134 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
136 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
139 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
140 /// For each element, computes <c> (__A * __B) - __C </c>.
142 /// \headerfile <immintrin.h>
144 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
146 /// \param __A
147 /// A 128-bit vector of [2 x double] containing the multiplicand.
148 /// \param __B
149 /// A 128-bit vector of [2 x double] containing the multiplier.
150 /// \param __C
151 /// A 128-bit vector of [2 x double] containing the subtrahend.
152 /// \returns A 128-bit vector of [2 x double] containing the result.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS128
154 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
156 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
159 /// Computes a scalar multiply-subtract of the single-precision values in
160 /// the low 32 bits of 128-bit vectors of [4 x float].
162 /// \code{.operation}
163 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
164 /// result[127:32] = __A[127:32]
165 /// \endcode
167 /// \headerfile <immintrin.h>
169 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
171 /// \param __A
172 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
173 /// 32 bits.
174 /// \param __B
175 /// A 128-bit vector of [4 x float] containing the multiplier in the low
176 /// 32 bits.
177 /// \param __C
178 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
179 /// 32 bits.
180 /// \returns A 128-bit vector of [4 x float] containing the result in the low
181 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
182 static __inline__ __m128 __DEFAULT_FN_ATTRS128
183 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
185 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
188 /// Computes a scalar multiply-subtract of the double-precision values in
189 /// the low 64 bits of 128-bit vectors of [2 x double].
191 /// \code{.operation}
192 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
193 /// result[127:64] = __A[127:64]
194 /// \endcode
196 /// \headerfile <immintrin.h>
198 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
200 /// \param __A
201 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
202 /// 64 bits.
203 /// \param __B
204 /// A 128-bit vector of [2 x double] containing the multiplier in the low
205 /// 64 bits.
206 /// \param __C
207 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
208 /// 64 bits.
209 /// \returns A 128-bit vector of [2 x double] containing the result in the low
210 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
211 static __inline__ __m128d __DEFAULT_FN_ATTRS128
212 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
214 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
217 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
218 /// For each element, computes <c> -(__A * __B) + __C </c>.
220 /// \headerfile <immintrin.h>
222 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
224 /// \param __A
225 /// A 128-bit vector of [4 x float] containing the multiplicand.
226 /// \param __B
227 /// A 128-bit vector of [4 x float] containing the multiplier.
228 /// \param __C
229 /// A 128-bit vector of [4 x float] containing the addend.
230 /// \returns A 128-bit vector of [4 x float] containing the result.
231 static __inline__ __m128 __DEFAULT_FN_ATTRS128
232 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
234 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
237 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
238 /// For each element, computes <c> -(__A * __B) + __C </c>.
240 /// \headerfile <immintrin.h>
242 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
244 /// \param __A
245 /// A 128-bit vector of [2 x double] containing the multiplicand.
246 /// \param __B
247 /// A 128-bit vector of [2 x double] containing the multiplier.
248 /// \param __C
249 /// A 128-bit vector of [2 x double] containing the addend.
250 /// \returns A 128-bit vector of [2 x double] containing the result.
251 static __inline__ __m128d __DEFAULT_FN_ATTRS128
252 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
254 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
257 /// Computes a scalar negated multiply-add of the single-precision values in
258 /// the low 32 bits of 128-bit vectors of [4 x float].
260 /// \code{.operation}
261 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
262 /// result[127:32] = __A[127:32]
263 /// \endcode
265 /// \headerfile <immintrin.h>
267 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
269 /// \param __A
270 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
271 /// 32 bits.
272 /// \param __B
273 /// A 128-bit vector of [4 x float] containing the multiplier in the low
274 /// 32 bits.
275 /// \param __C
276 /// A 128-bit vector of [4 x float] containing the addend in the low
277 /// 32 bits.
278 /// \returns A 128-bit vector of [4 x float] containing the result in the low
279 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
280 static __inline__ __m128 __DEFAULT_FN_ATTRS128
281 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
/* The negation is applied to __B rather than __A: -(__v4sf)__A would negate
   every element, including __A[127:32], which the result is documented to
   carry through unchanged. Negating __B yields the same low-element product
   sign without disturbing those upper bits. */
283 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
286 /// Computes a scalar negated multiply-add of the double-precision values
287 /// in the low 64 bits of 128-bit vectors of [2 x double].
289 /// \code{.operation}
290 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
291 /// result[127:64] = __A[127:64]
292 /// \endcode
294 /// \headerfile <immintrin.h>
296 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
298 /// \param __A
299 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
300 /// 64 bits.
301 /// \param __B
302 /// A 128-bit vector of [2 x double] containing the multiplier in the low
303 /// 64 bits.
304 /// \param __C
305 /// A 128-bit vector of [2 x double] containing the addend in the low
306 /// 64 bits.
307 /// \returns A 128-bit vector of [2 x double] containing the result in the low
308 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
309 static __inline__ __m128d __DEFAULT_FN_ATTRS128
310 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
/* The negation is applied to __B rather than __A: -(__v2df)__A would negate
   both elements, including __A[127:64], which the result is documented to
   carry through unchanged. Negating __B yields the same low-element product
   sign without disturbing those upper bits. */
312 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
315 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
316 /// For each element, computes <c> -(__A * __B) - __C </c>.
318 /// \headerfile <immintrin.h>
320 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
322 /// \param __A
323 /// A 128-bit vector of [4 x float] containing the multiplicand.
324 /// \param __B
325 /// A 128-bit vector of [4 x float] containing the multiplier.
326 /// \param __C
327 /// A 128-bit vector of [4 x float] containing the subtrahend.
328 /// \returns A 128-bit vector of [4 x float] containing the result.
329 static __inline__ __m128 __DEFAULT_FN_ATTRS128
330 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
332 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
335 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
336 /// For each element, computes <c> -(__A * __B) - __C </c>.
338 /// \headerfile <immintrin.h>
340 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
342 /// \param __A
343 /// A 128-bit vector of [2 x double] containing the multiplicand.
344 /// \param __B
345 /// A 128-bit vector of [2 x double] containing the multiplier.
346 /// \param __C
347 /// A 128-bit vector of [2 x double] containing the subtrahend.
348 /// \returns A 128-bit vector of [2 x double] containing the result.
349 static __inline__ __m128d __DEFAULT_FN_ATTRS128
350 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
352 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
355 /// Computes a scalar negated multiply-subtract of the single-precision
356 /// values in the low 32 bits of 128-bit vectors of [4 x float].
358 /// \code{.operation}
359 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
360 /// result[127:32] = __A[127:32]
361 /// \endcode
363 /// \headerfile <immintrin.h>
365 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
367 /// \param __A
368 /// A 128-bit vector of [4 x float] containing the multiplicand in the low
369 /// 32 bits.
370 /// \param __B
371 /// A 128-bit vector of [4 x float] containing the multiplier in the low
372 /// 32 bits.
373 /// \param __C
374 /// A 128-bit vector of [4 x float] containing the subtrahend in the low
375 /// 32 bits.
376 /// \returns A 128-bit vector of [4 x float] containing the result in the low
377 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
378 static __inline__ __m128 __DEFAULT_FN_ATTRS128
379 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
/* -(A * B) - C is expressed as (A * -B) + (-C) on the multiply-add builtin.
   __B is negated instead of __A because -(__v4sf)__A would also negate
   __A[127:32], which the result is documented to carry through unchanged. */
381 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
384 /// Computes a scalar negated multiply-subtract of the double-precision
385 /// values in the low 64 bits of 128-bit vectors of [2 x double].
387 /// \code{.operation}
388 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
389 /// result[127:64] = __A[127:64]
390 /// \endcode
392 /// \headerfile <immintrin.h>
394 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
396 /// \param __A
397 /// A 128-bit vector of [2 x double] containing the multiplicand in the low
398 /// 64 bits.
399 /// \param __B
400 /// A 128-bit vector of [2 x double] containing the multiplier in the low
401 /// 64 bits.
402 /// \param __C
403 /// A 128-bit vector of [2 x double] containing the subtrahend in the low
404 /// 64 bits.
405 /// \returns A 128-bit vector of [2 x double] containing the result in the low
406 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
407 static __inline__ __m128d __DEFAULT_FN_ATTRS128
408 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
/* -(A * B) - C is expressed as (A * -B) + (-C) on the multiply-add builtin.
   __B is negated instead of __A because -(__v2df)__A would also negate
   __A[127:64], which the result is documented to carry through unchanged. */
410 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
413 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
414 /// [4 x float].
416 /// \code{.operation}
417 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
418 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
419 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
420 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
421 /// \endcode
423 /// \headerfile <immintrin.h>
425 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
427 /// \param __A
428 /// A 128-bit vector of [4 x float] containing the multiplicand.
429 /// \param __B
430 /// A 128-bit vector of [4 x float] containing the multiplier.
431 /// \param __C
432 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
433 /// \returns A 128-bit vector of [4 x float] containing the result.
434 static __inline__ __m128 __DEFAULT_FN_ATTRS128
435 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
437 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
440 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
441 /// [2 x double].
443 /// \code{.operation}
444 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
445 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
446 /// \endcode
448 /// \headerfile <immintrin.h>
450 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
452 /// \param __A
453 /// A 128-bit vector of [2 x double] containing the multiplicand.
454 /// \param __B
455 /// A 128-bit vector of [2 x double] containing the multiplier.
456 /// \param __C
457 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
458 /// \returns A 128-bit vector of [2 x double] containing the result.
459 static __inline__ __m128d __DEFAULT_FN_ATTRS128
460 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
462 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
465 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
466 /// [4 x float].
468 /// \code{.operation}
469 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
470 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
471 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
472 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
473 /// \endcode
475 /// \headerfile <immintrin.h>
477 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
479 /// \param __A
480 /// A 128-bit vector of [4 x float] containing the multiplicand.
481 /// \param __B
482 /// A 128-bit vector of [4 x float] containing the multiplier.
483 /// \param __C
484 /// A 128-bit vector of [4 x float] containing the addend/subtrahend.
485 /// \returns A 128-bit vector of [4 x float] containing the result.
486 static __inline__ __m128 __DEFAULT_FN_ATTRS128
487 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
489 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
492 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
493 /// [2 x double].
495 /// \code{.operation}
496 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
497 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
498 /// \endcode
500 /// \headerfile <immintrin.h>
502 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
504 /// \param __A
505 /// A 128-bit vector of [2 x double] containing the multiplicand.
506 /// \param __B
507 /// A 128-bit vector of [2 x double] containing the multiplier.
508 /// \param __C
509 /// A 128-bit vector of [2 x double] containing the addend/subtrahend.
510 /// \returns A 128-bit vector of [2 x double] containing the result.
511 static __inline__ __m128d __DEFAULT_FN_ATTRS128
512 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
514 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
517 /// Computes a multiply-add of 256-bit vectors of [8 x float].
518 /// For each element, computes <c> (__A * __B) + __C </c>.
520 /// \headerfile <immintrin.h>
522 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
524 /// \param __A
525 /// A 256-bit vector of [8 x float] containing the multiplicand.
526 /// \param __B
527 /// A 256-bit vector of [8 x float] containing the multiplier.
528 /// \param __C
529 /// A 256-bit vector of [8 x float] containing the addend.
530 /// \returns A 256-bit vector of [8 x float] containing the result.
531 static __inline__ __m256 __DEFAULT_FN_ATTRS256
532 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
534 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
537 /// Computes a multiply-add of 256-bit vectors of [4 x double].
538 /// For each element, computes <c> (__A * __B) + __C </c>.
540 /// \headerfile <immintrin.h>
542 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
544 /// \param __A
545 /// A 256-bit vector of [4 x double] containing the multiplicand.
546 /// \param __B
547 /// A 256-bit vector of [4 x double] containing the multiplier.
548 /// \param __C
549 /// A 256-bit vector of [4 x double] containing the addend.
550 /// \returns A 256-bit vector of [4 x double] containing the result.
551 static __inline__ __m256d __DEFAULT_FN_ATTRS256
552 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
554 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
557 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
558 /// For each element, computes <c> (__A * __B) - __C </c>.
560 /// \headerfile <immintrin.h>
562 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
564 /// \param __A
565 /// A 256-bit vector of [8 x float] containing the multiplicand.
566 /// \param __B
567 /// A 256-bit vector of [8 x float] containing the multiplier.
568 /// \param __C
569 /// A 256-bit vector of [8 x float] containing the subtrahend.
570 /// \returns A 256-bit vector of [8 x float] containing the result.
571 static __inline__ __m256 __DEFAULT_FN_ATTRS256
572 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
574 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
577 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
578 /// For each element, computes <c> (__A * __B) - __C </c>.
580 /// \headerfile <immintrin.h>
582 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
584 /// \param __A
585 /// A 256-bit vector of [4 x double] containing the multiplicand.
586 /// \param __B
587 /// A 256-bit vector of [4 x double] containing the multiplier.
588 /// \param __C
589 /// A 256-bit vector of [4 x double] containing the subtrahend.
590 /// \returns A 256-bit vector of [4 x double] containing the result.
591 static __inline__ __m256d __DEFAULT_FN_ATTRS256
592 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
594 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
597 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
598 /// For each element, computes <c> -(__A * __B) + __C </c>.
600 /// \headerfile <immintrin.h>
602 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
604 /// \param __A
605 /// A 256-bit vector of [8 x float] containing the multiplicand.
606 /// \param __B
607 /// A 256-bit vector of [8 x float] containing the multiplier.
608 /// \param __C
609 /// A 256-bit vector of [8 x float] containing the addend.
610 /// \returns A 256-bit vector of [8 x float] containing the result.
611 static __inline__ __m256 __DEFAULT_FN_ATTRS256
612 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
614 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
617 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
618 /// For each element, computes <c> -(__A * __B) + __C </c>.
620 /// \headerfile <immintrin.h>
622 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
624 /// \param __A
625 /// A 256-bit vector of [4 x double] containing the multiplicand.
626 /// \param __B
627 /// A 256-bit vector of [4 x double] containing the multiplier.
628 /// \param __C
629 /// A 256-bit vector of [4 x double] containing the addend.
630 /// \returns A 256-bit vector of [4 x double] containing the result.
631 static __inline__ __m256d __DEFAULT_FN_ATTRS256
632 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
634 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
637 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
638 /// For each element, computes <c> -(__A * __B) - __C </c>.
640 /// \headerfile <immintrin.h>
642 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
644 /// \param __A
645 /// A 256-bit vector of [8 x float] containing the multiplicand.
646 /// \param __B
647 /// A 256-bit vector of [8 x float] containing the multiplier.
648 /// \param __C
649 /// A 256-bit vector of [8 x float] containing the subtrahend.
650 /// \returns A 256-bit vector of [8 x float] containing the result.
651 static __inline__ __m256 __DEFAULT_FN_ATTRS256
652 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
654 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
657 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
658 /// For each element, computes <c> -(__A * __B) - __C </c>.
660 /// \headerfile <immintrin.h>
662 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
664 /// \param __A
665 /// A 256-bit vector of [4 x double] containing the multiplicand.
666 /// \param __B
667 /// A 256-bit vector of [4 x double] containing the multiplier.
668 /// \param __C
669 /// A 256-bit vector of [4 x double] containing the subtrahend.
670 /// \returns A 256-bit vector of [4 x double] containing the result.
671 static __inline__ __m256d __DEFAULT_FN_ATTRS256
672 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
674 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
677 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
678 /// [8 x float].
680 /// \code{.operation}
681 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
682 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
683 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
684 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
685 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
686 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
687 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
688 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
689 /// \endcode
691 /// \headerfile <immintrin.h>
693 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
695 /// \param __A
696 /// A 256-bit vector of [8 x float] containing the multiplicand.
697 /// \param __B
698 /// A 256-bit vector of [8 x float] containing the multiplier.
699 /// \param __C
700 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
701 /// \returns A 256-bit vector of [8 x float] containing the result.
702 static __inline__ __m256 __DEFAULT_FN_ATTRS256
703 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
705 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
708 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
709 /// [4 x double].
711 /// \code{.operation}
712 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
713 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
714 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
715 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
716 /// \endcode
718 /// \headerfile <immintrin.h>
720 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
722 /// \param __A
723 /// A 256-bit vector of [4 x double] containing the multiplicand.
724 /// \param __B
725 /// A 256-bit vector of [4 x double] containing the multiplier.
726 /// \param __C
727 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
728 /// \returns A 256-bit vector of [4 x double] containing the result.
729 static __inline__ __m256d __DEFAULT_FN_ATTRS256
730 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
732 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
735 /// Computes a vector multiply with alternating add/subtract of 256-bit
736 /// vectors of [8 x float].
738 /// \code{.operation}
739 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
740 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
741 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
742 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
743 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
744 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
745 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
746 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
747 /// \endcode
749 /// \headerfile <immintrin.h>
751 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
753 /// \param __A
754 /// A 256-bit vector of [8 x float] containing the multiplicand.
755 /// \param __B
756 /// A 256-bit vector of [8 x float] containing the multiplier.
757 /// \param __C
758 /// A 256-bit vector of [8 x float] containing the addend/subtrahend.
759 /// \returns A 256-bit vector of [8 x float] containing the result.
760 static __inline__ __m256 __DEFAULT_FN_ATTRS256
761 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
763 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
766 /// Computes a vector multiply with alternating add/subtract of 256-bit
767 /// vectors of [4 x double].
769 /// \code{.operation}
770 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
771 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
772 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
773 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
774 /// \endcode
776 /// \headerfile <immintrin.h>
778 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
780 /// \param __A
781 /// A 256-bit vector of [4 x double] containing the multiplicand.
782 /// \param __B
783 /// A 256-bit vector of [4 x double] containing the multiplier.
784 /// \param __C
785 /// A 256-bit vector of [4 x double] containing the addend/subtrahend.
786 /// \returns A 256-bit vector of [4 x double] containing the result.
787 static __inline__ __m256d __DEFAULT_FN_ATTRS256
788 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
790 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
793 #undef __DEFAULT_FN_ATTRS128
794 #undef __DEFAULT_FN_ATTRS256
796 #endif /* __FMAINTRIN_H */