/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <emmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse3,no-evex512"), __min_vector_width__(128)))

/// Loads data from an unaligned memory location to elements in a 128-bit
///    vector.
///
///    If the address of the data is not 16-byte aligned, the instruction may
///    read two adjacent aligned blocks of memory to retrieve the requested
///    data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i_u const *__p)
{
  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}

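/* Illustrative usage sketch (not part of this header): loading 16 bytes from
 * a possibly unaligned buffer. The helper name load_block() is hypothetical;
 * on some microarchitectures this form can avoid the penalty of a load that
 * splits a cache line.
 *
 *   static inline __m128i load_block(const unsigned char *buf) {
 *     return _mm_lddqu_si128((const __m128i_u *)buf);  // any alignment is OK
 *   }
 */
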
/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
///    A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}

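/* Illustrative usage sketch (not part of this header): the alternating
 * subtract/add pattern of _mm_addsub_ps on concrete values.
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);      // a = {1, 2, 3, 4}
 *   __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);  // b = {10, 20, 30, 40}
 *   __m128 r = _mm_addsub_ps(a, b);
 *   // r = {1 - 10, 2 + 20, 3 - 30, 4 + 40} = {-9, 22, -27, 44}
 */
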
/// Horizontally adds the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}

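/* Illustrative usage sketch (not part of this header): reducing a vector of
 * [4 x float] to the sum of its elements with two horizontal adds. The helper
 * name hsum4_ps() is hypothetical.
 *
 *   static inline float hsum4_ps(__m128 v) {
 *     __m128 t = _mm_hadd_ps(v, v);  // {v0+v1, v2+v3, v0+v1, v2+v3}
 *     t = _mm_hadd_ps(t, t);         // every lane now holds v0+v1+v2+v3
 *     return _mm_cvtss_f32(t);       // extract the low lane
 *   }
 */
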
/// Horizontally subtracts the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the lower
///    bits of the destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the upper
///    bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}

/// Moves and duplicates odd-indexed values from a 128-bit vector
///    of [4 x float] to float values stored in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}

/// Duplicates even-indexed values from a 128-bit vector of
///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}

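/* Illustrative usage sketch (not part of this header): multiplying packed
 * complex numbers stored as {re0, im0, re1, im1} using _mm_moveldup_ps,
 * _mm_movehdup_ps, and _mm_addsub_ps. The helper name cmul_ps() is
 * hypothetical.
 *
 *   static inline __m128 cmul_ps(__m128 a, __m128 b) {
 *     __m128 re = _mm_moveldup_ps(a);                   // {re0, re0, re1, re1}
 *     __m128 im = _mm_movehdup_ps(a);                   // {im0, im0, im1, im1}
 *     __m128 bs = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1));
 *                                                       // swap re/im within pairs
 *     return _mm_addsub_ps(_mm_mul_ps(re, b), _mm_mul_ps(im, bs));
 *     // lane 0: re0*b.re0 - im0*b.im0, lane 1: re0*b.im0 + im0*b.re0, ...
 *   }
 */
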
/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
///    A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
///    and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally adds the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
///    both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}

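/* Illustrative usage sketch (not part of this header): reducing a vector of
 * [2 x double] to the sum of its two elements. The helper name hsum2_pd() is
 * hypothetical.
 *
 *   static inline double hsum2_pd(__m128d v) {
 *     return _mm_cvtsd_f64(_mm_hadd_pd(v, v));  // low lane holds v0 + v1
 *   }
 */
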
/// Horizontally subtracts the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the lower bits of
///    the destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the upper bits of
///    the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
///    differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}

/// Moves and duplicates one double-precision value to double-precision
///    values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)

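/* Illustrative usage sketch (not part of this header): broadcasting a scalar
 * from memory into both lanes of a [2 x double] vector, e.g. to scale data by
 * a constant. The variable names are hypothetical.
 *
 *   double factor = 2.5;
 *   __m128d k = _mm_loaddup_pd(&factor);   // k = {2.5, 2.5}
 *   __m128d x = _mm_set_pd(4.0, 3.0);      // x = {3.0, 4.0}
 *   __m128d y = _mm_mul_pd(x, k);          // y = {7.5, 10.0}
 */
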
/// Moves and duplicates the double-precision value in the lower bits of
///    a 128-bit vector of [2 x double] to double-precision values stored in a
///    128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
///    [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}

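/* Illustrative usage sketch (not part of this header): splatting the low lane
 * of a register-resident vector, for example to divide both elements by the
 * first one.
 *
 *   __m128d v = _mm_set_pd(8.0, 2.0);      // v = {2.0, 8.0}
 *   __m128d d = _mm_movedup_pd(v);         // d = {2.0, 2.0}
 *   __m128d r = _mm_div_pd(v, d);          // r = {1.0, 4.0}
 */
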
/// Establishes a linear address memory range to be monitored and puts
///    the processor in the monitor event pending state. Data stored in the
///    monitored address range causes the processor to exit the pending state.
///
/// The \c MONITOR instruction can be used in kernel mode, and in other modes
/// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MONITOR instruction.
///
/// \param __p
///    The memory range to be monitored. The size of the range is determined by
///    CPUID function 0000_0005h.
/// \param __extensions
///    Optional extensions for the monitoring state.
/// \param __hints
///    Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_monitor(__p, __extensions, __hints);
}

/// Used with the \c MONITOR instruction to wait while the processor is in
///    the monitor event pending state. Data stored in the monitored address
///    range, or an interrupt, causes the processor to exit the pending state.
///
/// The \c MWAIT instruction can be used in kernel mode, and in other modes if
/// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MWAIT instruction.
///
/// \param __extensions
///    Optional extensions for the monitoring state, which can vary by
///    processor.
/// \param __hints
///    Optional hints for the monitoring state, which can vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_mwait(__extensions, __hints);
}

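/* Illustrative usage sketch (not part of this header): arming MONITOR on a
 * flag and issuing MWAIT until the flag is written or an interrupt occurs.
 * This normally requires kernel mode (or the MSR setting noted above); the
 * flag variable and the zero extension/hint arguments are illustrative
 * assumptions.
 *
 *   volatile int flag = 0;
 *   while (!flag) {
 *     _mm_monitor((const void *)&flag, 0, 0);  // arm the monitor on &flag
 *     if (!flag)                               // re-check before waiting
 *       _mm_mwait(0, 0);                       // wait for a store or interrupt
 *   }
 */
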
#undef __DEFAULT_FN_ATTRS

#endif /* __PMMINTRIN_H */