Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / X86 / sse41-builtins.c
blob bfe7a917a88555e0d20464906e74c315f4ae478c
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
7 #include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
11 __m128i test_mm_blend_epi16(__m128i V1, __m128i V2) {
12 // CHECK-LABEL: test_mm_blend_epi16
13 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
14 return _mm_blend_epi16(V1, V2, 42);
17 __m128d test_mm_blend_pd(__m128d V1, __m128d V2) {
18 // CHECK-LABEL: test_mm_blend_pd
19 // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 3>
20 return _mm_blend_pd(V1, V2, 2);
23 __m128 test_mm_blend_ps(__m128 V1, __m128 V2) {
24 // CHECK-LABEL: test_mm_blend_ps
25 // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
26 return _mm_blend_ps(V1, V2, 6);
29 __m128i test_mm_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) {
30 // CHECK-LABEL: test_mm_blendv_epi8
31 // CHECK: call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
32 return _mm_blendv_epi8(V1, V2, V3);
35 __m128d test_mm_blendv_pd(__m128d V1, __m128d V2, __m128d V3) {
36 // CHECK-LABEL: test_mm_blendv_pd
37 // CHECK: call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
38 return _mm_blendv_pd(V1, V2, V3);
41 __m128 test_mm_blendv_ps(__m128 V1, __m128 V2, __m128 V3) {
42 // CHECK-LABEL: test_mm_blendv_ps
43 // CHECK: call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
44 return _mm_blendv_ps(V1, V2, V3);
47 __m128d test_mm_ceil_pd(__m128d x) {
48 // CHECK-LABEL: test_mm_ceil_pd
49 // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
50 return _mm_ceil_pd(x);
53 __m128 test_mm_ceil_ps(__m128 x) {
54 // CHECK-LABEL: test_mm_ceil_ps
55 // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
56 return _mm_ceil_ps(x);
59 __m128d test_mm_ceil_sd(__m128d x, __m128d y) {
60 // CHECK-LABEL: test_mm_ceil_sd
61 // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
62 return _mm_ceil_sd(x, y);
65 __m128 test_mm_ceil_ss(__m128 x, __m128 y) {
66 // CHECK-LABEL: test_mm_ceil_ss
67 // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
68 return _mm_ceil_ss(x, y);
71 __m128i test_mm_cmpeq_epi64(__m128i A, __m128i B) {
72 // CHECK-LABEL: test_mm_cmpeq_epi64
73 // CHECK: icmp eq <2 x i64>
74 // CHECK: sext <2 x i1> %{{.*}} to <2 x i64>
75 return _mm_cmpeq_epi64(A, B);
78 __m128i test_mm_cvtepi8_epi16(__m128i a) {
79 // CHECK-LABEL: test_mm_cvtepi8_epi16
80 // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
81 // CHECK: sext <8 x i8> {{.*}} to <8 x i16>
82 return _mm_cvtepi8_epi16(a);
85 __m128i test_mm_cvtepi8_epi32(__m128i a) {
86 // CHECK-LABEL: test_mm_cvtepi8_epi32
87 // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
88 // CHECK: sext <4 x i8> {{.*}} to <4 x i32>
89 return _mm_cvtepi8_epi32(a);
92 __m128i test_mm_cvtepi8_epi64(__m128i a) {
93 // CHECK-LABEL: test_mm_cvtepi8_epi64
94 // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <2 x i32> <i32 0, i32 1>
95 // CHECK: sext <2 x i8> {{.*}} to <2 x i64>
96 return _mm_cvtepi8_epi64(a);
99 __m128i test_mm_cvtepi16_epi32(__m128i a) {
100 // CHECK-LABEL: test_mm_cvtepi16_epi32
101 // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
102 // CHECK: sext <4 x i16> {{.*}} to <4 x i32>
103 return _mm_cvtepi16_epi32(a);
106 __m128i test_mm_cvtepi16_epi64(__m128i a) {
107 // CHECK-LABEL: test_mm_cvtepi16_epi64
108 // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <2 x i32> <i32 0, i32 1>
109 // CHECK: sext <2 x i16> {{.*}} to <2 x i64>
110 return _mm_cvtepi16_epi64(a);
113 __m128i test_mm_cvtepi32_epi64(__m128i a) {
114 // CHECK-LABEL: test_mm_cvtepi32_epi64
115 // CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <2 x i32> <i32 0, i32 1>
116 // CHECK: sext <2 x i32> {{.*}} to <2 x i64>
117 return _mm_cvtepi32_epi64(a);
120 __m128i test_mm_cvtepu8_epi16(__m128i a) {
121 // CHECK-LABEL: test_mm_cvtepu8_epi16
122 // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
123 // CHECK: zext <8 x i8> {{.*}} to <8 x i16>
124 return _mm_cvtepu8_epi16(a);
127 __m128i test_mm_cvtepu8_epi32(__m128i a) {
128 // CHECK-LABEL: test_mm_cvtepu8_epi32
129 // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
130 // CHECK: zext <4 x i8> {{.*}} to <4 x i32>
131 return _mm_cvtepu8_epi32(a);
134 __m128i test_mm_cvtepu8_epi64(__m128i a) {
135 // CHECK-LABEL: test_mm_cvtepu8_epi64
136 // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <2 x i32> <i32 0, i32 1>
137 // CHECK: zext <2 x i8> {{.*}} to <2 x i64>
138 return _mm_cvtepu8_epi64(a);
141 __m128i test_mm_cvtepu16_epi32(__m128i a) {
142 // CHECK-LABEL: test_mm_cvtepu16_epi32
143 // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
144 // CHECK: zext <4 x i16> {{.*}} to <4 x i32>
145 return _mm_cvtepu16_epi32(a);
148 __m128i test_mm_cvtepu16_epi64(__m128i a) {
149 // CHECK-LABEL: test_mm_cvtepu16_epi64
150 // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <2 x i32> <i32 0, i32 1>
151 // CHECK: zext <2 x i16> {{.*}} to <2 x i64>
152 return _mm_cvtepu16_epi64(a);
155 __m128i test_mm_cvtepu32_epi64(__m128i a) {
156 // CHECK-LABEL: test_mm_cvtepu32_epi64
157 // CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <2 x i32> <i32 0, i32 1>
158 // CHECK: zext <2 x i32> {{.*}} to <2 x i64>
159 return _mm_cvtepu32_epi64(a);
162 __m128d test_mm_dp_pd(__m128d x, __m128d y) {
163 // CHECK-LABEL: test_mm_dp_pd
164 // CHECK: call <2 x double> @llvm.x86.sse41.dppd(<2 x double> {{.*}}, <2 x double> {{.*}}, i8 7)
165 return _mm_dp_pd(x, y, 7);
168 __m128 test_mm_dp_ps(__m128 x, __m128 y) {
169 // CHECK-LABEL: test_mm_dp_ps
170 // CHECK: call <4 x float> @llvm.x86.sse41.dpps(<4 x float> {{.*}}, <4 x float> {{.*}}, i8 7)
171 return _mm_dp_ps(x, y, 7);
174 int test_mm_extract_epi8(__m128i x) {
175 // CHECK-LABEL: test_mm_extract_epi8
176 // CHECK: extractelement <16 x i8> %{{.*}}, {{i32|i64}} 1
177 // CHECK: zext i8 %{{.*}} to i32
178 return _mm_extract_epi8(x, 1);
181 int test_mm_extract_epi32(__m128i x) {
182 // CHECK-LABEL: test_mm_extract_epi32
183 // CHECK: extractelement <4 x i32> %{{.*}}, {{i32|i64}} 1
184 return _mm_extract_epi32(x, 1);
187 long long test_mm_extract_epi64(__m128i x) {
188 // CHECK-LABEL: test_mm_extract_epi64
189 // CHECK: extractelement <2 x i64> %{{.*}}, {{i32|i64}} 1
190 return _mm_extract_epi64(x, 1);
193 int test_mm_extract_ps(__m128 x) {
194 // CHECK-LABEL: test_mm_extract_ps
195 // CHECK: extractelement <4 x float> %{{.*}}, {{i32|i64}} 1
196 return _mm_extract_ps(x, 1);
199 __m128d test_mm_floor_pd(__m128d x) {
200 // CHECK-LABEL: test_mm_floor_pd
201 // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
202 return _mm_floor_pd(x);
205 __m128 test_mm_floor_ps(__m128 x) {
206 // CHECK-LABEL: test_mm_floor_ps
207 // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
208 return _mm_floor_ps(x);
211 __m128d test_mm_floor_sd(__m128d x, __m128d y) {
212 // CHECK-LABEL: test_mm_floor_sd
213 // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
214 return _mm_floor_sd(x, y);
217 __m128 test_mm_floor_ss(__m128 x, __m128 y) {
218 // CHECK-LABEL: test_mm_floor_ss
219 // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
220 return _mm_floor_ss(x, y);
223 __m128i test_mm_insert_epi8(__m128i x, char b) {
224 // CHECK-LABEL: test_mm_insert_epi8
225 // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, {{i32|i64}} 1
226 return _mm_insert_epi8(x, b, 1);
229 __m128i test_mm_insert_epi32(__m128i x, int b) {
230 // CHECK-LABEL: test_mm_insert_epi32
231 // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, {{i32|i64}} 1
232 return _mm_insert_epi32(x, b, 1);
235 #ifdef __x86_64__
236 __m128i test_mm_insert_epi64(__m128i x, long long b) {
237 // X64-LABEL: test_mm_insert_epi64
238 // X64: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, {{i32|i64}} 1
239 return _mm_insert_epi64(x, b, 1);
241 #endif
243 __m128 test_mm_insert_ps(__m128 x, __m128 y) {
244 // CHECK-LABEL: test_mm_insert_ps
245 // CHECK: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
246 return _mm_insert_ps(x, y, 4);
249 __m128i test_mm_max_epi8(__m128i x, __m128i y) {
250 // CHECK-LABEL: test_mm_max_epi8
251 // CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
252 return _mm_max_epi8(x, y);
255 __m128i test_mm_max_epi32(__m128i x, __m128i y) {
256 // CHECK-LABEL: test_mm_max_epi32
257 // CHECK: call <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
258 return _mm_max_epi32(x, y);
261 __m128i test_mm_max_epu16(__m128i x, __m128i y) {
262 // CHECK-LABEL: test_mm_max_epu16
263 // CHECK: call <8 x i16> @llvm.umax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
264 return _mm_max_epu16(x, y);
267 __m128i test_mm_max_epu32(__m128i x, __m128i y) {
268 // CHECK-LABEL: test_mm_max_epu32
269 // CHECK: call <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
270 return _mm_max_epu32(x, y);
273 __m128i test_mm_min_epi8(__m128i x, __m128i y) {
274 // CHECK-LABEL: test_mm_min_epi8
275 // CHECK: call <16 x i8> @llvm.smin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
276 return _mm_min_epi8(x, y);
279 __m128i test_mm_min_epi32(__m128i x, __m128i y) {
280 // CHECK-LABEL: test_mm_min_epi32
281 // CHECK: call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
282 return _mm_min_epi32(x, y);
285 __m128i test_mm_min_epu16(__m128i x, __m128i y) {
286 // CHECK-LABEL: test_mm_min_epu16
287 // CHECK: call <8 x i16> @llvm.umin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
288 return _mm_min_epu16(x, y);
291 __m128i test_mm_min_epu32(__m128i x, __m128i y) {
292 // CHECK-LABEL: test_mm_min_epu32
293 // CHECK: call <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
294 return _mm_min_epu32(x, y);
297 __m128i test_mm_minpos_epu16(__m128i x) {
298 // CHECK-LABEL: test_mm_minpos_epu16
299 // CHECK: call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %{{.*}})
300 return _mm_minpos_epu16(x);
303 __m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) {
304 // CHECK-LABEL: test_mm_mpsadbw_epu8
305 // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 1)
306 return _mm_mpsadbw_epu8(x, y, 1);
309 __m128i test_mm_mul_epi32(__m128i x, __m128i y) {
310 // CHECK-LABEL: test_mm_mul_epi32
311 // CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
312 // CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
313 // CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
314 // CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
315 // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
316 return _mm_mul_epi32(x, y);
319 __m128i test_mm_mullo_epi32(__m128i x, __m128i y) {
320 // CHECK-LABEL: test_mm_mullo_epi32
321 // CHECK: mul <4 x i32>
322 return _mm_mullo_epi32(x, y);
325 __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
326 // CHECK-LABEL: test_mm_packus_epi32
327 // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
328 return _mm_packus_epi32(x, y);
331 __m128d test_mm_round_pd(__m128d x) {
332 // CHECK-LABEL: test_mm_round_pd
333 // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4)
334 return _mm_round_pd(x, 4);
337 __m128 test_mm_round_ps(__m128 x) {
338 // CHECK-LABEL: test_mm_round_ps
339 // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
340 return _mm_round_ps(x, 4);
343 __m128d test_mm_round_sd(__m128d x, __m128d y) {
344 // CHECK-LABEL: test_mm_round_sd
345 // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4)
346 return _mm_round_sd(x, y, 4);
349 __m128 test_mm_round_ss(__m128 x, __m128 y) {
350 // CHECK-LABEL: test_mm_round_ss
351 // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4)
352 return _mm_round_ss(x, y, 4);
355 __m128i test_mm_stream_load_si128(__m128i const *a) {
356 // CHECK-LABEL: test_mm_stream_load_si128
357 // CHECK: load <2 x i64>, ptr %{{.*}}, align 16, !nontemporal
358 return _mm_stream_load_si128(a);
361 __m128i test_mm_stream_load_si128_void(const void *a) {
362 // CHECK-LABEL: test_mm_stream_load_si128_void
363 // CHECK: load <2 x i64>, ptr %{{.*}}, align 16, !nontemporal
364 return _mm_stream_load_si128(a);
367 int test_mm_test_all_ones(__m128i x) {
368 // CHECK-LABEL: test_mm_test_all_ones
369 // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
370 return _mm_test_all_ones(x);
373 int test_mm_test_all_zeros(__m128i x, __m128i y) {
374 // CHECK-LABEL: test_mm_test_all_zeros
375 // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
376 return _mm_test_all_zeros(x, y);
379 int test_mm_test_mix_ones_zeros(__m128i x, __m128i y) {
380 // CHECK-LABEL: test_mm_test_mix_ones_zeros
381 // CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
382 return _mm_test_mix_ones_zeros(x, y);
385 int test_mm_testc_si128(__m128i x, __m128i y) {
386 // CHECK-LABEL: test_mm_testc_si128
387 // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
388 return _mm_testc_si128(x, y);
391 int test_mm_testnzc_si128(__m128i x, __m128i y) {
392 // CHECK-LABEL: test_mm_testnzc_si128
393 // CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
394 return _mm_testnzc_si128(x, y);
397 int test_mm_testz_si128(__m128i x, __m128i y) {
398 // CHECK-LABEL: test_mm_testz_si128
399 // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
400 return _mm_testz_si128(x, y);
403 // Make sure brackets work after macro intrinsics.
404 float pr51324(__m128 a) {
405 // CHECK-LABEL: pr51324
406 // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
407 // CHECK: extractelement <4 x float> %{{.*}}, i32 0
408 return _mm_round_ps(a, 0)[0];
411 // Ensure _mm_test_all_ones macro doesn't reuse argument
412 __m128i expensive_call();
413 int pr60006() {
414 // CHECK-LABEL: pr60006
415 // CHECK: call {{.*}} @expensive_call
416 // CHECK-NOT: call {{.*}} @expensive_call
417 // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
418 return _mm_test_all_ones(expensive_call());