1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86
3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64
5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
7 define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
8 ; CHECK-LABEL: test_mm256_add_pd:
; CHECK: # %bb.0:
10 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
11 ; CHECK-NEXT: ret{{[l|q]}}
12 %res = fadd <4 x double> %a0, %a1
ret <4 x double> %res
}
16 define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
17 ; CHECK-LABEL: test_mm256_add_ps:
; CHECK: # %bb.0:
19 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
20 ; CHECK-NEXT: ret{{[l|q]}}
21 %res = fadd <8 x float> %a0, %a1
ret <8 x float> %res
}
25 define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
26 ; CHECK-LABEL: test_mm256_addsub_pd:
; CHECK: # %bb.0:
28 ; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
29 ; CHECK-NEXT: ret{{[l|q]}}
30 %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
ret <4 x double> %res
}
33 declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
35 define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
36 ; CHECK-LABEL: test_mm256_addsub_ps:
; CHECK: # %bb.0:
38 ; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
39 ; CHECK-NEXT: ret{{[l|q]}}
40 %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
ret <8 x float> %res
}
43 declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
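; The FP logic tests below express and/andnot/or as integer bitwise ops on
; bitcasts; AVX1 lowers them back to float logic instructions (vandps etc.).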
45 define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
46 ; CHECK-LABEL: test_mm256_and_pd:
; CHECK: # %bb.0:
48 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
49 ; CHECK-NEXT: ret{{[l|q]}}
50 %1 = bitcast <4 x double> %a0 to <4 x i64>
51 %2 = bitcast <4 x double> %a1 to <4 x i64>
52 %res = and <4 x i64> %1, %2
53 %bc = bitcast <4 x i64> %res to <4 x double>
ret <4 x double> %bc
}
57 define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
58 ; CHECK-LABEL: test_mm256_and_ps:
; CHECK: # %bb.0:
60 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
61 ; CHECK-NEXT: ret{{[l|q]}}
62 %1 = bitcast <8 x float> %a0 to <8 x i32>
63 %2 = bitcast <8 x float> %a1 to <8 x i32>
64 %res = and <8 x i32> %1, %2
65 %bc = bitcast <8 x i32> %res to <8 x float>
ret <8 x float> %bc
}
69 define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
70 ; CHECK-LABEL: test_mm256_andnot_pd:
; CHECK: # %bb.0:
72 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
73 ; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
74 ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
75 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
76 ; CHECK-NEXT: ret{{[l|q]}}
77 %1 = bitcast <4 x double> %a0 to <4 x i64>
78 %2 = bitcast <4 x double> %a1 to <4 x i64>
79 %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
80 %res = and <4 x i64> %3, %2
81 %bc = bitcast <4 x i64> %res to <4 x double>
ret <4 x double> %bc
}
85 define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
86 ; CHECK-LABEL: test_mm256_andnot_ps:
; CHECK: # %bb.0:
88 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
89 ; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
90 ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
91 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
92 ; CHECK-NEXT: ret{{[l|q]}}
93 %1 = bitcast <8 x float> %a0 to <8 x i32>
94 %2 = bitcast <8 x float> %a1 to <8 x i32>
95 %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
96 %res = and <8 x i32> %3, %2
97 %bc = bitcast <8 x i32> %res to <8 x float>
ret <8 x float> %bc
}
101 define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
102 ; CHECK-LABEL: test_mm256_blend_pd:
; CHECK: # %bb.0:
104 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
105 ; CHECK-NEXT: ret{{[l|q]}}
106 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
107 ret <4 x double> %res
}
110 define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
111 ; CHECK-LABEL: test_mm256_blend_ps:
; CHECK: # %bb.0:
113 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
114 ; CHECK-NEXT: ret{{[l|q]}}
115 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
ret <8 x float> %res
}
119 define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
120 ; CHECK-LABEL: test_mm256_blendv_pd:
; CHECK: # %bb.0:
122 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
123 ; CHECK-NEXT: ret{{[l|q]}}
124 %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
125 ret <4 x double> %res
}
127 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
129 define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
130 ; CHECK-LABEL: test_mm256_blendv_ps:
; CHECK: # %bb.0:
132 ; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
133 ; CHECK-NEXT: ret{{[l|q]}}
134 %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
137 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
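; Tests taking a pointer argument diverge per target (X86/X64 prefixes):
; i386 first loads the pointer from the stack, x86-64 receives it in %rdi.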
139 define <4 x double> @test_mm256_broadcast_pd(ptr %a0) nounwind {
140 ; X86-LABEL: test_mm256_broadcast_pd:
; X86: # %bb.0:
142 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
143 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT: retl
;
146 ; X64-LABEL: test_mm256_broadcast_pd:
; X64: # %bb.0:
148 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
150 %ld = load <2 x double>, ptr %a0
151 %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
152 ret <4 x double> %res
}
155 define <8 x float> @test_mm256_broadcast_ps(ptr %a0) nounwind {
156 ; X86-LABEL: test_mm256_broadcast_ps:
; X86: # %bb.0:
158 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
159 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT: retl
;
162 ; X64-LABEL: test_mm256_broadcast_ps:
; X64: # %bb.0:
164 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
166 %ld = load <4 x float>, ptr %a0
167 %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %res
}
171 define <4 x double> @test_mm256_broadcast_sd(ptr %a0) nounwind {
172 ; X86-LABEL: test_mm256_broadcast_sd:
; X86: # %bb.0:
174 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
175 ; X86-NEXT: vbroadcastsd (%eax), %ymm0
; X86-NEXT: retl
;
178 ; X64-LABEL: test_mm256_broadcast_sd:
; X64: # %bb.0:
180 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
182 %ld = load double, ptr %a0
183 %ins0 = insertelement <4 x double> undef, double %ld, i32 0
184 %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
185 %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
186 %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
187 ret <4 x double> %ins3
}
190 define <4 x float> @test_mm_broadcast_ss(ptr %a0) nounwind {
191 ; X86-LABEL: test_mm_broadcast_ss:
; X86: # %bb.0:
193 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
194 ; X86-NEXT: vbroadcastss (%eax), %xmm0
; X86-NEXT: retl
;
197 ; X64-LABEL: test_mm_broadcast_ss:
; X64: # %bb.0:
199 ; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
201 %ld = load float, ptr %a0
202 %ins0 = insertelement <4 x float> undef, float %ld, i32 0
203 %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
204 %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
205 %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
206 ret <4 x float> %ins3
}
209 define <8 x float> @test_mm256_broadcast_ss(ptr %a0) nounwind {
210 ; X86-LABEL: test_mm256_broadcast_ss:
; X86: # %bb.0:
212 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
213 ; X86-NEXT: vbroadcastss (%eax), %ymm0
; X86-NEXT: retl
;
216 ; X64-LABEL: test_mm256_broadcast_ss:
; X64: # %bb.0:
218 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
220 %ld = load float, ptr %a0
221 %ins0 = insertelement <8 x float> undef, float %ld, i32 0
222 %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
223 %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
224 %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
225 %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
226 %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
227 %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
228 %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
229 ret <8 x float> %ins7
}
232 define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
233 ; CHECK-LABEL: test_mm256_castpd_ps:
; CHECK: # %bb.0:
235 ; CHECK-NEXT: ret{{[l|q]}}
236 %res = bitcast <4 x double> %a0 to <8 x float>
ret <8 x float> %res
}
240 define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
241 ; CHECK-LABEL: test_mm256_castpd_si256:
; CHECK: # %bb.0:
243 ; CHECK-NEXT: ret{{[l|q]}}
244 %res = bitcast <4 x double> %a0 to <4 x i64>
ret <4 x i64> %res
}
248 define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
249 ; CHECK-LABEL: test_mm256_castpd128_pd256:
; CHECK: # %bb.0:
251 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
252 ; CHECK-NEXT: ret{{[l|q]}}
253 %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
254 ret <4 x double> %res
}
257 define <4 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind {
258 ; CHECK-LABEL: test_mm256_castpd128_pd256_freeze:
; CHECK: # %bb.0:
260 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
261 ; CHECK-NEXT: ret{{[l|q]}}
262 %a1 = freeze <2 x double> poison
263 %res = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
264 ret <4 x double> %res
}
267 define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
268 ; CHECK-LABEL: test_mm256_castpd256_pd128:
; CHECK: # %bb.0:
270 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
271 ; CHECK-NEXT: vzeroupper
272 ; CHECK-NEXT: ret{{[l|q]}}
273 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
274 ret <2 x double> %res
}
277 define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
278 ; CHECK-LABEL: test_mm256_castps_pd:
; CHECK: # %bb.0:
280 ; CHECK-NEXT: ret{{[l|q]}}
281 %res = bitcast <8 x float> %a0 to <4 x double>
282 ret <4 x double> %res
}
285 define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
286 ; CHECK-LABEL: test_mm256_castps_si256:
; CHECK: # %bb.0:
288 ; CHECK-NEXT: ret{{[l|q]}}
289 %res = bitcast <8 x float> %a0 to <4 x i64>
ret <4 x i64> %res
}
293 define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
294 ; CHECK-LABEL: test_mm256_castps128_ps256:
; CHECK: # %bb.0:
296 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
297 ; CHECK-NEXT: ret{{[l|q]}}
298 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %res
}
302 define <8 x float> @test_mm256_castps128_ps256_freeze(<4 x float> %a0) nounwind {
303 ; CHECK-LABEL: test_mm256_castps128_ps256_freeze:
; CHECK: # %bb.0:
305 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
306 ; CHECK-NEXT: ret{{[l|q]}}
307 %a1 = freeze <4 x float> poison
308 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %res
}
312 define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
313 ; CHECK-LABEL: test_mm256_castps256_ps128:
; CHECK: # %bb.0:
315 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
316 ; CHECK-NEXT: vzeroupper
317 ; CHECK-NEXT: ret{{[l|q]}}
318 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %res
}
322 define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
323 ; CHECK-LABEL: test_mm256_castsi128_si256:
; CHECK: # %bb.0:
325 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
326 ; CHECK-NEXT: ret{{[l|q]}}
327 %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
ret <4 x i64> %res
}
331 define <4 x i64> @test_mm256_castsi128_si256_freeze(<2 x i64> %a0) nounwind {
332 ; CHECK-LABEL: test_mm256_castsi128_si256_freeze:
; CHECK: # %bb.0:
334 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
335 ; CHECK-NEXT: ret{{[l|q]}}
336 %a1 = freeze <2 x i64> poison
337 %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %res
}
341 define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
342 ; CHECK-LABEL: test_mm256_castsi256_pd:
; CHECK: # %bb.0:
344 ; CHECK-NEXT: ret{{[l|q]}}
345 %res = bitcast <4 x i64> %a0 to <4 x double>
346 ret <4 x double> %res
}
349 define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
350 ; CHECK-LABEL: test_mm256_castsi256_ps:
; CHECK: # %bb.0:
352 ; CHECK-NEXT: ret{{[l|q]}}
353 %res = bitcast <4 x i64> %a0 to <8 x float>
ret <8 x float> %res
}
357 define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
358 ; CHECK-LABEL: test_mm256_castsi256_si128:
; CHECK: # %bb.0:
360 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
361 ; CHECK-NEXT: vzeroupper
362 ; CHECK-NEXT: ret{{[l|q]}}
363 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
ret <2 x i64> %res
}
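; Rounding immediates below: 1 = _MM_FROUND_FLOOR, 2 = _MM_FROUND_CEIL,
; 4 = _MM_FROUND_CUR_DIRECTION (used by test_mm256_round_*).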
367 define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
368 ; CHECK-LABEL: test_mm256_ceil_pd:
; CHECK: # %bb.0:
370 ; CHECK-NEXT: vroundpd $2, %ymm0, %ymm0
371 ; CHECK-NEXT: ret{{[l|q]}}
372 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
373 ret <4 x double> %res
}
375 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
377 define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
378 ; CHECK-LABEL: test_mm256_ceil_ps:
; CHECK: # %bb.0:
380 ; CHECK-NEXT: vroundps $2, %ymm0, %ymm0
381 ; CHECK-NEXT: ret{{[l|q]}}
382 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
ret <8 x float> %res
}
385 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
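; Comparison immediate 13 is _CMP_GE_OS, hence the vcmpge* instructions below.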
387 define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
388 ; CHECK-LABEL: test_mm_cmp_pd:
; CHECK: # %bb.0:
390 ; CHECK-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0
391 ; CHECK-NEXT: ret{{[l|q]}}
392 %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
393 ret <2 x double> %res
}
395 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
397 define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
398 ; CHECK-LABEL: test_mm256_cmp_pd:
; CHECK: # %bb.0:
400 ; CHECK-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0
401 ; CHECK-NEXT: ret{{[l|q]}}
402 %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
403 ret <4 x double> %res
}
405 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
407 define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
408 ; CHECK-LABEL: test_mm_cmp_ps:
; CHECK: # %bb.0:
410 ; CHECK-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0
411 ; CHECK-NEXT: ret{{[l|q]}}
412 %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
ret <4 x float> %res
}
415 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
417 define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
418 ; CHECK-LABEL: test_mm256_cmp_ps:
; CHECK: # %bb.0:
420 ; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0
421 ; CHECK-NEXT: ret{{[l|q]}}
422 %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
ret <8 x float> %res
}
425 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
427 define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
428 ; CHECK-LABEL: test_mm_cmp_sd:
; CHECK: # %bb.0:
430 ; CHECK-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0
431 ; CHECK-NEXT: ret{{[l|q]}}
432 %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
433 ret <2 x double> %res
}
435 declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
437 define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
438 ; CHECK-LABEL: test_mm_cmp_ss:
; CHECK: # %bb.0:
440 ; CHECK-NEXT: vcmpgess %xmm1, %xmm0, %xmm0
441 ; CHECK-NEXT: ret{{[l|q]}}
442 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
ret <4 x float> %res
}
445 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
447 define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
448 ; CHECK-LABEL: test_mm256_cvtepi32_pd:
; CHECK: # %bb.0:
450 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
451 ; CHECK-NEXT: ret{{[l|q]}}
452 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
453 %res = sitofp <4 x i32> %arg0 to <4 x double>
454 ret <4 x double> %res
}
457 define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
458 ; CHECK-LABEL: test_mm256_cvtepi32_ps:
; CHECK: # %bb.0:
460 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
461 ; CHECK-NEXT: ret{{[l|q]}}
462 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
463 %res = sitofp <8 x i32> %arg0 to <8 x float>
ret <8 x float> %res
}
467 define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
468 ; CHECK-LABEL: test_mm256_cvtpd_epi32:
; CHECK: # %bb.0:
470 ; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0
471 ; CHECK-NEXT: vzeroupper
472 ; CHECK-NEXT: ret{{[l|q]}}
473 %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
474 %res = bitcast <4 x i32> %cvt to <2 x i64>
ret <2 x i64> %res
}
477 declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
479 define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
480 ; CHECK-LABEL: test_mm256_cvtpd_ps:
; CHECK: # %bb.0:
482 ; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0
483 ; CHECK-NEXT: vzeroupper
484 ; CHECK-NEXT: ret{{[l|q]}}
485 %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
ret <4 x float> %res
}
488 declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
490 define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
491 ; CHECK-LABEL: test_mm256_cvtps_epi32:
; CHECK: # %bb.0:
493 ; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0
494 ; CHECK-NEXT: ret{{[l|q]}}
495 %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
496 %res = bitcast <8 x i32> %cvt to <4 x i64>
ret <4 x i64> %res
}
499 declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
501 define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
502 ; CHECK-LABEL: test_mm256_cvtps_pd:
; CHECK: # %bb.0:
504 ; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
505 ; CHECK-NEXT: ret{{[l|q]}}
506 %res = fpext <4 x float> %a0 to <4 x double>
507 ret <4 x double> %res
}
510 define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
511 ; CHECK-LABEL: test_mm256_cvttpd_epi32:
; CHECK: # %bb.0:
513 ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
514 ; CHECK-NEXT: vzeroupper
515 ; CHECK-NEXT: ret{{[l|q]}}
516 %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
517 %res = bitcast <4 x i32> %cvt to <2 x i64>
ret <2 x i64> %res
}
520 declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
522 define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
523 ; CHECK-LABEL: test_mm256_cvttps_epi32:
; CHECK: # %bb.0:
525 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
526 ; CHECK-NEXT: ret{{[l|q]}}
527 %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
528 %res = bitcast <8 x i32> %cvt to <4 x i64>
ret <4 x i64> %res
}
531 declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
533 define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
534 ; CHECK-LABEL: test_mm256_div_pd:
; CHECK: # %bb.0:
536 ; CHECK-NEXT: vdivpd %ymm1, %ymm0, %ymm0
537 ; CHECK-NEXT: ret{{[l|q]}}
538 %res = fdiv <4 x double> %a0, %a1
539 ret <4 x double> %res
}
542 define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
543 ; CHECK-LABEL: test_mm256_div_ps:
; CHECK: # %bb.0:
545 ; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0
546 ; CHECK-NEXT: ret{{[l|q]}}
547 %res = fdiv <8 x float> %a0, %a1
ret <8 x float> %res
}
551 define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
552 ; CHECK-LABEL: test_mm256_dp_ps:
; CHECK: # %bb.0:
554 ; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
555 ; CHECK-NEXT: ret{{[l|q]}}
556 %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
ret <8 x float> %res
}
559 declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
561 define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
562 ; CHECK-LABEL: test_mm256_extract_epi8:
; CHECK: # %bb.0:
564 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
565 ; CHECK-NEXT: vpextrb $15, %xmm0, %eax
566 ; CHECK-NEXT: movzbl %al, %eax
567 ; CHECK-NEXT: vzeroupper
568 ; CHECK-NEXT: ret{{[l|q]}}
569 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
570 %ext = extractelement <32 x i8> %arg0, i32 31
571 %res = zext i8 %ext to i32
ret i32 %res
}
575 define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
576 ; CHECK-LABEL: test_mm256_extract_epi16:
; CHECK: # %bb.0:
578 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
579 ; CHECK-NEXT: vpextrw $3, %xmm0, %eax
580 ; CHECK-NEXT: movzwl %ax, %eax
581 ; CHECK-NEXT: vzeroupper
582 ; CHECK-NEXT: ret{{[l|q]}}
583 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
584 %ext = extractelement <16 x i16> %arg0, i32 11
585 %res = zext i16 %ext to i32
ret i32 %res
}
589 define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
590 ; CHECK-LABEL: test_mm256_extract_epi32:
; CHECK: # %bb.0:
592 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
593 ; CHECK-NEXT: vextractps $1, %xmm0, %eax
594 ; CHECK-NEXT: vzeroupper
595 ; CHECK-NEXT: ret{{[l|q]}}
596 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
597 %res = extractelement <8 x i32> %arg0, i32 5
ret i32 %res
}
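; i386 has no 64-bit GPRs, so the i64 extract below is returned in %edx:%eax
; via two 32-bit vextractps.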
601 define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
602 ; X86-LABEL: test_mm256_extract_epi64:
; X86: # %bb.0:
604 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
605 ; X86-NEXT: vextractps $2, %xmm0, %eax
606 ; X86-NEXT: vextractps $3, %xmm0, %edx
607 ; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
610 ; X64-LABEL: test_mm256_extract_epi64:
; X64: # %bb.0:
612 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
613 ; X64-NEXT: vpextrq $1, %xmm0, %rax
614 ; X64-NEXT: vzeroupper
; X64-NEXT: retq
616 %res = extractelement <4 x i64> %a0, i32 3
ret i64 %res
}
620 define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
621 ; CHECK-LABEL: test_mm256_extractf128_pd:
; CHECK: # %bb.0:
623 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
624 ; CHECK-NEXT: vzeroupper
625 ; CHECK-NEXT: ret{{[l|q]}}
626 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
627 ret <2 x double> %res
}
630 define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
631 ; CHECK-LABEL: test_mm256_extractf128_ps:
; CHECK: # %bb.0:
633 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
634 ; CHECK-NEXT: vzeroupper
635 ; CHECK-NEXT: ret{{[l|q]}}
636 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x float> %res
}
640 define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
641 ; CHECK-LABEL: test_mm256_extractf128_si256:
; CHECK: # %bb.0:
643 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
644 ; CHECK-NEXT: vzeroupper
645 ; CHECK-NEXT: ret{{[l|q]}}
646 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
ret <2 x i64> %res
}
650 define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
651 ; CHECK-LABEL: test_mm256_floor_pd:
; CHECK: # %bb.0:
653 ; CHECK-NEXT: vroundpd $1, %ymm0, %ymm0
654 ; CHECK-NEXT: ret{{[l|q]}}
655 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
656 ret <4 x double> %res
}
659 define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
660 ; CHECK-LABEL: test_mm256_floor_ps:
; CHECK: # %bb.0:
662 ; CHECK-NEXT: vroundps $1, %ymm0, %ymm0
663 ; CHECK-NEXT: ret{{[l|q]}}
664 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
ret <8 x float> %res
}
668 define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
669 ; CHECK-LABEL: test_mm256_hadd_pd:
; CHECK: # %bb.0:
671 ; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
672 ; CHECK-NEXT: ret{{[l|q]}}
673 %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
674 ret <4 x double> %res
}
676 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
678 define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
679 ; CHECK-LABEL: test_mm256_hadd_ps:
; CHECK: # %bb.0:
681 ; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0
682 ; CHECK-NEXT: ret{{[l|q]}}
683 %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
ret <8 x float> %res
}
686 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
688 define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
689 ; CHECK-LABEL: test_mm256_hsub_pd:
; CHECK: # %bb.0:
691 ; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
692 ; CHECK-NEXT: ret{{[l|q]}}
693 %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
694 ret <4 x double> %res
}
696 declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
698 define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
699 ; CHECK-LABEL: test_mm256_hsub_ps:
; CHECK: # %bb.0:
701 ; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0
702 ; CHECK-NEXT: ret{{[l|q]}}
703 %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
ret <8 x float> %res
}
706 declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
708 define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
709 ; X86-LABEL: test_mm256_insert_epi8:
; X86: # %bb.0:
711 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
712 ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
713 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT: retl
;
716 ; X64-LABEL: test_mm256_insert_epi8:
; X64: # %bb.0:
718 ; X64-NEXT: vpinsrb $4, %edi, %xmm0, %xmm1
719 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT: retq
721 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
722 %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
723 %bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}
727 define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
728 ; X86-LABEL: test_mm256_insert_epi16:
; X86: # %bb.0:
730 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
731 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
732 ; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
733 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
736 ; X64-LABEL: test_mm256_insert_epi16:
; X64: # %bb.0:
738 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
739 ; X64-NEXT: vpinsrw $6, %edi, %xmm1, %xmm1
740 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
742 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
743 %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
744 %bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
748 define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
749 ; X86-LABEL: test_mm256_insert_epi32:
; X86: # %bb.0:
751 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
752 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT: retl
;
755 ; X64-LABEL: test_mm256_insert_epi32:
; X64: # %bb.0:
757 ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm1
758 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT: retq
760 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
761 %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
762 %bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
766 define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
767 ; X86-LABEL: test_mm256_insert_epi64:
; X86: # %bb.0:
769 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
770 ; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
771 ; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
772 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
775 ; X64-LABEL: test_mm256_insert_epi64:
; X64: # %bb.0:
777 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
778 ; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
779 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
781 %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
ret <4 x i64> %res
}
785 define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
786 ; CHECK-LABEL: test_mm256_insertf128_pd:
; CHECK: # %bb.0:
788 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
789 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
790 ; CHECK-NEXT: ret{{[l|q]}}
791 %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
792 %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
793 ret <4 x double> %res
}
796 define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
797 ; CHECK-LABEL: test_mm256_insertf128_ps:
; CHECK: # %bb.0:
799 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
800 ; CHECK-NEXT: ret{{[l|q]}}
801 %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
802 %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
ret <8 x float> %res
}
806 define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
807 ; CHECK-LABEL: test_mm256_insertf128_si256:
; CHECK: # %bb.0:
809 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
810 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
811 ; CHECK-NEXT: ret{{[l|q]}}
812 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
813 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
ret <4 x i64> %res
}
817 define <4 x i64> @test_mm256_lddqu_si256(ptr %a0) nounwind {
818 ; X86-LABEL: test_mm256_lddqu_si256:
; X86: # %bb.0:
820 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
821 ; X86-NEXT: vlddqu (%eax), %ymm0
; X86-NEXT: retl
;
824 ; X64-LABEL: test_mm256_lddqu_si256:
; X64: # %bb.0:
826 ; X64-NEXT: vlddqu (%rdi), %ymm0
; X64-NEXT: retq
828 %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr %a0)
829 %bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}
832 declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readnone
834 define <4 x double> @test_mm256_load_pd(ptr %a0) nounwind {
835 ; X86-LABEL: test_mm256_load_pd:
; X86: # %bb.0:
837 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
838 ; X86-NEXT: vmovaps (%eax), %ymm0
; X86-NEXT: retl
;
841 ; X64-LABEL: test_mm256_load_pd:
; X64: # %bb.0:
843 ; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
845 %res = load <4 x double>, ptr %a0, align 32
846 ret <4 x double> %res
}
849 define <8 x float> @test_mm256_load_ps(ptr %a0) nounwind {
850 ; X86-LABEL: test_mm256_load_ps:
; X86: # %bb.0:
852 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
853 ; X86-NEXT: vmovaps (%eax), %ymm0
; X86-NEXT: retl
;
856 ; X64-LABEL: test_mm256_load_ps:
; X64: # %bb.0:
858 ; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
860 %res = load <8 x float>, ptr %a0, align 32
ret <8 x float> %res
}
864 define <4 x i64> @test_mm256_load_si256(ptr %a0) nounwind {
865 ; X86-LABEL: test_mm256_load_si256:
; X86: # %bb.0:
867 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
868 ; X86-NEXT: vmovaps (%eax), %ymm0
; X86-NEXT: retl
;
871 ; X64-LABEL: test_mm256_load_si256:
; X64: # %bb.0:
873 ; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
875 %res = load <4 x i64>, ptr %a0, align 32
ret <4 x i64> %res
}
879 define <4 x double> @test_mm256_loadu_pd(ptr %a0) nounwind {
880 ; X86-LABEL: test_mm256_loadu_pd:
; X86: # %bb.0:
882 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
883 ; X86-NEXT: vmovups (%eax), %ymm0
; X86-NEXT: retl
;
886 ; X64-LABEL: test_mm256_loadu_pd:
; X64: # %bb.0:
888 ; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
890 %res = load <4 x double>, ptr %a0, align 1
891 ret <4 x double> %res
}
894 define <8 x float> @test_mm256_loadu_ps(ptr %a0) nounwind {
895 ; X86-LABEL: test_mm256_loadu_ps:
; X86: # %bb.0:
897 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
898 ; X86-NEXT: vmovups (%eax), %ymm0
; X86-NEXT: retl
;
901 ; X64-LABEL: test_mm256_loadu_ps:
; X64: # %bb.0:
903 ; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
905 %res = load <8 x float>, ptr %a0, align 1
ret <8 x float> %res
}
909 define <4 x i64> @test_mm256_loadu_si256(ptr %a0) nounwind {
910 ; X86-LABEL: test_mm256_loadu_si256:
; X86: # %bb.0:
912 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
913 ; X86-NEXT: vmovups (%eax), %ymm0
; X86-NEXT: retl
;
916 ; X64-LABEL: test_mm256_loadu_si256:
; X64: # %bb.0:
918 ; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
920 %res = load <4 x i64>, ptr %a0, align 1
ret <4 x i64> %res
}
924 define <8 x float> @test_mm256_loadu2_m128(ptr %a0, ptr %a1) nounwind {
925 ; X86-LABEL: test_mm256_loadu2_m128:
; X86: # %bb.0:
927 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
928 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
929 ; X86-NEXT: vmovups (%eax), %xmm0
930 ; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT: retl
;
933 ; X64-LABEL: test_mm256_loadu2_m128:
; X64: # %bb.0:
935 ; X64-NEXT: vmovups (%rsi), %xmm0
936 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
938 %hi4 = load <4 x float>, ptr %a0, align 1
939 %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
940 %lo4 = load <4 x float>, ptr %a1, align 1
941 %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
942 %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
ret <8 x float> %res
}
946 define <4 x double> @test_mm256_loadu2_m128d(ptr %a0, ptr %a1) nounwind {
947 ; X86-LABEL: test_mm256_loadu2_m128d:
; X86: # %bb.0:
949 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
950 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
951 ; X86-NEXT: vmovups (%eax), %xmm0
952 ; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT: retl
;
955 ; X64-LABEL: test_mm256_loadu2_m128d:
; X64: # %bb.0:
957 ; X64-NEXT: vmovups (%rsi), %xmm0
958 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
960 %hi2 = load <2 x double>, ptr %a0, align 1
961 %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
962 %lo2 = load <2 x double>, ptr %a1, align 1
963 %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
964 %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
965 ret <4 x double> %res
}
968 define <4 x i64> @test_mm256_loadu2_m128i(ptr %a0, ptr %a1) nounwind {
969 ; X86-LABEL: test_mm256_loadu2_m128i:
; X86: # %bb.0:
971 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
972 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
973 ; X86-NEXT: vmovups (%eax), %xmm0
974 ; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT: retl
;
977 ; X64-LABEL: test_mm256_loadu2_m128i:
; X64: # %bb.0:
979 ; X64-NEXT: vmovups (%rsi), %xmm0
980 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
982 %hi2 = load <2 x i64>, ptr %a0, align 1
983 %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
984 %lo2 = load <2 x i64>, ptr %a1, align 1
985 %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
986 %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i64> %res
}
990 define <2 x double> @test_mm_maskload_pd(ptr %a0, <2 x i64> %a1) nounwind {
991 ; X86-LABEL: test_mm_maskload_pd:
; X86: # %bb.0:
993 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
994 ; X86-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
; X86-NEXT: retl
;
997 ; X64-LABEL: test_mm_maskload_pd:
; X64: # %bb.0:
999 ; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
1001 %res = call <2 x double> @llvm.x86.avx.maskload.pd(ptr %a0, <2 x i64> %a1)
1002 ret <2 x double> %res
}
1004 declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readnone
1006 define <4 x double> @test_mm256_maskload_pd(ptr %a0, <4 x i64> %a1) nounwind {
1007 ; X86-LABEL: test_mm256_maskload_pd:
; X86: # %bb.0:
1009 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1010 ; X86-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
; X86-NEXT: retl
;
1013 ; X64-LABEL: test_mm256_maskload_pd:
; X64: # %bb.0:
1015 ; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
1017 %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %a1)
1018 ret <4 x double> %res
}
1020 declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind readnone
1022 define <4 x float> @test_mm_maskload_ps(ptr %a0, <2 x i64> %a1) nounwind {
1023 ; X86-LABEL: test_mm_maskload_ps:
; X86: # %bb.0:
1025 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1026 ; X86-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
; X86-NEXT: retl
;
1029 ; X64-LABEL: test_mm_maskload_ps:
; X64: # %bb.0:
1031 ; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
1033 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1034 %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %arg1)
1035 ret <4 x float> %res
}
1037 declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readnone
1039 define <8 x float> @test_mm256_maskload_ps(ptr %a0, <4 x i64> %a1) nounwind {
1040 ; X86-LABEL: test_mm256_maskload_ps:
; X86: # %bb.0:
1042 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1043 ; X86-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
; X86-NEXT: retl
;
1046 ; X64-LABEL: test_mm256_maskload_ps:
; X64: # %bb.0:
1048 ; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
1050 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1051 %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %arg1)
1052 ret <8 x float> %res
}
1054 declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind readnone
1056 define void @test_mm_maskstore_pd(ptr %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
1057 ; X86-LABEL: test_mm_maskstore_pd:
; X86: # %bb.0:
1059 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1060 ; X86-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
1063 ; X64-LABEL: test_mm_maskstore_pd:
; X64: # %bb.0:
1065 ; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
1067 call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %a1, <2 x double> %a2)
ret void
}
1070 declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind readnone
1072 define void @test_mm256_maskstore_pd(ptr %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
1073 ; X86-LABEL: test_mm256_maskstore_pd:
; X86: # %bb.0:
1075 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1076 ; X86-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
1077 ; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
1080 ; X64-LABEL: test_mm256_maskstore_pd:
; X64: # %bb.0:
1082 ; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
1083 ; X64-NEXT: vzeroupper
; X64-NEXT: retq
1085 call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %a1, <4 x double> %a2)
ret void
}
1088 declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwind readnone
1090 define void @test_mm_maskstore_ps(ptr %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
1091 ; X86-LABEL: test_mm_maskstore_ps:
; X86: # %bb.0:
1093 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1094 ; X86-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
1097 ; X64-LABEL: test_mm_maskstore_ps:
; X64: # %bb.0:
1099 ; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
1101 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1102 call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %arg1, <4 x float> %a2)
ret void
}
1105 declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind readnone
1107 define void @test_mm256_maskstore_ps(ptr %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
1108 ; X86-LABEL: test_mm256_maskstore_ps:
; X86: # %bb.0:
1110 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1111 ; X86-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
1112 ; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
1115 ; X64-LABEL: test_mm256_maskstore_ps:
; X64: # %bb.0:
1117 ; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
1118 ; X64-NEXT: vzeroupper
; X64-NEXT: retq
1120 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1121 call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %arg1, <8 x float> %a2)
ret void
}
1124 declare void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>) nounwind readnone
1126 define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1127 ; CHECK-LABEL: test_mm256_max_pd:
; CHECK: # %bb.0:
1129 ; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
1130 ; CHECK-NEXT: ret{{[l|q]}}
1131 %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
1132 ret <4 x double> %res
}
1134 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
1136 define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1137 ; CHECK-LABEL: test_mm256_max_ps:
; CHECK: # %bb.0:
1139 ; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
1140 ; CHECK-NEXT: ret{{[l|q]}}
1141 %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
1142 ret <8 x float> %res
}
1144 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
1146 define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1147 ; CHECK-LABEL: test_mm256_min_pd:
; CHECK: # %bb.0:
1149 ; CHECK-NEXT: vminpd %ymm1, %ymm0, %ymm0
1150 ; CHECK-NEXT: ret{{[l|q]}}
1151 %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
1152 ret <4 x double> %res
}
1154 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
1156 define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1157 ; CHECK-LABEL: test_mm256_min_ps:
; CHECK: # %bb.0:
1159 ; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0
1160 ; CHECK-NEXT: ret{{[l|q]}}
1161 %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
1162 ret <8 x float> %res
}
1164 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
1166 define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
1167 ; CHECK-LABEL: test_mm256_movedup_pd:
; CHECK: # %bb.0:
1169 ; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
1170 ; CHECK-NEXT: ret{{[l|q]}}
1171 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
1172 ret <4 x double> %res
}
1175 define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
1176 ; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK: # %bb.0:
1178 ; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1179 ; CHECK-NEXT: ret{{[l|q]}}
1180 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
1181 ret <8 x float> %res
}
1184 define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
1185 ; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK: # %bb.0:
1187 ; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
1188 ; CHECK-NEXT: ret{{[l|q]}}
1189 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1190 ret <8 x float> %res
}
1193 define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
1194 ; CHECK-LABEL: test_mm256_movemask_pd:
; CHECK: # %bb.0:
1196 ; CHECK-NEXT: vmovmskpd %ymm0, %eax
1197 ; CHECK-NEXT: vzeroupper
1198 ; CHECK-NEXT: ret{{[l|q]}}
1199 %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
ret i32 %res
}
1202 declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
1204 define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
1205 ; CHECK-LABEL: test_mm256_movemask_ps:
; CHECK: # %bb.0:
1207 ; CHECK-NEXT: vmovmskps %ymm0, %eax
1208 ; CHECK-NEXT: vzeroupper
1209 ; CHECK-NEXT: ret{{[l|q]}}
1210 %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
ret i32 %res
}
1213 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
1215 define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1216 ; CHECK-LABEL: test_mm256_mul_pd:
; CHECK: # %bb.0:
1218 ; CHECK-NEXT: vmulpd %ymm1, %ymm0, %ymm0
1219 ; CHECK-NEXT: ret{{[l|q]}}
1220 %res = fmul <4 x double> %a0, %a1
1221 ret <4 x double> %res
}
1224 define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1225 ; CHECK-LABEL: test_mm256_mul_ps:
; CHECK: # %bb.0:
1227 ; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0
1228 ; CHECK-NEXT: ret{{[l|q]}}
1229 %res = fmul <8 x float> %a0, %a1
1230 ret <8 x float> %res
}
1233 define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1234 ; CHECK-LABEL: test_mm256_or_pd:
; CHECK: # %bb.0:
1236 ; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
1237 ; CHECK-NEXT: ret{{[l|q]}}
1238 %1 = bitcast <4 x double> %a0 to <4 x i64>
1239 %2 = bitcast <4 x double> %a1 to <4 x i64>
1240 %res = or <4 x i64> %1, %2
1241 %bc = bitcast <4 x i64> %res to <4 x double>
1242 ret <4 x double> %bc
}
1245 define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1246 ; CHECK-LABEL: test_mm256_or_ps:
; CHECK: # %bb.0:
1248 ; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
1249 ; CHECK-NEXT: ret{{[l|q]}}
1250 %1 = bitcast <8 x float> %a0 to <8 x i32>
1251 %2 = bitcast <8 x float> %a1 to <8 x i32>
1252 %res = or <8 x i32> %1, %2
1253 %bc = bitcast <8 x i32> %res to <8 x float>
ret <8 x float> %bc
}
1257 define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
1258 ; CHECK-LABEL: test_mm_permute_pd:
; CHECK: # %bb.0:
1260 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1261 ; CHECK-NEXT: ret{{[l|q]}}
1262 %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
1263 ret <2 x double> %res
}
1266 define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
1267 ; CHECK-LABEL: test_mm256_permute_pd:
; CHECK: # %bb.0:
1269 ; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1270 ; CHECK-NEXT: ret{{[l|q]}}
1271 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
1272 ret <4 x double> %res
}
1275 define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
1276 ; CHECK-LABEL: test_mm_permute_ps:
; CHECK: # %bb.0:
1278 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1279 ; CHECK-NEXT: ret{{[l|q]}}
1280 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1281 ret <4 x float> %res
}
1284 define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
1285 ; CHECK-LABEL: test2_mm_permute_ps:
; CHECK: # %bb.0:
1287 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,3]
1288 ; CHECK-NEXT: ret{{[l|q]}}
1289 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
1290 ret <4 x float> %res
}
1293 define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
1294 ; CHECK-LABEL: test_mm256_permute_ps:
; CHECK: # %bb.0:
1296 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1297 ; CHECK-NEXT: ret{{[l|q]}}
1298 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
1299 ret <8 x float> %res
}
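; With a zero or repeated source, the vperm2f128 immediate constant-folds:
; the pd case reads a zeroed half and the ps case becomes a plain vmovaps.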
1302 define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1303 ; CHECK-LABEL: test_mm256_permute2f128_pd:
; CHECK: # %bb.0:
1305 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
1306 ; CHECK-NEXT: ret{{[l|q]}}
1307 %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1308 ret <4 x double> %res
}
1310 declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
1313 define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1314 ; CHECK-LABEL: test_mm256_permute2f128_ps:
; CHECK: # %bb.0:
1316 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
1317 ; CHECK-NEXT: ret{{[l|q]}}
1318 %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
1319 ret <8 x float> %res
}
1321 declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
1323 define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1324 ; CHECK-LABEL: test_mm256_permute2f128_si256:
; CHECK: # %bb.0:
1326 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
1327 ; CHECK-NEXT: ret{{[l|q]}}
1328 %1 = bitcast <4 x i64> %a0 to <8 x i32>
1329 %2 = bitcast <4 x i64> %a1 to <8 x i32>
1330 %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
1331 %bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
1334 declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
1336 define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
1337 ; CHECK-LABEL: test_mm_permutevar_pd:
; CHECK: # %bb.0:
1339 ; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
1340 ; CHECK-NEXT: ret{{[l|q]}}
1341 %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
1342 ret <2 x double> %res
}
1344 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
1346 define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
1347 ; CHECK-LABEL: test_mm256_permutevar_pd:
; CHECK: # %bb.0:
1349 ; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1350 ; CHECK-NEXT: ret{{[l|q]}}
1351 %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
1352 ret <4 x double> %res
}
1354 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
1356 define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
1357 ; CHECK-LABEL: test_mm_permutevar_ps:
; CHECK: # %bb.0:
1359 ; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0
1360 ; CHECK-NEXT: ret{{[l|q]}}
1361 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1362 %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
1363 ret <4 x float> %res
}
1365 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
1367 define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
1368 ; CHECK-LABEL: test_mm256_permutevar_ps:
; CHECK: # %bb.0:
1370 ; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1371 ; CHECK-NEXT: ret{{[l|q]}}
1372 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1373 %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
1374 ret <8 x float> %res
}
1376 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
1378 define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
1379 ; CHECK-LABEL: test_mm256_rcp_ps:
; CHECK: # %bb.0:
1381 ; CHECK-NEXT: vrcpps %ymm0, %ymm0
1382 ; CHECK-NEXT: ret{{[l|q]}}
1383 %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
1384 ret <8 x float> %res
}
1386 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
1388 define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
1389 ; CHECK-LABEL: test_mm256_round_pd:
; CHECK: # %bb.0:
1391 ; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0
1392 ; CHECK-NEXT: ret{{[l|q]}}
1393 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
1394 ret <4 x double> %res
}
1397 define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
1398 ; CHECK-LABEL: test_mm256_round_ps:
; CHECK: # %bb.0:
1400 ; CHECK-NEXT: vroundps $4, %ymm0, %ymm0
1401 ; CHECK-NEXT: ret{{[l|q]}}
1402 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
1403 ret <8 x float> %res
}
1406 define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
1407 ; CHECK-LABEL: test_mm256_rsqrt_ps:
; CHECK: # %bb.0:
1409 ; CHECK-NEXT: vrsqrtps %ymm0, %ymm0
1410 ; CHECK-NEXT: ret{{[l|q]}}
1411 %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
1412 ret <8 x float> %res
}
1414 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
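; Building a <32 x i8> from scalars: on x86-64 the six register args are
; inserted via vpinsrb from %edi..%r9d and the rest are loaded from the
; stack; on i386 all 32 come off the stack.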
1416 define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
1417 ; X86-LABEL: test_mm256_set_epi8:
; X86: # %bb.0:
1419 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1420 ; X86-NEXT: vmovd %eax, %xmm0
1421 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1422 ; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
1423 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1424 ; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
1425 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1426 ; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
1427 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1428 ; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
1429 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1430 ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
1431 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1432 ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
1433 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1434 ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
1435 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1436 ; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
1437 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1438 ; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
1439 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1440 ; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
1441 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1442 ; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
1443 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1444 ; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
1445 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1446 ; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
1447 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1448 ; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
1449 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1450 ; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
1451 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1452 ; X86-NEXT: vmovd %eax, %xmm1
1453 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1454 ; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
1455 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1456 ; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
1457 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1458 ; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
1459 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1460 ; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
1461 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1462 ; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
1463 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1464 ; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
1465 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1466 ; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
1467 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1468 ; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
1469 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1470 ; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
1471 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1472 ; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
1473 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1474 ; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
1475 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1476 ; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
1477 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1478 ; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
1479 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1480 ; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
1481 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1482 ; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
1483 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
1486 ; X64-LABEL: test_mm256_set_epi8:
; X64: # %bb.0:
1488 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1489 ; X64-NEXT: vmovd %eax, %xmm0
1490 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1491 ; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
1492 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1493 ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
1494 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1495 ; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
1496 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1497 ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
1498 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1499 ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
1500 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1501 ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
1502 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1503 ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
1504 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1505 ; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
1506 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1507 ; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
1508 ; X64-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
1509 ; X64-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
1510 ; X64-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
1511 ; X64-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
1512 ; X64-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0
1513 ; X64-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
1514 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1515 ; X64-NEXT: vmovd %eax, %xmm1
1516 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1517 ; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
1518 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1519 ; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
1520 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1521 ; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
1522 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1523 ; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
1524 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1525 ; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
1526 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1527 ; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
1528 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1529 ; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
1530 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1531 ; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
1532 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1533 ; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
1534 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1535 ; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
1536 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1537 ; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
1538 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1539 ; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
1540 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1541 ; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
1542 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1543 ; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
1544 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
1545 ; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
1546 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
1548 %res0 = insertelement <32 x i8> undef, i8 %a31, i32 0
1549 %res1 = insertelement <32 x i8> %res0, i8 %a30, i32 1
1550 %res2 = insertelement <32 x i8> %res1, i8 %a29, i32 2
1551 %res3 = insertelement <32 x i8> %res2, i8 %a28, i32 3
1552 %res4 = insertelement <32 x i8> %res3, i8 %a27, i32 4
1553 %res5 = insertelement <32 x i8> %res4, i8 %a26, i32 5
1554 %res6 = insertelement <32 x i8> %res5, i8 %a25, i32 6
1555 %res7 = insertelement <32 x i8> %res6, i8 %a24, i32 7
1556 %res8 = insertelement <32 x i8> %res7, i8 %a23, i32 8
1557 %res9 = insertelement <32 x i8> %res8, i8 %a22, i32 9
1558 %res10 = insertelement <32 x i8> %res9, i8 %a21, i32 10
1559 %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
1560 %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
1561 %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
1562 %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
1563 %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
1564 %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
1565 %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
1566 %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
1567 %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
1568 %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
1569 %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
1570 %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
1571 %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
1572 %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
1573 %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
1574 %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
1575 %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
1576 %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
1577 %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
1578 %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
1579 %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
1580 %res = bitcast <32 x i8> %res31 to <4 x i64>
define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_set_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi16:
; X64:       # %bb.0:
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a15, i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a14, i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a13, i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a12, i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a11, i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a10, i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a9 , i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a8 , i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a7 , i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a6 , i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a5 , i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_set_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %ecx, %xmm0
; X64-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
; X64-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %r8d, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_set_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %rcx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}

define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128i:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_set_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a3, i32 0
  %res1 = insertelement <4 x double> %res0, double %a2, i32 1
  %res2 = insertelement <4 x double> %res1, double %a1, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_set_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a7, i32 0
  %res1 = insertelement <8 x float> %res0, float %a6, i32 1
  %res2 = insertelement <8 x float> %res1, float %a5, i32 2
  %res3 = insertelement <8 x float> %res2, float %a4, i32 3
  %res4 = insertelement <8 x float> %res3, float %a3, i32 4
  %res5 = insertelement <8 x float> %res4, float %a2, i32 5
  %res6 = insertelement <8 x float> %res5, float %a1, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}

define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi8:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <32 x i8> undef, i8 %a0, i32 0
  %res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1
  %res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2
  %res3 = insertelement <32 x i8> %res2, i8 %a0, i32 3
  %res4 = insertelement <32 x i8> %res3, i8 %a0, i32 4
  %res5 = insertelement <32 x i8> %res4, i8 %a0, i32 5
  %res6 = insertelement <32 x i8> %res5, i8 %a0, i32 6
  %res7 = insertelement <32 x i8> %res6, i8 %a0, i32 7
  %res8 = insertelement <32 x i8> %res7, i8 %a0, i32 8
  %res9 = insertelement <32 x i8> %res8, i8 %a0, i32 9
  %res10 = insertelement <32 x i8> %res9, i8 %a0, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
  %res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1
  %res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2
  %res3 = insertelement <16 x i16> %res2, i16 %a0, i32 3
  %res4 = insertelement <16 x i16> %res3, i16 %a0, i32 4
  %res5 = insertelement <16 x i16> %res4, i16 %a0, i32 5
  %res6 = insertelement <16 x i16> %res5, i16 %a0, i32 6
  %res7 = insertelement <16 x i16> %res6, i16 %a0, i32 7
  %res8 = insertelement <16 x i16> %res7, i16 %a0, i32 8
  %res9 = insertelement <16 x i16> %res8, i16 %a0, i32 9
  %res10 = insertelement <16 x i16> %res9, i16 %a0, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}

define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
; X86-LABEL: test_mm256_set1_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a0, i32 1
  %res2 = insertelement <4 x double> %res1, double %a0, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
; X86-LABEL: test_mm256_set1_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_ps:
; X64:       # %bb.0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a0, i32 1
  %res2 = insertelement <8 x float> %res1, float %a0, i32 2
  %res3 = insertelement <8 x float> %res2, float %a0, i32 3
  %res4 = insertelement <8 x float> %res3, float %a0, i32 4
  %res5 = insertelement <8 x float> %res4, float %a0, i32 5
  %res6 = insertelement <8 x float> %res5, float %a0, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}

define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_setr_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrb $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $4, %r8d, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $5, %r9d, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a0 , i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a1 , i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a2 , i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a3 , i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a4 , i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a5 , i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a6 , i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a7 , i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a8 , i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a9 , i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a10, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_setr_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi16:
; X64:       # %bb.0:
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a0 , i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a1 , i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a2 , i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a3 , i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a4 , i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a5 , i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a6 , i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a7 , i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a8 , i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a9 , i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a10, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_setr_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %r8d, %xmm0
; X64-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_setr_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rcx, %xmm0
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vmovq %rdi, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
  ret <4 x i64> %res3
}

define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128i:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_setr_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a1, i32 1
  %res2 = insertelement <4 x double> %res1, double %a2, i32 2
  %res3 = insertelement <4 x double> %res2, double %a3, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_setr_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a1, i32 1
  %res2 = insertelement <8 x float> %res1, float %a2, i32 2
  %res3 = insertelement <8 x float> %res2, float %a3, i32 3
  %res4 = insertelement <8 x float> %res3, float %a4, i32 4
  %res5 = insertelement <8 x float> %res4, float %a5, i32 5
  %res6 = insertelement <8 x float> %res5, float %a6, i32 6
  %res7 = insertelement <8 x float> %res6, float %a7, i32 7
  ret <8 x float> %res7
}

define <4 x double> @test_mm256_setzero_pd() nounwind {
; CHECK-LABEL: test_mm256_setzero_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> zeroinitializer
}

define <8 x float> @test_mm256_setzero_ps() nounwind {
; CHECK-LABEL: test_mm256_setzero_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> zeroinitializer
}

define <4 x i64> @test_mm256_setzero_si256() nounwind {
; CHECK-LABEL: test_mm256_setzero_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> zeroinitializer
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
  ret <4 x double> %0
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1

define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
  ret <8 x float> %0
}

declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1

define void @test_mm256_store_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_store_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 32
  ret void
}

define void @test_mm256_store_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_store_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 32
  ret void
}

define void @test_mm256_store_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_store_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 32
  ret void
}

define void @test_mm256_storeu_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu2_m128(ptr %a0, ptr %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %lo, ptr %a0, align 1
  %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_storeu2_m128d(ptr %a0, ptr %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x double> %lo, ptr %a0, align 1
  %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x double> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_storeu2_m128i(ptr %a0, ptr %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x i64> %lo, ptr %a0, align 1
  %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x i64> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_stream_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_stream_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_stream_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_stream_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <8 x float> %a0, %a1
  ret <8 x float> %res
}

define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x double> @test_mm256_undefined_pd() nounwind {
; CHECK-LABEL: test_mm256_undefined_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> undef
}

define <8 x float> @test_mm256_undefined_ps() nounwind {
; CHECK-LABEL: test_mm256_undefined_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> undef
}

define <4 x i64> @test_mm256_undefined_si256() nounwind {
; CHECK-LABEL: test_mm256_undefined_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> undef
}

define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = xor <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = xor <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define void @test_mm256_zeroall() nounwind {
; CHECK-LABEL: test_mm256_zeroall:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroall
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroall()
  ret void
}

declare void @llvm.x86.avx.vzeroall() nounwind readnone

define void @test_mm256_zeroupper() nounwind {
; CHECK-LABEL: test_mm256_zeroupper:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroupper()
  ret void
}

declare void @llvm.x86.avx.vzeroupper() nounwind readnone

define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}