; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
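
; As a rough illustration only (not part of the test), the clang-level C for
; the first test below is assumed to look like this, with _mm256_add_pd
; compiling down to a plain <4 x double> fadd:
;
;   #include <immintrin.h>
;   __m256d test_mm256_add_pd(__m256d a0, __m256d a1) {
;     return _mm256_add_pd(a0, a1);
;   }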

define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = and <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = and <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %3, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <8 x i32> %3, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}
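
; Note: the andnot tests expect fast-isel to materialize the all-ones vector
; for the NOT via the vcmptrueps-on-a-zeroed-register idiom rather than a
; constant-pool load, before the vxorps/vandps sequence.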

define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_pd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <2 x double>, <2 x double>* %a0
  %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ps:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a0
  %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_sd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_sd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load double, double* %a0
  %ins0 = insertelement <4 x double> undef, double %ld, i32 0
  %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
  %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
  %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
  ret <4 x double> %ins3
}

define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
; X86-LABEL: test_mm_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
  %ld = load float, float* %a0
  %ins0 = insertelement <4 x float> undef, float %ld, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
  ret <4 x float> %ins3
}

define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load float, float* %a0
  %ins0 = insertelement <8 x float> undef, float %ld, i32 0
  %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
  %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
  %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
  %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
  %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
  ret <8 x float> %ins7
}

define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <8 x float>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x double> %res
}

define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd256_pd128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %res
}

define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps256_ps128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_si128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
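
; The round.pd/round.ps immediates used by the ceil, floor and round tests
; follow the SSE4.1 rounding-control encoding: 2 = _MM_FROUND_CEIL,
; 1 = _MM_FROUND_FLOOR, 4 = _MM_FROUND_CUR_DIRECTION.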

define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
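
; Each cmp test passes predicate 13 (_CMP_GE_OS), which is why the assertions
; all expect the vcmpge* instruction forms.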

define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = sitofp <4 x i32> %arg0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = sitofp <8 x i32> %arg0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fpext <4 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_dp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrb $15, %xmm0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %ext = extractelement <32 x i8> %arg0, i32 31
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
; CHECK-NEXT:    movzwl %ax, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %ext = extractelement <16 x i16> %arg0, i32 11
  %res = zext i16 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vextractps $1, %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = extractelement <8 x i32> %arg0, i32 5
  ret i32 %res
}

define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
; X86-LABEL: test_mm256_extract_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vextractps $2, %xmm0, %eax
; X86-NEXT:    vextractps $3, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = extractelement <4 x i64> %a0, i32 3
  ret i64 %res
}
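
; On the 32-bit target there is no 64-bit GPR, so the i64 extract above is
; expected to split into two 32-bit vextractps into the eax/edx pair.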

define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %res
}

define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x float> %res
}

define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
  ret <8 x float> %res
}

define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
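
; The 256-bit horizontal ops work within 128-bit lanes: per lane, the low
; half of the result comes from adjacent pairs of the first source and the
; high half from adjacent pairs of the second source.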

define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrw $6, %edi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
; X86-LABEL: test_mm256_lddqu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vlddqu (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_lddqu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vlddqu (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64>* %a0 to i8*
  %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone

define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
; X86-LABEL: test_mm256_load_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  %res = load <4 x double>, <4 x double>* %arg0, align 32
  ret <4 x double> %res
}

define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
; X86-LABEL: test_mm256_load_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  %res = load <8 x float>, <8 x float>* %arg0, align 32
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
; X86-LABEL: test_mm256_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, <4 x i64>* %a0, align 32
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
; X86-LABEL: test_mm256_loadu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  %res = load <4 x double>, <4 x double>* %arg0, align 1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
; X86-LABEL: test_mm256_loadu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  %res = load <8 x float>, <8 x float>* %arg0, align 1
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
; X86-LABEL: test_mm256_loadu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, <4 x i64>* %a0, align 1
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %hi4 = load <4 x float>, <4 x float>* %arg0, align 1
  %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %arg1 = bitcast float* %a1 to <4 x float>*
  %lo4 = load <4 x float>, <4 x float>* %arg1, align 1
  %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  %hi2 = load <2 x double>, <2 x double>* %arg0, align 1
  %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %arg1 = bitcast double* %a1 to <2 x double>*
  %lo2 = load <2 x double>, <2 x double>* %arg1, align 1
  %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <2 x i64>*
  %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1
  %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %arg1 = bitcast i64* %a1 to <2 x i64>*
  %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1
  %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}
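
; The loadu2 tests assemble a 256-bit value from two unaligned 128-bit loads:
; the %a1 half forms the low lane and the %a0 half the high lane, hence the
; vmovups from one pointer plus a vinsertf128 from the other.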

define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone

define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readnone

define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone

define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readnone
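
; vmaskmovps/vmaskmovpd load only the elements whose mask element has its
; sign bit set and zero the remaining elements; masked-off lanes do not fault.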

define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind readnone

define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind readnone

define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind readnone

define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_max_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_max_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_min_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_min_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_movedup_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovmskpd %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovmskps %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_mul_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fmul <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_mul_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fmul <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = or <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = or <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %res
}

define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test2_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x i64> %a0 to <8 x i32>
  %2 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
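
; vperm2f128 selects whole 128-bit lanes via its immediate; setting bit 3 or
; bit 7 zeroes the corresponding destination lane (the "zero,zero" lane in the
; pd test), and a selection that simply reproduces %a1 folds to a vmovaps.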

define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm_permutevar_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone

define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permutevar_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone

define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm_permutevar_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone

define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permutevar_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone

define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_rcp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrcpps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
  ret <8 x float> %res
}

define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_rsqrt_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_set_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd %ecx, %xmm1
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %r9b, %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %r8b, %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %dl, %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    vmovd %ecx, %xmm1
; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a31, i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a30, i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a29, i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a28, i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a27, i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a26, i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a25, i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a24, i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a23, i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a22, i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a21, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_set_epi16:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi16:
; X64: # %bb.0:
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0
; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <16 x i16> undef, i16 %a15, i32 0
%res1 = insertelement <16 x i16> %res0, i16 %a14, i32 1
%res2 = insertelement <16 x i16> %res1, i16 %a13, i32 2
%res3 = insertelement <16 x i16> %res2, i16 %a12, i32 3
%res4 = insertelement <16 x i16> %res3, i16 %a11, i32 4
%res5 = insertelement <16 x i16> %res4, i16 %a10, i32 5
%res6 = insertelement <16 x i16> %res5, i16 %a9 , i32 6
%res7 = insertelement <16 x i16> %res6, i16 %a8 , i32 7
%res8 = insertelement <16 x i16> %res7, i16 %a7 , i32 8
%res9 = insertelement <16 x i16> %res8, i16 %a6 , i32 9
%res10 = insertelement <16 x i16> %res9, i16 %a5 , i32 10
%res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
%res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
%res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
%res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
%res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
%res = bitcast <16 x i16> %res15 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_set_epi32:
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi32:
; X64: # %bb.0:
; X64-NEXT: vmovd %ecx, %xmm0
; X64-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; X64-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1
; X64-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
%res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
%res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
%res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
%res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
%res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
%res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
%res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
%res = bitcast <8 x i32> %res7 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_set_epi64x:
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi64x:
; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vmovq %rsi, %xmm1
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %rcx, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
%res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
%res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
%res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
ret <4 x i64> %res3
}

define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %res
}

define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128d:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x double> %a0 to <4 x float>
%arg1 = bitcast <2 x double> %a1 to <4 x float>
%res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bc = bitcast <8 x float> %res to <4 x double>
ret <4 x double> %bc
}

define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128i:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x float>
%arg1 = bitcast <2 x i64> %a1 to <4 x float>
%res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bc = bitcast <8 x float> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_set_pd:
; X86: # %bb.0:
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set_pd:
; X64: # %bb.0:
; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x double> undef, double %a3, i32 0
%res1 = insertelement <4 x double> %res0, double %a2, i32 1
%res2 = insertelement <4 x double> %res1, double %a1, i32 2
%res3 = insertelement <4 x double> %res2, double %a0, i32 3
ret <4 x double> %res3
}

define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_set_ps:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set_ps:
; X64: # %bb.0:
; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x float> undef, float %a7, i32 0
%res1 = insertelement <8 x float> %res0, float %a6, i32 1
%res2 = insertelement <8 x float> %res1, float %a5, i32 2
%res3 = insertelement <8 x float> %res2, float %a4, i32 3
%res4 = insertelement <8 x float> %res3, float %a3, i32 4
%res5 = insertelement <8 x float> %res4, float %a2, i32 5
%res6 = insertelement <8 x float> %res5, float %a1, i32 6
%res7 = insertelement <8 x float> %res6, float %a0, i32 7
ret <8 x float> %res7
}

define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi8:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <32 x i8> undef, i8 %a0, i32 0
%res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1
%res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2
%res3 = insertelement <32 x i8> %res2, i8 %a0, i32 3
%res4 = insertelement <32 x i8> %res3, i8 %a0, i32 4
%res5 = insertelement <32 x i8> %res4, i8 %a0, i32 5
%res6 = insertelement <32 x i8> %res5, i8 %a0, i32 6
%res7 = insertelement <32 x i8> %res6, i8 %a0, i32 7
%res8 = insertelement <32 x i8> %res7, i8 %a0, i32 8
%res9 = insertelement <32 x i8> %res8, i8 %a0, i32 9
%res10 = insertelement <32 x i8> %res9, i8 %a0, i32 10
%res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
%res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
%res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
%res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
%res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
%res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
%res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
%res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
%res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
%res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
%res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
%res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
%res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
%res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
%res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
%res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
%res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
%res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
%res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
%res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
%res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
%res = bitcast <32 x i8> %res31 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi16:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi16:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
%res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1
%res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2
%res3 = insertelement <16 x i16> %res2, i16 %a0, i32 3
%res4 = insertelement <16 x i16> %res3, i16 %a0, i32 4
%res5 = insertelement <16 x i16> %res4, i16 %a0, i32 5
%res6 = insertelement <16 x i16> %res5, i16 %a0, i32 6
%res7 = insertelement <16 x i16> %res6, i16 %a0, i32 7
%res8 = insertelement <16 x i16> %res7, i16 %a0, i32 8
%res9 = insertelement <16 x i16> %res8, i16 %a0, i32 9
%res10 = insertelement <16 x i16> %res9, i16 %a0, i32 10
%res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
%res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
%res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
%res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
%res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
%res = bitcast <16 x i16> %res15 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi32:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi32:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
%res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
%res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
%res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
%res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
%res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
%res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
%res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
%res = bitcast <8 x i32> %res7 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi64x:
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi64x:
; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
%res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
%res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
%res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
ret <4 x i64> %res3
}

define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
; X86-LABEL: test_mm256_set1_pd:
; X86: # %bb.0:
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_pd:
; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x double> undef, double %a0, i32 0
%res1 = insertelement <4 x double> %res0, double %a0, i32 1
%res2 = insertelement <4 x double> %res1, double %a0, i32 2
%res3 = insertelement <4 x double> %res2, double %a0, i32 3
ret <4 x double> %res3
}

define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
; X86-LABEL: test_mm256_set1_ps:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_ps:
; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x float> undef, float %a0, i32 0
%res1 = insertelement <8 x float> %res0, float %a0, i32 1
%res2 = insertelement <8 x float> %res1, float %a0, i32 2
%res3 = insertelement <8 x float> %res2, float %a0, i32 3
%res4 = insertelement <8 x float> %res3, float %a0, i32 4
%res5 = insertelement <8 x float> %res4, float %a0, i32 5
%res6 = insertelement <8 x float> %res5, float %a0, i32 6
%res7 = insertelement <8 x float> %res6, float %a0, i32 7
ret <8 x float> %res7
}

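; NOTE: With only AVX1 available, the set1 splats above are built in the low
; 128 bits (vpshufb for bytes, vpshuflw+vpshufd for words, vpshufd/vpermilps
; for dwords/floats, vmovddup for doubles) and then mirrored into the high
; half with vinsertf128; the AVX2 vpbroadcast* forms are not used here.
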
define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_setr_epi8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovd %ecx, %xmm1
; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi8:
; X64: # %bb.0:
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: movzbl %dil, %esi
; X64-NEXT: vmovd %esi, %xmm1
; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl %dl, %eax
; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl %r9b, %eax
; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <32 x i8> undef, i8 %a0 , i32 0
%res1 = insertelement <32 x i8> %res0, i8 %a1 , i32 1
%res2 = insertelement <32 x i8> %res1, i8 %a2 , i32 2
%res3 = insertelement <32 x i8> %res2, i8 %a3 , i32 3
%res4 = insertelement <32 x i8> %res3, i8 %a4 , i32 4
%res5 = insertelement <32 x i8> %res4, i8 %a5 , i32 5
%res6 = insertelement <32 x i8> %res5, i8 %a6 , i32 6
%res7 = insertelement <32 x i8> %res6, i8 %a7 , i32 7
%res8 = insertelement <32 x i8> %res7, i8 %a8 , i32 8
%res9 = insertelement <32 x i8> %res8, i8 %a9 , i32 9
%res10 = insertelement <32 x i8> %res9, i8 %a10, i32 10
%res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
%res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
%res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
%res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
%res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
%res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
%res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
%res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
%res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
%res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
%res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
%res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
%res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
%res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
%res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
%res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
%res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
%res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
%res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
%res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
%res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
%res = bitcast <32 x i8> %res31 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_setr_epi16:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi16:
; X64: # %bb.0:
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; X64-NEXT: vmovd %edi, %xmm1
; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
; X64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1
; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <16 x i16> undef, i16 %a0 , i32 0
%res1 = insertelement <16 x i16> %res0, i16 %a1 , i32 1
%res2 = insertelement <16 x i16> %res1, i16 %a2 , i32 2
%res3 = insertelement <16 x i16> %res2, i16 %a3 , i32 3
%res4 = insertelement <16 x i16> %res3, i16 %a4 , i32 4
%res5 = insertelement <16 x i16> %res4, i16 %a5 , i32 5
%res6 = insertelement <16 x i16> %res5, i16 %a6 , i32 6
%res7 = insertelement <16 x i16> %res6, i16 %a7 , i32 7
%res8 = insertelement <16 x i16> %res7, i16 %a8 , i32 8
%res9 = insertelement <16 x i16> %res8, i16 %a9 , i32 9
%res10 = insertelement <16 x i16> %res9, i16 %a10, i32 10
%res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
%res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
%res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
%res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
%res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
%res = bitcast <16 x i16> %res15 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_setr_epi32:
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi32:
; X64: # %bb.0:
; X64-NEXT: vmovd %r8d, %xmm0
; X64-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
; X64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT: vmovd %edi, %xmm1
; X64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
; X64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
; X64-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
%res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
%res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
%res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
%res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
%res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
%res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
%res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
%res = bitcast <8 x i32> %res7 to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_setr_epi64x:
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi64x:
; X64: # %bb.0:
; X64-NEXT: vmovq %rcx, %xmm0
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT: vmovq %rsi, %xmm1
; X64-NEXT: vmovq %rdi, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
%res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
%res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
%res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
ret <4 x i64> %res3
}

define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %res
}

define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128d:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x double> %a0 to <4 x float>
%arg1 = bitcast <2 x double> %a1 to <4 x float>
%res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bc = bitcast <8 x float> %res to <4 x double>
ret <4 x double> %bc
}

define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128i:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x float>
%arg1 = bitcast <2 x i64> %a1 to <4 x float>
%res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bc = bitcast <8 x float> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_setr_pd:
; X86: # %bb.0:
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_setr_pd:
; X64: # %bb.0:
; X64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x double> undef, double %a0, i32 0
%res1 = insertelement <4 x double> %res0, double %a1, i32 1
%res2 = insertelement <4 x double> %res1, double %a2, i32 2
%res3 = insertelement <4 x double> %res2, double %a3, i32 3
ret <4 x double> %res3
}

define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_setr_ps:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_setr_ps:
; X64: # %bb.0:
; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x float> undef, float %a0, i32 0
%res1 = insertelement <8 x float> %res0, float %a1, i32 1
%res2 = insertelement <8 x float> %res1, float %a2, i32 2
%res3 = insertelement <8 x float> %res2, float %a3, i32 3
%res4 = insertelement <8 x float> %res3, float %a4, i32 4
%res5 = insertelement <8 x float> %res4, float %a5, i32 5
%res6 = insertelement <8 x float> %res5, float %a6, i32 6
%res7 = insertelement <8 x float> %res6, float %a7, i32 7
ret <8 x float> %res7
}

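; NOTE: All three setzero tests below lower to a single 128-bit vxorps:
; a VEX-encoded write to an xmm register implicitly zeroes bits 255:128 of
; the corresponding ymm register, so no 256-bit zeroing instruction is needed.
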
define <4 x double> @test_mm256_setzero_pd() nounwind {
; CHECK-LABEL: test_mm256_setzero_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
ret <4 x double> zeroinitializer
}

define <8 x float> @test_mm256_setzero_ps() nounwind {
; CHECK-LABEL: test_mm256_setzero_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
ret <8 x float> zeroinitializer
}

define <4 x i64> @test_mm256_setzero_si256() nounwind {
; CHECK-LABEL: test_mm256_setzero_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
ret <4 x i64> zeroinitializer
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x double> %res
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
ret <8 x float> %res
}

define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
ret <4 x double> %0
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1

define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
ret <8 x float> %0
}

declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1

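; NOTE: The store tests that follow distinguish alignment in the IR: aligned
; stores (align 32) select vmovaps, unaligned stores (align 1) select vmovups,
; and the !nontemporal stores select vmovntps; every variant ends with
; vzeroupper because the function used ymm registers before returning.
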
define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_store_pd:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_store_pd:
; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <4 x double>*
store <4 x double> %a1, <4 x double>* %arg0, align 32
ret void
}

define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_store_ps:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_store_ps:
; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <8 x float>*
store <8 x float> %a1, <8 x float>* %arg0, align 32
ret void
}

define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_store_si256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_store_si256:
; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
store <4 x i64> %a1, <4 x i64>* %a0, align 32
ret void
}

define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_pd:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_storeu_pd:
; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <4 x double>*
store <4 x double> %a1, <4 x double>* %arg0, align 1
ret void
}

define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_ps:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_storeu_ps:
; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <8 x float>*
store <8 x float> %a1, <8 x float>* %arg0, align 1
ret void
}

define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_si256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_storeu_si256:
; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
store <4 x i64> %a1, <4 x i64>* %a0, align 1
ret void
}

define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovups %xmm0, (%ecx)
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vmovups %xmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_storeu2_m128:
; X64: # %bb.0:
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vmovups %xmm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <4 x float>*
%lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x float> %lo, <4 x float>* %arg0, align 1
%arg1 = bitcast float* %a1 to <4 x float>*
%hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
store <4 x float> %hi, <4 x float>* %arg1, align 1
ret void
}

define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128d:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovups %xmm0, (%ecx)
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vmovups %xmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_storeu2_m128d:
; X64: # %bb.0:
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vmovups %xmm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <2 x double>*
%lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
store <2 x double> %lo, <2 x double>* %arg0, align 1
%arg1 = bitcast double* %a1 to <2 x double>*
%hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
store <2 x double> %hi, <2 x double>* %arg1, align 1
ret void
}

define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128i:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovups %xmm0, (%ecx)
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vmovups %xmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_storeu2_m128i:
; X64: # %bb.0:
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vmovups %xmm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast <2 x i64>* %a0 to <2 x i64>*
%lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
store <2 x i64> %lo, <2 x i64>* %arg0, align 1
%arg1 = bitcast <2 x i64>* %a1 to <2 x i64>*
%hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
store <2 x i64> %hi, <2 x i64>* %arg1, align 1
ret void
}

define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_stream_pd:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovntps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_stream_pd:
; X64: # %bb.0:
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <4 x double>*
store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0
ret void
}

define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_stream_ps:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovntps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_stream_ps:
; X64: # %bb.0:
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <8 x float>*
store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0
ret void
}

define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_stream_si256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovntps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_stream_si256:
; X64: # %bb.0:
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0
ret void
}

define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = fsub <4 x double> %a0, %a1
ret <4 x double> %res
}

define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = fsub <8 x float> %a0, %a1
ret <8 x float> %res
}

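; NOTE: vtestps/vtestpd and vptest set only CF and ZF. The testc/testz/testnzc
; intrinsics below materialize the result with setb (CF=1), sete (ZF=1) and
; seta (CF=0 and ZF=0) respectively, after clearing %eax with xorl.
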
define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %xmm1, %xmm0
; CHECK-NEXT: setb %al
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %ymm1, %ymm0
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %xmm1, %xmm0
; CHECK-NEXT: setb %al
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %ymm1, %ymm0
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vptest %ymm1, %ymm0
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %xmm1, %xmm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %ymm1, %ymm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %xmm1, %xmm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %ymm1, %ymm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vptest %ymm1, %ymm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %xmm1, %xmm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestpd %ymm1, %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %xmm1, %xmm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vtestps %ymm1, %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vptest %ymm1, %ymm0
; CHECK-NEXT: sete %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
ret i32 %res
}
declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone

define <2 x double> @test_mm_undefined_pd() nounwind {
; CHECK-LABEL: test_mm_undefined_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
ret <2 x double> undef
}

define <4 x double> @test_mm256_undefined_pd() nounwind {
; CHECK-LABEL: test_mm256_undefined_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
ret <4 x double> undef
}

define <8 x float> @test_mm256_undefined_ps() nounwind {
; CHECK-LABEL: test_mm256_undefined_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
ret <8 x float> undef
}

define <4 x i64> @test_mm256_undefined_si256() nounwind {
; CHECK-LABEL: test_mm256_undefined_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
ret <4 x i64> undef
}
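
; The 256-bit unpack instructions interleave within each 128-bit lane
; independently, so the shuffle masks repeat the SSE pattern once per lane.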
define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x double> %res
}

define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x float> %res
}

define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x double> %res
}

define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x float> %res
}

define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
%res = xor <4 x i64> %1, %2
%bc = bitcast <4 x i64> %res to <4 x double>
ret <4 x double> %bc
}

define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = bitcast <8 x float> %a0 to <8 x i32>
%2 = bitcast <8 x float> %a1 to <8 x i32>
%res = xor <8 x i32> %1, %2
%bc = bitcast <8 x i32> %res to <8 x float>
ret <8 x float> %bc
}

define void @test_mm256_zeroall() nounwind {
; CHECK-LABEL: test_mm256_zeroall:
; CHECK: # %bb.0:
; CHECK-NEXT: vzeroall
; CHECK-NEXT: ret{{[l|q]}}
call void @llvm.x86.avx.vzeroall()
ret void
}

declare void @llvm.x86.avx.vzeroall() nounwind readnone

define void @test_mm256_zeroupper() nounwind {
; CHECK-LABEL: test_mm256_zeroupper:
; CHECK: # %bb.0:
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
call void @llvm.x86.avx.vzeroupper()
ret void
}

declare void @llvm.x86.avx.vzeroupper() nounwind readnone
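
; The 128-to-256-bit zero-extends need no explicit zeroing of the upper half:
; a VEX-encoded vmovaps that writes an XMM register already clears bits
; 255:128 of the corresponding YMM register.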
define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextpd128_pd256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %res
}

define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextps128_ps256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %res
}

define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextsi128_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %res
}