1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %sub = sub <32 x i8> zeroinitializer, %arg
  %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  %res = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %sub = sub <16 x i16> zeroinitializer, %arg
  %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  %res = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %sub = sub <8 x i32> zeroinitializer, %arg
  %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  %res = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}
define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}
define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}
define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}
define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}
define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
869 define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
870 ; X86-LABEL: test_mm_mask_i32gather_epi64:
872 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
873 ; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
876 ; X64-LABEL: test_mm_mask_i32gather_epi64:
878 ; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
880 %arg1 = bitcast i64 *%a1 to i8*
881 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
882 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
886 define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
887 ; X86-LABEL: test_mm256_i32gather_epi64:
889 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
890 ; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
891 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
892 ; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
893 ; X86-NEXT: vmovdqa %ymm1, %ymm0
896 ; X64-LABEL: test_mm256_i32gather_epi64:
898 ; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
899 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
900 ; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
901 ; X64-NEXT: vmovdqa %ymm1, %ymm0
903 %arg0 = bitcast i64 *%a0 to i8*
904 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
905 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
908 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
910 define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
911 ; X86-LABEL: test_mm256_mask_i32gather_epi64:
913 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
914 ; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
917 ; X64-LABEL: test_mm256_mask_i32gather_epi64:
919 ; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
921 %arg1 = bitcast i64 *%a1 to i8*
922 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
923 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
927 define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
928 ; X86-LABEL: test_mm_i32gather_pd:
930 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
931 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
932 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
933 ; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
934 ; X86-NEXT: vmovapd %xmm1, %xmm0
937 ; X64-LABEL: test_mm_i32gather_pd:
939 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
940 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
941 ; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
942 ; X64-NEXT: vmovapd %xmm1, %xmm0
944 %arg0 = bitcast double *%a0 to i8*
945 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
946 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
947 %sext = sext <2 x i1> %cmp to <2 x i64>
948 %mask = bitcast <2 x i64> %sext to <2 x double>
949 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
950 ret <2 x double> %res
952 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
954 define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
955 ; X86-LABEL: test_mm_mask_i32gather_pd:
957 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
958 ; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
961 ; X64-LABEL: test_mm_mask_i32gather_pd:
963 ; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
965 %arg1 = bitcast double *%a1 to i8*
966 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
967 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
968 ret <2 x double> %res
971 define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
972 ; X86-LABEL: test_mm256_i32gather_pd:
974 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
975 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
976 ; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
977 ; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
978 ; X86-NEXT: vmovapd %ymm1, %ymm0
981 ; X64-LABEL: test_mm256_i32gather_pd:
983 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
984 ; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
985 ; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
986 ; X64-NEXT: vmovapd %ymm1, %ymm0
988 %arg0 = bitcast double *%a0 to i8*
989 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
990 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
991 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
992 ret <4 x double> %res
994 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
996 define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
997 ; X86-LABEL: test_mm256_mask_i32gather_pd:
999 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1000 ; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
1003 ; X64-LABEL: test_mm256_mask_i32gather_pd:
1005 ; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1007 %arg1 = bitcast double *%a1 to i8*
1008 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1009 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1010 ret <4 x double> %res
1013 define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1014 ; X86-LABEL: test_mm_i32gather_ps:
1016 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1017 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1018 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1019 ; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1020 ; X86-NEXT: vmovaps %xmm1, %xmm0
1023 ; X64-LABEL: test_mm_i32gather_ps:
1025 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1026 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1027 ; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1028 ; X64-NEXT: vmovaps %xmm1, %xmm0
1030 %arg0 = bitcast float *%a0 to i8*
1031 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1032 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1033 %sext = sext <4 x i1> %cmp to <4 x i32>
1034 %mask = bitcast <4 x i32> %sext to <4 x float>
1035 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1036 ret <4 x float> %call
1038 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1040 define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1041 ; X86-LABEL: test_mm_mask_i32gather_ps:
1043 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1044 ; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1047 ; X64-LABEL: test_mm_mask_i32gather_ps:
1049 ; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1051 %arg1 = bitcast float *%a1 to i8*
1052 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1053 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1054 ret <4 x float> %call
1057 define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1058 ; X86-LABEL: test_mm256_i32gather_ps:
1060 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1061 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1062 ; X86-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
1063 ; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1064 ; X86-NEXT: vmovaps %ymm1, %ymm0
1067 ; X64-LABEL: test_mm256_i32gather_ps:
1069 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1070 ; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
1071 ; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1072 ; X64-NEXT: vmovaps %ymm1, %ymm0
1074 %arg0 = bitcast float *%a0 to i8*
1075 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1076 %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1077 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1078 ret <8 x float> %call
1080 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1082 define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1083 ; X86-LABEL: test_mm256_mask_i32gather_ps:
1085 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1086 ; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1089 ; X64-LABEL: test_mm256_mask_i32gather_ps:
1091 ; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1093 %arg1 = bitcast float *%a1 to i8*
1094 %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1095 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1096 ret <8 x float> %call
1099 define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1100 ; X86-LABEL: test_mm_i64gather_epi32:
1102 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1103 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1104 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1105 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1106 ; X86-NEXT: vmovdqa %xmm1, %xmm0
1109 ; X64-LABEL: test_mm_i64gather_epi32:
1111 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1112 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1113 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1114 ; X64-NEXT: vmovdqa %xmm1, %xmm0
1116 %arg0 = bitcast i32 *%a0 to i8*
1117 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1118 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1119 %bc = bitcast <4 x i32> %call to <2 x i64>
1122 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1124 define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1125 ; X86-LABEL: test_mm_mask_i64gather_epi32:
1127 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1128 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1131 ; X64-LABEL: test_mm_mask_i64gather_epi32:
1133 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1135 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1136 %arg1 = bitcast i32 *%a1 to i8*
1137 %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1138 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1139 %bc = bitcast <4 x i32> %call to <2 x i64>
1143 define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1144 ; X86-LABEL: test_mm256_i64gather_epi32:
1146 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1147 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1148 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1149 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1150 ; X86-NEXT: vmovdqa %xmm1, %xmm0
1151 ; X86-NEXT: vzeroupper
1154 ; X64-LABEL: test_mm256_i64gather_epi32:
1156 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1157 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1158 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1159 ; X64-NEXT: vmovdqa %xmm1, %xmm0
1160 ; X64-NEXT: vzeroupper
1162 %arg0 = bitcast i32 *%a0 to i8*
1163 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1164 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1165 %bc = bitcast <4 x i32> %call to <2 x i64>
1168 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1170 define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1171 ; X86-LABEL: test_mm256_mask_i64gather_epi32:
1173 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1174 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1175 ; X86-NEXT: vzeroupper
1178 ; X64-LABEL: test_mm256_mask_i64gather_epi32:
1180 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1181 ; X64-NEXT: vzeroupper
1183 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1184 %arg1 = bitcast i32 *%a1 to i8*
1185 %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1186 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1187 %bc = bitcast <4 x i32> %call to <2 x i64>
1191 define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1192 ; X86-LABEL: test_mm_i64gather_epi64:
1194 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1195 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1196 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1197 ; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1198 ; X86-NEXT: vmovdqa %xmm1, %xmm0
1201 ; X64-LABEL: test_mm_i64gather_epi64:
1203 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1204 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1205 ; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1206 ; X64-NEXT: vmovdqa %xmm1, %xmm0
1208 %arg0 = bitcast i64 *%a0 to i8*
1209 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1212 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1214 define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1215 ; X86-LABEL: test_mm_mask_i64gather_epi64:
1217 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1218 ; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1221 ; X64-LABEL: test_mm_mask_i64gather_epi64:
1223 ; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1225 %arg1 = bitcast i64 *%a1 to i8*
1226 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1230 define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1231 ; X86-LABEL: test_mm256_i64gather_epi64:
1233 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1234 ; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1235 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1236 ; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1237 ; X86-NEXT: vmovdqa %ymm1, %ymm0
1240 ; X64-LABEL: test_mm256_i64gather_epi64:
1242 ; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1243 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1244 ; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1245 ; X64-NEXT: vmovdqa %ymm1, %ymm0
1247 %arg0 = bitcast i64 *%a0 to i8*
1248 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1251 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1253 define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1254 ; X86-LABEL: test_mm256_mask_i64gather_epi64:
1256 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1257 ; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1260 ; X64-LABEL: test_mm256_mask_i64gather_epi64:
1262 ; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1264 %arg1 = bitcast i64 *%a1 to i8*
1265 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1269 define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1270 ; X86-LABEL: test_mm_i64gather_pd:
1272 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1273 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1274 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1275 ; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1276 ; X86-NEXT: vmovapd %xmm1, %xmm0
1279 ; X64-LABEL: test_mm_i64gather_pd:
1281 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1282 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1283 ; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1284 ; X64-NEXT: vmovapd %xmm1, %xmm0
1286 %arg0 = bitcast double *%a0 to i8*
1287 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1288 %sext = sext <2 x i1> %cmp to <2 x i64>
1289 %mask = bitcast <2 x i64> %sext to <2 x double>
1290 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1291 ret <2 x double> %call
1293 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1295 define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1296 ; X86-LABEL: test_mm_mask_i64gather_pd:
1298 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1299 ; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1302 ; X64-LABEL: test_mm_mask_i64gather_pd:
1304 ; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1306 %arg1 = bitcast double *%a1 to i8*
1307 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1308 ret <2 x double> %call
1311 define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
1312 ; X86-LABEL: test_mm256_i64gather_pd:
1314 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1315 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1316 ; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
1317 ; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1318 ; X86-NEXT: vmovapd %ymm1, %ymm0
1321 ; X64-LABEL: test_mm256_i64gather_pd:
1323 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1324 ; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
1325 ; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1326 ; X64-NEXT: vmovapd %ymm1, %ymm0
1328 %arg0 = bitcast double *%a0 to i8*
1329 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1330 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1331 ret <4 x double> %call
1333 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
1335 define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
1336 ; X86-LABEL: test_mm256_mask_i64gather_pd:
1338 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1339 ; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1342 ; X64-LABEL: test_mm256_mask_i64gather_pd:
1344 ; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1346 %arg1 = bitcast i64 *%a1 to i8*
1347 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
1348 ret <4 x double> %call
1351 define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
1352 ; X86-LABEL: test_mm_i64gather_ps:
1354 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1355 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1356 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1357 ; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1358 ; X86-NEXT: vmovaps %xmm1, %xmm0
1361 ; X64-LABEL: test_mm_i64gather_ps:
1363 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1364 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1365 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1366 ; X64-NEXT: vmovaps %xmm1, %xmm0
1368 %arg0 = bitcast float *%a0 to i8*
1369 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1370 %sext = sext <4 x i1> %cmp to <4 x i32>
1371 %mask = bitcast <4 x i32> %sext to <4 x float>
1372 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
1373 ret <4 x float> %call
1375 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
1377 define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1378 ; X86-LABEL: test_mm_mask_i64gather_ps:
1380 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1381 ; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1384 ; X64-LABEL: test_mm_mask_i64gather_ps:
1386 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1388 %arg1 = bitcast float *%a1 to i8*
1389 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
1390 ret <4 x float> %call
1393 define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
1394 ; X86-LABEL: test_mm256_i64gather_ps:
1396 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1397 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1398 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1399 ; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1400 ; X86-NEXT: vmovaps %xmm1, %xmm0
1401 ; X86-NEXT: vzeroupper
1404 ; X64-LABEL: test_mm256_i64gather_ps:
1406 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1407 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1408 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1409 ; X64-NEXT: vmovaps %xmm1, %xmm0
1410 ; X64-NEXT: vzeroupper
1412 %arg0 = bitcast float *%a0 to i8*
1413 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1414 %sext = sext <4 x i1> %cmp to <4 x i32>
1415 %mask = bitcast <4 x i32> %sext to <4 x float>
1416 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
1417 ret <4 x float> %call
1419 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
1421 define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
1422 ; X86-LABEL: test_mm256_mask_i64gather_ps:
1424 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1425 ; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1426 ; X86-NEXT: vzeroupper
1429 ; X64-LABEL: test_mm256_mask_i64gather_ps:
1431 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1432 ; X64-NEXT: vzeroupper
1434 %arg1 = bitcast float *%a1 to i8*
1435 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
1436 ret <4 x float> %call
1439 define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1440 ; CHECK-LABEL: test0_mm256_inserti128_si256:
1442 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
1443 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1444 ; CHECK-NEXT: ret{{[l|q]}}
1445 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1446 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1450 define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1451 ; CHECK-LABEL: test1_mm256_inserti128_si256:
1453 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1454 ; CHECK-NEXT: ret{{[l|q]}}
1455 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1456 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1460 define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1461 ; CHECK-LABEL: test_mm256_madd_epi16:
1463 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
1464 ; CHECK-NEXT: ret{{[l|q]}}
1465 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1466 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1467 %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1468 %bc = bitcast <8 x i32> %res to <4 x i64>
1471 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1473 define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1474 ; CHECK-LABEL: test_mm256_maddubs_epi16:
1476 ; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
1477 ; CHECK-NEXT: ret{{[l|q]}}
1478 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1479 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1480 %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1481 %bc = bitcast <16 x i16> %res to <4 x i64>
1484 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1486 define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
1487 ; X86-LABEL: test_mm_maskload_epi32:
1489 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1490 ; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
1493 ; X64-LABEL: test_mm_maskload_epi32:
1495 ; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
1497 %arg0 = bitcast i32* %a0 to i8*
1498 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1499 %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
1500 %bc = bitcast <4 x i32> %call to <2 x i64>
1503 declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
1505 define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
1506 ; X86-LABEL: test_mm256_maskload_epi32:
1508 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1509 ; X86-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
1512 ; X64-LABEL: test_mm256_maskload_epi32:
1514 ; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
1516 %arg0 = bitcast i32* %a0 to i8*
1517 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1518 %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
1519 %bc = bitcast <8 x i32> %call to <4 x i64>
1522 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
1524 define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
1525 ; X86-LABEL: test_mm_maskload_epi64:
1527 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1528 ; X86-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
1531 ; X64-LABEL: test_mm_maskload_epi64:
1533 ; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
1535 %arg0 = bitcast i64* %a0 to i8*
1536 %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
1539 declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
1541 define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
1542 ; X86-LABEL: test_mm256_maskload_epi64:
1544 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1545 ; X86-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
1548 ; X64-LABEL: test_mm256_maskload_epi64:
1550 ; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
1552 %arg0 = bitcast i64* %a0 to i8*
1553 %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
1556 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
1558 define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1559 ; X86-LABEL: test_mm_maskstore_epi32:
1561 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1562 ; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
1565 ; X64-LABEL: test_mm_maskstore_epi32:
1567 ; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
1569 %arg0 = bitcast float* %a0 to i8*
1570 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1571 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1572 call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
1575 declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
1577 define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1578 ; X86-LABEL: test_mm256_maskstore_epi32:
1580 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1581 ; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
1582 ; X86-NEXT: vzeroupper
1585 ; X64-LABEL: test_mm256_maskstore_epi32:
1587 ; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
1588 ; X64-NEXT: vzeroupper
1590 %arg0 = bitcast float* %a0 to i8*
1591 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1592 %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1593 call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
1596 declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
1598 define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1599 ; X86-LABEL: test_mm_maskstore_epi64:
1601 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1602 ; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
1605 ; X64-LABEL: test_mm_maskstore_epi64:
1607 ; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
1609 %arg0 = bitcast i64* %a0 to i8*
1610 call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
1613 declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
1615 define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1616 ; X86-LABEL: test_mm256_maskstore_epi64:
1618 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1619 ; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
1620 ; X86-NEXT: vzeroupper
1623 ; X64-LABEL: test_mm256_maskstore_epi64:
1625 ; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
1626 ; X64-NEXT: vzeroupper
1628 %arg0 = bitcast i64* %a0 to i8*
1629 call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
1632 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
1634 define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1635 ; CHECK-LABEL: test_mm256_max_epi8:
1637 ; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
1638 ; CHECK-NEXT: ret{{[l|q]}}
1639 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1640 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1641 %cmp = icmp sgt <32 x i8> %arg0, %arg1
1642 %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1643 %bc = bitcast <32 x i8> %sel to <4 x i64>
1647 define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1648 ; CHECK-LABEL: test_mm256_max_epi16:
1650 ; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
1651 ; CHECK-NEXT: ret{{[l|q]}}
1652 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1653 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1654 %cmp = icmp sgt <16 x i16> %arg0, %arg1
1655 %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1656 %bc = bitcast <16 x i16> %sel to <4 x i64>
1660 define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1661 ; CHECK-LABEL: test_mm256_max_epi32:
1663 ; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
1664 ; CHECK-NEXT: ret{{[l|q]}}
1665 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1666 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1667 %cmp = icmp sgt <8 x i32> %arg0, %arg1
1668 %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1669 %bc = bitcast <8 x i32> %sel to <4 x i64>
1673 define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1674 ; CHECK-LABEL: test_mm256_max_epu8:
1676 ; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
1677 ; CHECK-NEXT: ret{{[l|q]}}
1678 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1679 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1680 %cmp = icmp ugt <32 x i8> %arg0, %arg1
1681 %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1682 %bc = bitcast <32 x i8> %sel to <4 x i64>
1686 define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1687 ; CHECK-LABEL: test_mm256_max_epu16:
1689 ; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
1690 ; CHECK-NEXT: ret{{[l|q]}}
1691 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1692 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1693 %cmp = icmp ugt <16 x i16> %arg0, %arg1
1694 %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1695 %bc = bitcast <16 x i16> %sel to <4 x i64>
1699 define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1700 ; CHECK-LABEL: test_mm256_max_epu32:
1702 ; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
1703 ; CHECK-NEXT: ret{{[l|q]}}
1704 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1705 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1706 %cmp = icmp ugt <8 x i32> %arg0, %arg1
1707 %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1708 %bc = bitcast <8 x i32> %sel to <4 x i64>
1712 define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1713 ; CHECK-LABEL: test_mm256_min_epi8:
1715 ; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
1716 ; CHECK-NEXT: ret{{[l|q]}}
1717 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1718 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1719 %cmp = icmp slt <32 x i8> %arg0, %arg1
1720 %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1721 %bc = bitcast <32 x i8> %sel to <4 x i64>
1725 define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1726 ; CHECK-LABEL: test_mm256_min_epi16:
1728 ; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
1729 ; CHECK-NEXT: ret{{[l|q]}}
1730 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1731 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1732 %cmp = icmp slt <16 x i16> %arg0, %arg1
1733 %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1734 %bc = bitcast <16 x i16> %sel to <4 x i64>
1738 define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1739 ; CHECK-LABEL: test_mm256_min_epi32:
1741 ; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
1742 ; CHECK-NEXT: ret{{[l|q]}}
1743 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1744 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1745 %cmp = icmp slt <8 x i32> %arg0, %arg1
1746 %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1747 %bc = bitcast <8 x i32> %sel to <4 x i64>
1751 define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1752 ; CHECK-LABEL: test_mm256_min_epu8:
1754 ; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
1755 ; CHECK-NEXT: ret{{[l|q]}}
1756 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1757 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1758 %cmp = icmp ult <32 x i8> %arg0, %arg1
1759 %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1760 %bc = bitcast <32 x i8> %sel to <4 x i64>
1764 define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1765 ; CHECK-LABEL: test_mm256_min_epu16:
1767 ; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
1768 ; CHECK-NEXT: ret{{[l|q]}}
1769 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1770 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1771 %cmp = icmp ult <16 x i16> %arg0, %arg1
1772 %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1773 %bc = bitcast <16 x i16> %sel to <4 x i64>
1777 define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1778 ; CHECK-LABEL: test_mm256_min_epu32:
1780 ; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
1781 ; CHECK-NEXT: ret{{[l|q]}}
1782 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1783 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1784 %cmp = icmp ult <8 x i32> %arg0, %arg1
1785 %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1786 %bc = bitcast <8 x i32> %sel to <4 x i64>
1790 define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
1791 ; CHECK-LABEL: test_mm256_movemask_epi8:
1793 ; CHECK-NEXT: vpmovmskb %ymm0, %eax
1794 ; CHECK-NEXT: vzeroupper
1795 ; CHECK-NEXT: ret{{[l|q]}}
1796 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1797 %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
1800 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
1802 define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1803 ; CHECK-LABEL: test_mm256_mpsadbw_epu8:
1805 ; CHECK-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
1806 ; CHECK-NEXT: ret{{[l|q]}}
1807 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1808 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1809 %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
1810 %bc = bitcast <16 x i16> %call to <4 x i64>
1813 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
1815 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1816 ; CHECK-LABEL: test_mm256_mul_epi32:
1818 ; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
1819 ; CHECK-NEXT: ret{{[l|q]}}
1820 %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
1821 %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
1822 %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
1823 %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
1824 %res = mul nsw <4 x i64> %A1, %B1
1827 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
1829 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1830 ; CHECK-LABEL: test_mm256_mul_epu32:
1832 ; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
1833 ; CHECK-NEXT: ret{{[l|q]}}
1834 %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1835 %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1836 %res = mul nuw <4 x i64> %A, %B
1839 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
1841 define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1842 ; CHECK-LABEL: test_mm256_mulhi_epi16:
1844 ; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
1845 ; CHECK-NEXT: ret{{[l|q]}}
1846 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1847 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1848 %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
1849 %bc = bitcast <16 x i16> %res to <4 x i64>
1852 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
1854 define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1855 ; CHECK-LABEL: test_mm256_mulhi_epu16:
1857 ; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
1858 ; CHECK-NEXT: ret{{[l|q]}}
1859 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1860 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1861 %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
1862 %bc = bitcast <16 x i16> %res to <4 x i64>
1865 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
1867 define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1868 ; CHECK-LABEL: test_mm256_mulhrs_epi16:
1870 ; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
1871 ; CHECK-NEXT: ret{{[l|q]}}
1872 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1873 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1874 %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1875 %bc = bitcast <16 x i16> %res to <4 x i64>
1878 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
1880 define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1881 ; CHECK-LABEL: test_mm256_mullo_epi16:
1883 ; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1884 ; CHECK-NEXT: ret{{[l|q]}}
1885 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1886 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1887 %res = mul <16 x i16> %arg0, %arg1
1888 %bc = bitcast <16 x i16> %res to <4 x i64>
1892 define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1893 ; CHECK-LABEL: test_mm256_mullo_epi32:
1895 ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1896 ; CHECK-NEXT: ret{{[l|q]}}
1897 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1898 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1899 %res = mul <8 x i32> %arg0, %arg1
1900 %bc = bitcast <8 x i32> %res to <4 x i64>
1904 define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1905 ; CHECK-LABEL: test_mm256_or_si256:
1907 ; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
1908 ; CHECK-NEXT: ret{{[l|q]}}
1909 %res = or <4 x i64> %a0, %a1
1913 define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1914 ; CHECK-LABEL: test_mm256_packs_epi16:
1916 ; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
1917 ; CHECK-NEXT: ret{{[l|q]}}
1918 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1919 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1920 %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
1921 %res = bitcast <32 x i8> %call to <4 x i64>
1924 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
1926 define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1927 ; CHECK-LABEL: test_mm256_packs_epi32:
1929 ; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
1930 ; CHECK-NEXT: ret{{[l|q]}}
1931 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1932 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1933 %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
1934 %res = bitcast <16 x i16> %call to <4 x i64>
1937 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
1939 define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1940 ; CHECK-LABEL: test_mm256_packus_epi16:
1942 ; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1943 ; CHECK-NEXT: ret{{[l|q]}}
1944 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1945 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1946 %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
1947 %res = bitcast <32 x i8> %call to <4 x i64>
1950 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
1952 define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1953 ; CHECK-LABEL: test_mm256_packus_epi32:
1955 ; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
1956 ; CHECK-NEXT: ret{{[l|q]}}
1957 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1958 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1959 %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
1960 %res = bitcast <16 x i16> %call to <4 x i64>
1963 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
1965 define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
1966 ; CHECK-LABEL: test_mm256_permute2x128_si256:
1968 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1969 ; CHECK-NEXT: ret{{[l|q]}}
1970 %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1973 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
1975 define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
1976 ; CHECK-LABEL: test_mm256_permute4x64_epi64:
1978 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
1979 ; CHECK-NEXT: ret{{[l|q]}}
1980 %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
1984 define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
1985 ; CHECK-LABEL: test_mm256_permute4x64_pd:
1987 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
1988 ; CHECK-NEXT: ret{{[l|q]}}
1989 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
1990 ret <4 x double> %res
1993 define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1994 ; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
1996 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
1997 ; CHECK-NEXT: ret{{[l|q]}}
1998 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1999 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2000 %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
2001 %res = bitcast <8 x i32> %call to <4 x i64>
2004 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
2006 define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
2007 ; CHECK-LABEL: test_mm256_permutevar8x32_ps:
2009 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
2010 ; CHECK-NEXT: ret{{[l|q]}}
2011 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2012 %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
2013 ret <8 x float> %res
2015 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2017 define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2018 ; CHECK-LABEL: test_mm256_sad_epu8:
2020 ; CHECK-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
2021 ; CHECK-NEXT: ret{{[l|q]}}
2022 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2023 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2024 %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2027 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2029 define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2030 ; CHECK-LABEL: test_mm256_shuffle_epi32:
2032 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2033 ; CHECK-NEXT: ret{{[l|q]}}
2034 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2035 %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2036 %res = bitcast <8 x i32> %shuf to <4 x i64>
2040 define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2041 ; CHECK-LABEL: test_mm256_shuffle_epi8:
2043 ; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0
2044 ; CHECK-NEXT: ret{{[l|q]}}
2045 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2046 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2047 %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2048 %res = bitcast <32 x i8> %shuf to <4 x i64>
2051 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
2053 define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2054 ; CHECK-LABEL: test_mm256_shufflehi_epi16:
2056 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2057 ; CHECK-NEXT: ret{{[l|q]}}
2058 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2059 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2060 %res = bitcast <16 x i16> %shuf to <4 x i64>
2064 define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2065 ; CHECK-LABEL: test_mm256_shufflelo_epi16:
2067 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2068 ; CHECK-NEXT: ret{{[l|q]}}
2069 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2070 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2071 %res = bitcast <16 x i16> %shuf to <4 x i64>
2075 define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2076 ; CHECK-LABEL: test_mm256_sign_epi8:
2078 ; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0
2079 ; CHECK-NEXT: ret{{[l|q]}}
2080 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2081 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2082 %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2083 %res = bitcast <32 x i8> %call to <4 x i64>
2086 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
2088 define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2089 ; CHECK-LABEL: test_mm256_sign_epi16:
2091 ; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0
2092 ; CHECK-NEXT: ret{{[l|q]}}
2093 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2094 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2095 %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2096 %res = bitcast <16 x i16> %call to <4 x i64>
2099 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
2101 define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2102 ; CHECK-LABEL: test_mm256_sign_epi32:
2104 ; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0
2105 ; CHECK-NEXT: ret{{[l|q]}}
2106 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2107 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2108 %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2109 %res = bitcast <8 x i32> %call to <4 x i64>
2112 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
2114 define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2115 ; CHECK-LABEL: test_mm256_sll_epi16:
2117 ; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
2118 ; CHECK-NEXT: ret{{[l|q]}}
2119 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2120 %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2121 %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2122 %bc = bitcast <16 x i16> %res to <4 x i64>
2125 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
2127 define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2128 ; CHECK-LABEL: test_mm256_sll_epi32:
2130 ; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
2131 ; CHECK-NEXT: ret{{[l|q]}}
2132 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2133 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2134 %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2135 %bc = bitcast <8 x i32> %res to <4 x i64>
2138 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
2140 define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2141 ; CHECK-LABEL: test_mm256_sll_epi64:
2143 ; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
2144 ; CHECK-NEXT: ret{{[l|q]}}
2145 %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
2148 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
2150 define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
2151 ; CHECK-LABEL: test_mm256_slli_epi16:
2153 ; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
2154 ; CHECK-NEXT: ret{{[l|q]}}
2155 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2156 %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
2157 %bc = bitcast <16 x i16> %res to <4 x i64>
2160 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
2162 define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
2163 ; CHECK-LABEL: test_mm256_slli_epi32:
2165 ; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
2166 ; CHECK-NEXT: ret{{[l|q]}}
2167 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2168 %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
2169 %bc = bitcast <8 x i32> %res to <4 x i64>
2172 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
2174 define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
2175 ; CHECK-LABEL: test_mm256_slli_epi64:
2177 ; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
2178 ; CHECK-NEXT: ret{{[l|q]}}
2179 %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
2182 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
2184 define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
2185 ; CHECK-LABEL: test_mm256_slli_si256:
2187 ; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2188 ; CHECK-NEXT: ret{{[l|q]}}
2189 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2190 %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
2191 %res = bitcast <32 x i8> %shuf to <4 x i64>
2195 define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2196 ; CHECK-LABEL: test_mm_sllv_epi32:
2198 ; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
2199 ; CHECK-NEXT: ret{{[l|q]}}
2200 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2201 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2202 %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2203 %bc = bitcast <4 x i32> %res to <2 x i64>
2206 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
2208 define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2209 ; CHECK-LABEL: test_mm256_sllv_epi32:
2211 ; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
2212 ; CHECK-NEXT: ret{{[l|q]}}
2213 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2214 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2215 %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2216 %bc = bitcast <8 x i32> %res to <4 x i64>
2219 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2221 define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2222 ; CHECK-LABEL: test_mm_sllv_epi64:
2224 ; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
2225 ; CHECK-NEXT: ret{{[l|q]}}
2226 %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
2229 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
2231 define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2232 ; CHECK-LABEL: test_mm256_sllv_epi64:
2234 ; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
2235 ; CHECK-NEXT: ret{{[l|q]}}
2236 %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2239 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2241 define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2242 ; CHECK-LABEL: test_mm256_sra_epi16:
2244 ; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
2245 ; CHECK-NEXT: ret{{[l|q]}}
2246 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2247 %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2248 %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
2249 %bc = bitcast <16 x i16> %res to <4 x i64>
2252 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
2254 define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2255 ; CHECK-LABEL: test_mm256_sra_epi32:
2257 ; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
2258 ; CHECK-NEXT: ret{{[l|q]}}
2259 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2260 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2261 %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
2262 %bc = bitcast <8 x i32> %res to <4 x i64>
2265 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
2267 define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
2268 ; CHECK-LABEL: test_mm256_srai_epi16:
2270 ; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
2271 ; CHECK-NEXT: ret{{[l|q]}}
2272 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2273 %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
2274 %bc = bitcast <16 x i16> %res to <4 x i64>
2277 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
2279 define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
2280 ; CHECK-LABEL: test_mm256_srai_epi32:
2282 ; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
2283 ; CHECK-NEXT: ret{{[l|q]}}
2284 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2285 %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
2286 %bc = bitcast <8 x i32> %res to <4 x i64>
2289 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
2291 define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2292 ; CHECK-LABEL: test_mm_srav_epi32:
2294 ; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
2295 ; CHECK-NEXT: ret{{[l|q]}}
2296 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2297 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2298 %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
2299 %bc = bitcast <4 x i32> %res to <2 x i64>
2302 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
2304 define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2305 ; CHECK-LABEL: test_mm256_srav_epi32:
2307 ; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
2308 ; CHECK-NEXT: ret{{[l|q]}}
2309 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2310 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2311 %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2312 %bc = bitcast <8 x i32> %res to <4 x i64>
2315 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2317 define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2318 ; CHECK-LABEL: test_mm256_srl_epi16:
2320 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
2321 ; CHECK-NEXT: ret{{[l|q]}}
2322 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2323 %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2324 %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
2325 %bc = bitcast <16 x i16> %res to <4 x i64>
2328 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
2330 define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2331 ; CHECK-LABEL: test_mm256_srl_epi32:
2333 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
2334 ; CHECK-NEXT: ret{{[l|q]}}
2335 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2336 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2337 %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
2338 %bc = bitcast <8 x i32> %res to <4 x i64>
2341 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
2343 define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2344 ; CHECK-LABEL: test_mm256_srl_epi64:
2346 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
2347 ; CHECK-NEXT: ret{{[l|q]}}
2348 %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
2351 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
2353 define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
2354 ; CHECK-LABEL: test_mm256_srli_epi16:
2356 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0
2357 ; CHECK-NEXT: ret{{[l|q]}}
2358 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2359 %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
2360 %bc = bitcast <16 x i16> %res to <4 x i64>
2363 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
2365 define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
2366 ; CHECK-LABEL: test_mm256_srli_epi32:
2368 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0
2369 ; CHECK-NEXT: ret{{[l|q]}}
2370 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2371 %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
2372 %bc = bitcast <8 x i32> %res to <4 x i64>
2375 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
2377 define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
2378 ; CHECK-LABEL: test_mm256_srli_epi64:
2380 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0
2381 ; CHECK-NEXT: ret{{[l|q]}}
2382 %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
2385 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
2387 define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
2388 ; CHECK-LABEL: test_mm256_srli_si256:
2390 ; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2391 ; CHECK-NEXT: ret{{[l|q]}}
2392 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2393 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
2394 %res = bitcast <32 x i8> %shuf to <4 x i64>
2398 define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2399 ; CHECK-LABEL: test_mm_srlv_epi32:
2401 ; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
2402 ; CHECK-NEXT: ret{{[l|q]}}
2403 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2404 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2405 %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2406 %bc = bitcast <4 x i32> %res to <2 x i64>
2409 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
2411 define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2412 ; CHECK-LABEL: test_mm256_srlv_epi32:
2414 ; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
2415 ; CHECK-NEXT: ret{{[l|q]}}
2416 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2417 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2418 %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2419 %bc = bitcast <8 x i32> %res to <4 x i64>
2422 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2424 define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2425 ; CHECK-LABEL: test_mm_srlv_epi64:
2427 ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
2428 ; CHECK-NEXT: ret{{[l|q]}}
2429 %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
2432 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
2434 define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2435 ; CHECK-LABEL: test_mm256_srlv_epi64:
2437 ; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
2438 ; CHECK-NEXT: ret{{[l|q]}}
2439 %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2442 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2444 define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
2445 ; X86-LABEL: test_mm256_stream_load_si256:
2447 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2448 ; X86-NEXT: vmovntdqa (%eax), %ymm0
2451 ; X64-LABEL: test_mm256_stream_load_si256:
2453 ; X64-NEXT: vmovntdqa (%rdi), %ymm0
2455 %arg0 = bitcast <4 x i64> *%a0 to i8*
2456 %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
2459 declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
2461 define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2462 ; CHECK-LABEL: test_mm256_sub_epi8:
2464 ; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
2465 ; CHECK-NEXT: ret{{[l|q]}}
2466 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2467 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2468 %res = sub <32 x i8> %arg0, %arg1
2469 %bc = bitcast <32 x i8> %res to <4 x i64>
2473 define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2474 ; CHECK-LABEL: test_mm256_sub_epi16:
2476 ; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0
2477 ; CHECK-NEXT: ret{{[l|q]}}
2478 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2479 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2480 %res = sub <16 x i16> %arg0, %arg1
2481 %bc = bitcast <16 x i16> %res to <4 x i64>
2485 define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2486 ; CHECK-LABEL: test_mm256_sub_epi32:
2488 ; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2489 ; CHECK-NEXT: ret{{[l|q]}}
2490 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2491 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2492 %res = sub <8 x i32> %arg0, %arg1
2493 %bc = bitcast <8 x i32> %res to <4 x i64>
2497 define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2498 ; CHECK-LABEL: test_mm256_sub_epi64:
2500 ; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
2501 ; CHECK-NEXT: ret{{[l|q]}}
2502 %res = sub <4 x i64> %a0, %a1
2506 define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2507 ; CHECK-LABEL: test_mm256_subs_epi8:
2509 ; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
2510 ; CHECK-NEXT: ret{{[l|q]}}
2511 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2512 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2513 %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2514 %bc = bitcast <32 x i8> %res to <4 x i64>
2517 declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
2519 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2520 ; CHECK-LABEL: test_mm256_subs_epi16:
2522 ; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
2523 ; CHECK-NEXT: ret{{[l|q]}}
2524 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2525 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2526 %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2527 %bc = bitcast <16 x i16> %res to <4 x i64>
2530 declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
2532 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2533 ; CHECK-LABEL: test_mm256_subs_epu8:
2535 ; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
2536 ; CHECK-NEXT: ret{{[l|q]}}
2537 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2538 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2539 %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2540 %bc = bitcast <32 x i8> %res to <4 x i64>
2543 declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
2545 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
2546 ; CHECK-LABEL: test_mm256_subs_epu16:
2548 ; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
2549 ; CHECK-NEXT: ret{{[l|q]}}
2550 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2551 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2552 %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2553 %bc = bitcast <16 x i16> %res to <4 x i64>
2556 declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
; _mm256_unpackhi_epi8: a shufflevector interleaving the high 8 bytes of each
; 128-bit lane (indices 8-15/40-47 and 24-31/56-63) must lower to VPUNPCKHBW.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2558 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2559 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
2561 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2562 ; CHECK-NEXT: ret{{[l|q]}}
2563 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2564 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2565 %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
2566 %bc = bitcast <32 x i8> %res to <4 x i64>
; _mm256_unpackhi_epi16: interleave the high 4 words of each 128-bit lane
; (indices 4-7/20-23 and 12-15/28-31); must lower to VPUNPCKHWD.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2570 define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2571 ; CHECK-LABEL: test_mm256_unpackhi_epi16:
2573 ; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2574 ; CHECK-NEXT: ret{{[l|q]}}
2575 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2576 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2577 %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2578 %bc = bitcast <16 x i16> %res to <4 x i64>
; _mm256_unpackhi_epi32: per the CHECK line, the integer unpack is expected
; to lower to the FP-domain VUNPCKHPS rather than VPUNPCKHDQ.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2582 define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2583 ; CHECK-LABEL: test_mm256_unpackhi_epi32:
2585 ; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2586 ; CHECK-NEXT: ret{{[l|q]}}
2587 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2588 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2589 %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
2590 %bc = bitcast <8 x i32> %res to <4 x i64>
; _mm256_unpackhi_epi64: no bitcasts needed (element type is already i64);
; per the CHECK line it lowers to FP-domain VUNPCKHPD.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2594 define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2595 ; CHECK-LABEL: test_mm256_unpackhi_epi64:
2597 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2598 ; CHECK-NEXT: ret{{[l|q]}}
2599 %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; _mm256_unpacklo_epi8: interleave the low 8 bytes of each 128-bit lane
; (indices 0-7/32-39 and 16-23/48-55); must lower to VPUNPCKLBW.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2603 define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2604 ; CHECK-LABEL: test_mm256_unpacklo_epi8:
2606 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2607 ; CHECK-NEXT: ret{{[l|q]}}
2608 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2609 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2610 %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
2611 %bc = bitcast <32 x i8> %res to <4 x i64>
; _mm256_unpacklo_epi16: interleave the low 4 words of each 128-bit lane
; (indices 0-3/16-19 and 8-11/24-27); must lower to VPUNPCKLWD.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2615 define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2616 ; CHECK-LABEL: test_mm256_unpacklo_epi16:
2618 ; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
2619 ; CHECK-NEXT: ret{{[l|q]}}
2620 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2621 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2622 %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
2623 %bc = bitcast <16 x i16> %res to <4 x i64>
; _mm256_unpacklo_epi32: per the CHECK line, the integer unpack is expected
; to lower to the FP-domain VUNPCKLPS rather than VPUNPCKLDQ.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2627 define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2628 ; CHECK-LABEL: test_mm256_unpacklo_epi32:
2630 ; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2631 ; CHECK-NEXT: ret{{[l|q]}}
2632 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2633 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2634 %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
2635 %bc = bitcast <8 x i32> %res to <4 x i64>
; _mm256_unpacklo_epi64: no bitcasts needed (element type is already i64);
; per the CHECK line it lowers to FP-domain VUNPCKLPD.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2639 define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2640 ; CHECK-LABEL: test_mm256_unpacklo_epi64:
2642 ; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2643 ; CHECK-NEXT: ret{{[l|q]}}
2644 %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; _mm256_xor_si256: per the CHECK line, the i64-vector xor is expected to
; lower to the FP-domain VXORPS.
; NOTE(review): closing `ret`/`}` lines are missing from this excerpt.
2648 define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2649 ; CHECK-LABEL: test_mm256_xor_si256:
2651 ; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
2652 ; CHECK-NEXT: ret{{[l|q]}}
2653 %res = xor <4 x i64> %a0, %a1
; Declarations for the AVX compare intrinsics; presumably referenced by
; tests beyond this excerpt — verify against the rest of the file.
2657 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
2659 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone