; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
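; Each __m256i-style argument below is passed as <4 x i64> and bitcast to the
; element type the builtin under test operates on, mirroring clang's lowering
; of the _mm256_* intrinsics.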

define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false)
  %res = bitcast <32 x i8> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
  %res = bitcast <16 x i16> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false)
  %res = bitcast <8 x i32> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(ptr %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, ptr %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
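
; The unmasked gather tests below synthesize an all-ones mask (an integer
; vpcmpeqd, or a floating-point self-compare of zero for the pd/ps variants)
; and pass an undef passthru, with a fixed scale of 2.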

define <2 x i64> @test_mm_i32gather_epi32(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, ptr %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr, <4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, ptr %a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, ptr %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <4 x i64> @test_mm256_i32gather_epi32(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, ptr %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr, <8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, ptr %a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, ptr %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}

define <2 x i64> @test_mm_i32gather_epi64(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, ptr %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, ptr, <4 x i32>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, ptr %a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_i32gather_epi64(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, ptr %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr, <4 x i32>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, ptr %a1, <2 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %res
}

define <2 x double> @test_mm_i32gather_pd(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, ptr %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, ptr, <4 x i32>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, ptr %a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, ptr %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_mm256_i32gather_pd(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, ptr %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, ptr, <4 x i32>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, ptr %a1, <2 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, ptr %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
  ret <4 x double> %res
}

define <4 x float> @test_mm_i32gather_ps(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, ptr %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, ptr, <4 x i32>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, ptr %a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, ptr %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <8 x float> @test_mm256_i32gather_ps(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
  ret <8 x float> %call
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr, <8 x i32>, <8 x float>, i8) nounwind readonly

define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, ptr %a1, <4 x i64> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, ptr %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
  ret <8 x float> %call
}

define <2 x i64> @test_mm_i64gather_epi32(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, ptr %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, ptr, <2 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, ptr %a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast ptr %a1 to ptr
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, ptr %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm256_i64gather_epi32(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, ptr %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr, <4 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, ptr %a1, <4 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast ptr %a1 to ptr
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, ptr %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_i64gather_epi64(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, ptr %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %call
}
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, ptr, <2 x i64>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, ptr %a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, ptr %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %call
}

define <4 x i64> @test_mm256_i64gather_epi64(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, ptr %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %call
}
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr, <4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, ptr %a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, ptr %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %call
}

define <2 x double> @test_mm_i64gather_pd(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, ptr %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
  ret <2 x double> %call
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, ptr, <2 x i64>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, ptr %a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, ptr %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
  ret <2 x double> %call
}

define <4 x double> @test_mm256_i64gather_pd(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, ptr %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, ptr, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, ptr %a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, ptr %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, ptr %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, ptr, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, ptr %a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, ptr %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, ptr %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, ptr, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, ptr %a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, ptr %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
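
; Note: for the maskload/maskstore tests that follow, the mask operand is
; passed through unchanged; only the sign bit of each mask element is
; significant to the vpmaskmov instructions.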

define <2 x i64> @test_mm_maskload_epi32(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}

declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}

declare void @llvm.x86.avx2.maskstore.d.256(ptr, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
  call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}

declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}

declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind readnone
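
; The max/min tests are expressed with the generic llvm.smax/umax/smin/umin
; vector intrinsics rather than x86-specific ones; instruction selection is
; still expected to pick the corresponding vpmax*/vpmin* forms.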

define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)

define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovmskb %ymm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}

declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
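
; The mul_epi32/mul_epu32 tests below avoid target intrinsics: shifting left
; then arithmetic-shifting right by 32 sign-extends the even 32-bit elements
; (vpmuldq), and masking with 4294967295 zero-extends them (vpmuludq). The
; llvm.x86.avx2.pmul.dq/pmulu.dq declarations that follow appear to be left
; over from an earlier intrinsic-based form of these tests and are unused.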

define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
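
; The pack tests narrow both sources with signed (vpacksswb/vpackssdw) or
; unsigned (vpackuswb/vpackusdw) saturation, interleaving the inputs within
; each 128-bit lane.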

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
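
; Immediate-controlled permutes are written as constant shufflevector masks;
; the variable 8x32 permutes keep the llvm.x86.avx2.permd/permps intrinsics
; because their indices are not compile-time constants.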

define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}

declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
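
; vpsadbw sums absolute byte differences, accumulating each group of eight
; bytes into a 64-bit element, so the result is used directly as <4 x i64>.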

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
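
; The shuffle tests use constant shufflevector masks for the immediate forms
; (epi32/hi/lo) and llvm.x86.avx2.pshuf.b for the variable byte shuffle.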

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}
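
; vpsign* copies each element of the first operand, negating it where the
; corresponding element of the second operand is negative and zeroing it
; where that element is zero.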

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}

declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
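
; The shift tests cover all three AVX2 forms: sll/sra/srl take the count
; from the low 64 bits of an XMM register, slli/srai/srli take an
; immediate, and sllv/srav/srlv shift each element by its own count.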

define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
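
; The 256-bit byte shift is modelled as a shufflevector against
; zeroinitializer; the mask shifts each 128-bit lane independently, matching
; vpslldq lane behavior (test_mm256_srli_si256 below mirrors this for
; vpsrldq).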

define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}

declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}

declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}

declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}

declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}

declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
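
; The stream-load test keeps the llvm.x86.avx2.movntdqa intrinsic so the
; non-temporal hint survives instruction selection.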

define <4 x i64> @test_mm256_stream_load_si256(ptr %a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovntdqa (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64: # %bb.0:
; X64-NEXT: vmovntdqa (%rdi), %ymm0
; X64-NEXT: retq
  %arg0 = bitcast ptr %a0 to ptr
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(ptr %arg0)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx2.movntdqa(ptr) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
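
; The unpack tests interleave high or low elements within each 128-bit lane,
; which is why the shuffle masks are split per lane rather than spanning the
; whole 256-bit vector.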

define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone