1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
7 define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
8 ; CHECK-LABEL: test_mm256_abs_epi8:
10 ; CHECK-NEXT: vpabsb %ymm0, %ymm0
11 ; CHECK-NEXT: ret{{[l|q]}}
12 %arg = bitcast <4 x i64> %a0 to <32 x i8>
13 %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false)
14 %res = bitcast <32 x i8> %abs to <4 x i64>
17 declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone
19 define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
20 ; CHECK-LABEL: test_mm256_abs_epi16:
22 ; CHECK-NEXT: vpabsw %ymm0, %ymm0
23 ; CHECK-NEXT: ret{{[l|q]}}
24 %arg = bitcast <4 x i64> %a0 to <16 x i16>
25 %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
26 %res = bitcast <16 x i16> %abs to <4 x i64>
29 declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone
31 define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
32 ; CHECK-LABEL: test_mm256_abs_epi32:
34 ; CHECK-NEXT: vpabsd %ymm0, %ymm0
35 ; CHECK-NEXT: ret{{[l|q]}}
36 %arg = bitcast <4 x i64> %a0 to <8 x i32>
37 %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false)
38 %res = bitcast <8 x i32> %abs to <4 x i64>
41 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone
43 define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
44 ; CHECK-LABEL: test_mm256_add_epi8:
46 ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
47 ; CHECK-NEXT: ret{{[l|q]}}
48 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
49 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
50 %res = add <32 x i8> %arg0, %arg1
51 %bc = bitcast <32 x i8> %res to <4 x i64>
55 define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
56 ; CHECK-LABEL: test_mm256_add_epi16:
58 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
59 ; CHECK-NEXT: ret{{[l|q]}}
60 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
61 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
62 %res = add <16 x i16> %arg0, %arg1
63 %bc = bitcast <16 x i16> %res to <4 x i64>
67 define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
68 ; CHECK-LABEL: test_mm256_add_epi32:
70 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
71 ; CHECK-NEXT: ret{{[l|q]}}
72 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
73 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
74 %res = add <8 x i32> %arg0, %arg1
75 %bc = bitcast <8 x i32> %res to <4 x i64>
79 define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
80 ; CHECK-LABEL: test_mm256_add_epi64:
82 ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
83 ; CHECK-NEXT: ret{{[l|q]}}
84 %res = add <4 x i64> %a0, %a1
88 define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
89 ; CHECK-LABEL: test_mm256_adds_epi8:
91 ; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
92 ; CHECK-NEXT: ret{{[l|q]}}
93 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
94 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
95 %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
96 %bc = bitcast <32 x i8> %res to <4 x i64>
99 declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
101 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
102 ; CHECK-LABEL: test_mm256_adds_epi16:
104 ; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
105 ; CHECK-NEXT: ret{{[l|q]}}
106 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
107 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
108 %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
109 %bc = bitcast <16 x i16> %res to <4 x i64>
112 declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
114 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
115 ; CHECK-LABEL: test_mm256_adds_epu8:
117 ; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
118 ; CHECK-NEXT: ret{{[l|q]}}
119 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
120 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
121 %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
122 %bc = bitcast <32 x i8> %res to <4 x i64>
125 declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
127 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
128 ; CHECK-LABEL: test_mm256_adds_epu16:
130 ; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
131 ; CHECK-NEXT: ret{{[l|q]}}
132 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
133 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
134 %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
135 %bc = bitcast <16 x i16> %res to <4 x i64>
138 declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
140 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
141 ; CHECK-LABEL: test_mm256_alignr_epi8:
143 ; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
144 ; CHECK-NEXT: ret{{[l|q]}}
145 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
146 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
147 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
148 %res = bitcast <32 x i8> %shuf to <4 x i64>
152 define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
153 ; CHECK-LABEL: test2_mm256_alignr_epi8:
155 ; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
156 ; CHECK-NEXT: ret{{[l|q]}}
157 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
158 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
159 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
160 %res = bitcast <32 x i8> %shuf to <4 x i64>
164 define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
165 ; CHECK-LABEL: test_mm256_and_si256:
167 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
168 ; CHECK-NEXT: ret{{[l|q]}}
169 %res = and <4 x i64> %a0, %a1
173 define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
174 ; CHECK-LABEL: test_mm256_andnot_si256:
176 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
177 ; CHECK-NEXT: vpxor %ymm2, %ymm0, %ymm0
178 ; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
179 ; CHECK-NEXT: ret{{[l|q]}}
180 %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
181 %res = and <4 x i64> %not, %a1
185 define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
186 ; CHECK-LABEL: test_mm256_avg_epu8:
188 ; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0
189 ; CHECK-NEXT: ret{{[l|q]}}
190 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
191 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
192 %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
193 %bc = bitcast <32 x i8> %res to <4 x i64>
196 declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
198 define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
199 ; CHECK-LABEL: test_mm256_avg_epu16:
201 ; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0
202 ; CHECK-NEXT: ret{{[l|q]}}
203 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
204 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
205 %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
206 %bc = bitcast <16 x i16> %res to <4 x i64>
209 declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
211 define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
212 ; CHECK-LABEL: test_mm256_blend_epi16:
214 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
215 ; CHECK-NEXT: ret{{[l|q]}}
216 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
217 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
218 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
219 %res = bitcast <16 x i16> %shuf to <4 x i64>
223 define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
224 ; CHECK-LABEL: test_mm_blend_epi32:
226 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
227 ; CHECK-NEXT: ret{{[l|q]}}
228 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
229 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
230 %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
231 %res = bitcast <4 x i32> %shuf to <2 x i64>
235 define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
236 ; CHECK-LABEL: test_mm256_blend_epi32:
238 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
239 ; CHECK-NEXT: ret{{[l|q]}}
240 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
241 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
242 %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
243 %res = bitcast <8 x i32> %shuf to <4 x i64>
247 define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
248 ; CHECK-LABEL: test_mm256_blendv_epi8:
250 ; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
251 ; CHECK-NEXT: ret{{[l|q]}}
252 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
253 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
254 %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
255 %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
256 %res = bitcast <32 x i8> %call to <4 x i64>
259 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
261 define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
262 ; CHECK-LABEL: test_mm_broadcastb_epi8:
264 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
265 ; CHECK-NEXT: ret{{[l|q]}}
266 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
267 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
268 %res = bitcast <16 x i8> %shuf to <2 x i64>
272 define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
273 ; CHECK-LABEL: test_mm256_broadcastb_epi8:
275 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
276 ; CHECK-NEXT: ret{{[l|q]}}
277 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
278 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
279 %res = bitcast <32 x i8> %shuf to <4 x i64>
283 define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
284 ; CHECK-LABEL: test_mm_broadcastd_epi32:
286 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
287 ; CHECK-NEXT: ret{{[l|q]}}
288 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
289 %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
290 %res = bitcast <4 x i32> %shuf to <2 x i64>
294 define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
295 ; CHECK-LABEL: test_mm256_broadcastd_epi32:
297 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
298 ; CHECK-NEXT: ret{{[l|q]}}
299 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
300 %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
301 %res = bitcast <8 x i32> %shuf to <4 x i64>
305 define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
306 ; CHECK-LABEL: test_mm_broadcastq_epi64:
308 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
309 ; CHECK-NEXT: ret{{[l|q]}}
310 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
314 define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
315 ; CHECK-LABEL: test_mm256_broadcastq_epi64:
317 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
318 ; CHECK-NEXT: ret{{[l|q]}}
319 %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
323 define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
324 ; CHECK-LABEL: test_mm_broadcastsd_pd:
326 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
327 ; CHECK-NEXT: ret{{[l|q]}}
328 %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
329 ret <2 x double> %res
332 define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
333 ; CHECK-LABEL: test_mm256_broadcastsd_pd:
335 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
336 ; CHECK-NEXT: ret{{[l|q]}}
337 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
338 ret <4 x double> %res
341 define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
342 ; CHECK-LABEL: test_mm256_broadcastsi128_si256:
344 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
345 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
346 ; CHECK-NEXT: ret{{[l|q]}}
347 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
351 define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
352 ; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
354 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
355 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
358 ; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
360 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
362 %a0 = load <2 x i64>, <2 x i64>* %p0
363 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
367 define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
368 ; CHECK-LABEL: test_mm_broadcastss_ps:
370 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
371 ; CHECK-NEXT: ret{{[l|q]}}
372 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
376 define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
377 ; CHECK-LABEL: test_mm256_broadcastss_ps:
379 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
380 ; CHECK-NEXT: ret{{[l|q]}}
381 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
385 define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
386 ; CHECK-LABEL: test_mm_broadcastw_epi16:
388 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
389 ; CHECK-NEXT: ret{{[l|q]}}
390 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
391 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
392 %res = bitcast <8 x i16> %shuf to <2 x i64>
396 define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
397 ; CHECK-LABEL: test_mm256_broadcastw_epi16:
399 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
400 ; CHECK-NEXT: ret{{[l|q]}}
401 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
402 %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
403 %res = bitcast <16 x i16> %shuf to <4 x i64>
407 define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
408 ; CHECK-LABEL: test_mm256_bslli_epi128:
410 ; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
411 ; CHECK-NEXT: ret{{[l|q]}}
412 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
413 %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
414 %res = bitcast <32 x i8> %shuf to <4 x i64>
418 define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
419 ; CHECK-LABEL: test_mm256_bsrli_epi128:
421 ; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
422 ; CHECK-NEXT: ret{{[l|q]}}
423 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
424 %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
425 %res = bitcast <32 x i8> %shuf to <4 x i64>
429 define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
430 ; CHECK-LABEL: test_mm256_cmpeq_epi8:
432 ; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
433 ; CHECK-NEXT: ret{{[l|q]}}
434 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
435 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
436 %cmp = icmp eq <32 x i8> %arg0, %arg1
437 %res = sext <32 x i1> %cmp to <32 x i8>
438 %bc = bitcast <32 x i8> %res to <4 x i64>
442 define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
443 ; CHECK-LABEL: test_mm256_cmpeq_epi16:
445 ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
446 ; CHECK-NEXT: ret{{[l|q]}}
447 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
448 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
449 %cmp = icmp eq <16 x i16> %arg0, %arg1
450 %res = sext <16 x i1> %cmp to <16 x i16>
451 %bc = bitcast <16 x i16> %res to <4 x i64>
455 define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
456 ; CHECK-LABEL: test_mm256_cmpeq_epi32:
458 ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
459 ; CHECK-NEXT: ret{{[l|q]}}
460 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
461 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
462 %cmp = icmp eq <8 x i32> %arg0, %arg1
463 %res = sext <8 x i1> %cmp to <8 x i32>
464 %bc = bitcast <8 x i32> %res to <4 x i64>
468 define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
469 ; CHECK-LABEL: test_mm256_cmpeq_epi64:
471 ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
472 ; CHECK-NEXT: ret{{[l|q]}}
473 %cmp = icmp eq <4 x i64> %a0, %a1
474 %res = sext <4 x i1> %cmp to <4 x i64>
478 define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
479 ; CHECK-LABEL: test_mm256_cmpgt_epi8:
481 ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
482 ; CHECK-NEXT: ret{{[l|q]}}
483 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
484 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
485 %cmp = icmp sgt <32 x i8> %arg0, %arg1
486 %res = sext <32 x i1> %cmp to <32 x i8>
487 %bc = bitcast <32 x i8> %res to <4 x i64>
491 define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
492 ; CHECK-LABEL: test_mm256_cmpgt_epi16:
494 ; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
495 ; CHECK-NEXT: ret{{[l|q]}}
496 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
497 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
498 %cmp = icmp sgt <16 x i16> %arg0, %arg1
499 %res = sext <16 x i1> %cmp to <16 x i16>
500 %bc = bitcast <16 x i16> %res to <4 x i64>
504 define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
505 ; CHECK-LABEL: test_mm256_cmpgt_epi32:
507 ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
508 ; CHECK-NEXT: ret{{[l|q]}}
509 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
510 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
511 %cmp = icmp sgt <8 x i32> %arg0, %arg1
512 %res = sext <8 x i1> %cmp to <8 x i32>
513 %bc = bitcast <8 x i32> %res to <4 x i64>
517 define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
518 ; CHECK-LABEL: test_mm256_cmpgt_epi64:
520 ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
521 ; CHECK-NEXT: ret{{[l|q]}}
522 %cmp = icmp sgt <4 x i64> %a0, %a1
523 %res = sext <4 x i1> %cmp to <4 x i64>
527 define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
528 ; CHECK-LABEL: test_mm256_cvtepi8_epi16:
530 ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
531 ; CHECK-NEXT: ret{{[l|q]}}
532 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
533 %ext = sext <16 x i8> %arg0 to <16 x i16>
534 %res = bitcast <16 x i16> %ext to <4 x i64>
538 define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
539 ; CHECK-LABEL: test_mm256_cvtepi8_epi32:
541 ; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
542 ; CHECK-NEXT: ret{{[l|q]}}
543 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
544 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
545 %ext = sext <8 x i8> %shuf to <8 x i32>
546 %res = bitcast <8 x i32> %ext to <4 x i64>
550 define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
551 ; CHECK-LABEL: test_mm256_cvtepi8_epi64:
553 ; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
554 ; CHECK-NEXT: ret{{[l|q]}}
555 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
556 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
557 %ext = sext <4 x i8> %shuf to <4 x i64>
561 define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
562 ; CHECK-LABEL: test_mm256_cvtepi16_epi32:
564 ; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
565 ; CHECK-NEXT: ret{{[l|q]}}
566 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
567 %ext = sext <8 x i16> %arg0 to <8 x i32>
568 %res = bitcast <8 x i32> %ext to <4 x i64>
572 define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
573 ; CHECK-LABEL: test_mm256_cvtepi16_epi64:
575 ; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
576 ; CHECK-NEXT: ret{{[l|q]}}
577 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
578 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
579 %ext = sext <4 x i16> %shuf to <4 x i64>
583 define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
584 ; CHECK-LABEL: test_mm256_cvtepi32_epi64:
586 ; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
587 ; CHECK-NEXT: ret{{[l|q]}}
588 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
589 %ext = sext <4 x i32> %arg0 to <4 x i64>
593 define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
594 ; CHECK-LABEL: test_mm256_cvtepu8_epi16:
596 ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
597 ; CHECK-NEXT: ret{{[l|q]}}
598 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
599 %ext = zext <16 x i8> %arg0 to <16 x i16>
600 %res = bitcast <16 x i16> %ext to <4 x i64>
604 define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
605 ; CHECK-LABEL: test_mm256_cvtepu8_epi32:
607 ; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
608 ; CHECK-NEXT: ret{{[l|q]}}
609 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
610 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
611 %ext = zext <8 x i8> %shuf to <8 x i32>
612 %res = bitcast <8 x i32> %ext to <4 x i64>
616 define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
617 ; CHECK-LABEL: test_mm256_cvtepu8_epi64:
619 ; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
620 ; CHECK-NEXT: ret{{[l|q]}}
621 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
622 %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
623 %ext = zext <4 x i8> %shuf to <4 x i64>
627 define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
628 ; CHECK-LABEL: test_mm256_cvtepu16_epi32:
630 ; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
631 ; CHECK-NEXT: ret{{[l|q]}}
632 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
633 %ext = zext <8 x i16> %arg0 to <8 x i32>
634 %res = bitcast <8 x i32> %ext to <4 x i64>
638 define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
639 ; CHECK-LABEL: test_mm256_cvtepu16_epi64:
641 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
642 ; CHECK-NEXT: ret{{[l|q]}}
643 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
644 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
645 %ext = zext <4 x i16> %shuf to <4 x i64>
649 define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
650 ; CHECK-LABEL: test_mm256_cvtepu32_epi64:
652 ; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
653 ; CHECK-NEXT: ret{{[l|q]}}
654 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
655 %ext = zext <4 x i32> %arg0 to <4 x i64>
659 define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
660 ; CHECK-LABEL: test_mm256_extracti128_si256:
662 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
663 ; CHECK-NEXT: vzeroupper
664 ; CHECK-NEXT: ret{{[l|q]}}
665 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
669 define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
670 ; CHECK-LABEL: test_mm256_hadd_epi16:
672 ; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
673 ; CHECK-NEXT: ret{{[l|q]}}
674 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
675 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
676 %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
677 %bc = bitcast <16 x i16> %res to <4 x i64>
680 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
682 define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
683 ; CHECK-LABEL: test_mm256_hadd_epi32:
685 ; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
686 ; CHECK-NEXT: ret{{[l|q]}}
687 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
688 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
689 %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
690 %bc = bitcast <8 x i32> %res to <4 x i64>
693 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
695 define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
696 ; CHECK-LABEL: test_mm256_hadds_epi16:
698 ; CHECK-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
699 ; CHECK-NEXT: ret{{[l|q]}}
700 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
701 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
702 %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
703 %bc = bitcast <16 x i16> %res to <4 x i64>
706 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
708 define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
709 ; CHECK-LABEL: test_mm256_hsub_epi16:
711 ; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0
712 ; CHECK-NEXT: ret{{[l|q]}}
713 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
714 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
715 %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
716 %bc = bitcast <16 x i16> %res to <4 x i64>
719 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
721 define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
722 ; CHECK-LABEL: test_mm256_hsub_epi32:
724 ; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0
725 ; CHECK-NEXT: ret{{[l|q]}}
726 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
727 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
728 %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
729 %bc = bitcast <8 x i32> %res to <4 x i64>
732 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
734 define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
735 ; CHECK-LABEL: test_mm256_hsubs_epi16:
737 ; CHECK-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
738 ; CHECK-NEXT: ret{{[l|q]}}
739 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
740 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
741 %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
742 %bc = bitcast <16 x i16> %res to <4 x i64>
745 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
747 define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
748 ; X86-LABEL: test_mm_i32gather_epi32:
750 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
751 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
752 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
753 ; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
754 ; X86-NEXT: vmovdqa %xmm1, %xmm0
757 ; X64-LABEL: test_mm_i32gather_epi32:
759 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
760 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
761 ; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
762 ; X64-NEXT: vmovdqa %xmm1, %xmm0
764 %arg0 = bitcast i32 *%a0 to i8*
765 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
766 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
767 %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
768 %bc = bitcast <4 x i32> %call to <2 x i64>
771 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
773 define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
774 ; X86-LABEL: test_mm_mask_i32gather_epi32:
776 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
777 ; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
780 ; X64-LABEL: test_mm_mask_i32gather_epi32:
782 ; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
784 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
785 %arg1 = bitcast i32 *%a1 to i8*
786 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
787 %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
788 %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
789 %bc = bitcast <4 x i32> %call to <2 x i64>
793 define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
794 ; X86-LABEL: test_mm256_i32gather_epi32:
796 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
797 ; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
798 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
799 ; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
800 ; X86-NEXT: vmovdqa %ymm1, %ymm0
803 ; X64-LABEL: test_mm256_i32gather_epi32:
805 ; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
806 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
807 ; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
808 ; X64-NEXT: vmovdqa %ymm1, %ymm0
810 %arg0 = bitcast i32 *%a0 to i8*
811 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
812 %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
813 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
814 %bc = bitcast <8 x i32> %call to <4 x i64>
817 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
819 define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
820 ; X86-LABEL: test_mm256_mask_i32gather_epi32:
822 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
823 ; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
826 ; X64-LABEL: test_mm256_mask_i32gather_epi32:
828 ; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
830 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
831 %arg1 = bitcast i32 *%a1 to i8*
832 %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
833 %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
834 %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
835 %bc = bitcast <8 x i32> %call to <4 x i64>
839 define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
840 ; X86-LABEL: test_mm_i32gather_epi64:
842 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
843 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
844 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
845 ; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
846 ; X86-NEXT: vmovdqa %xmm1, %xmm0
849 ; X64-LABEL: test_mm_i32gather_epi64:
851 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
852 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
853 ; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
854 ; X64-NEXT: vmovdqa %xmm1, %xmm0
856 %arg0 = bitcast i64 *%a0 to i8*
857 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
858 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
; AVX2 masked gather, i32 indices -> i64 elements, 128-bit (lowered to vpgatherdq above): (passthru, base i8*, <4 x i32> indices, <2 x i64> mask, i8 scale).
861 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
863 define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
864 ; X86-LABEL: test_mm_mask_i32gather_epi64:
866 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
867 ; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
870 ; X64-LABEL: test_mm_mask_i32gather_epi64:
872 ; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
874 %arg1 = bitcast i64 *%a1 to i8*
875 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
876 %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
880 define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
881 ; X86-LABEL: test_mm256_i32gather_epi64:
883 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
884 ; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
885 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
886 ; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
887 ; X86-NEXT: vmovdqa %ymm1, %ymm0
890 ; X64-LABEL: test_mm256_i32gather_epi64:
892 ; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
893 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
894 ; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
895 ; X64-NEXT: vmovdqa %ymm1, %ymm0
897 %arg0 = bitcast i64 *%a0 to i8*
898 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
899 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
; AVX2 masked gather, i32 indices -> i64 elements, 256-bit (lowered to vpgatherdq with ymm result above).
902 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
904 define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
905 ; X86-LABEL: test_mm256_mask_i32gather_epi64:
907 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
908 ; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
911 ; X64-LABEL: test_mm256_mask_i32gather_epi64:
913 ; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
915 %arg1 = bitcast i64 *%a1 to i8*
916 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
917 %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
921 define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
922 ; X86-LABEL: test_mm_i32gather_pd:
924 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
925 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
926 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
927 ; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
928 ; X86-NEXT: vmovapd %xmm1, %xmm0
931 ; X64-LABEL: test_mm_i32gather_pd:
933 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
934 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
935 ; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
936 ; X64-NEXT: vmovapd %xmm1, %xmm0
938 %arg0 = bitcast double *%a0 to i8*
939 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
940 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
941 %sext = sext <2 x i1> %cmp to <2 x i64>
942 %mask = bitcast <2 x i64> %sext to <2 x double>
943 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
944 ret <2 x double> %res
; AVX2 masked gather, i32 indices -> f64 elements, 128-bit (lowered to vgatherdpd above); mask is a <2 x double> bit pattern.
946 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
948 define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
949 ; X86-LABEL: test_mm_mask_i32gather_pd:
951 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
952 ; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
955 ; X64-LABEL: test_mm_mask_i32gather_pd:
957 ; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
959 %arg1 = bitcast double *%a1 to i8*
960 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
961 %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
962 ret <2 x double> %res
965 define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
966 ; X86-LABEL: test_mm256_i32gather_pd:
968 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
969 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
970 ; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
971 ; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
972 ; X86-NEXT: vmovapd %ymm1, %ymm0
975 ; X64-LABEL: test_mm256_i32gather_pd:
977 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
978 ; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
979 ; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
980 ; X64-NEXT: vmovapd %ymm1, %ymm0
982 %arg0 = bitcast double *%a0 to i8*
983 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
984 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
985 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
986 ret <4 x double> %res
; AVX2 masked gather, i32 indices -> f64 elements, 256-bit (lowered to vgatherdpd with ymm result above).
988 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
990 define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
991 ; X86-LABEL: test_mm256_mask_i32gather_pd:
993 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
994 ; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
997 ; X64-LABEL: test_mm256_mask_i32gather_pd:
999 ; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1001 %arg1 = bitcast double *%a1 to i8*
1002 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1003 %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1004 ret <4 x double> %res
1007 define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1008 ; X86-LABEL: test_mm_i32gather_ps:
1010 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1011 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1012 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1013 ; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1014 ; X86-NEXT: vmovaps %xmm1, %xmm0
1017 ; X64-LABEL: test_mm_i32gather_ps:
1019 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1020 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1021 ; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1022 ; X64-NEXT: vmovaps %xmm1, %xmm0
1024 %arg0 = bitcast float *%a0 to i8*
1025 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1026 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1027 %sext = sext <4 x i1> %cmp to <4 x i32>
1028 %mask = bitcast <4 x i32> %sext to <4 x float>
1029 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1030 ret <4 x float> %call
; AVX2 masked gather, i32 indices -> f32 elements, 128-bit (lowered to vgatherdps above); mask is a <4 x float> bit pattern.
1032 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1034 define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1035 ; X86-LABEL: test_mm_mask_i32gather_ps:
1037 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1038 ; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1041 ; X64-LABEL: test_mm_mask_i32gather_ps:
1043 ; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1045 %arg1 = bitcast float *%a1 to i8*
1046 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1047 %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1048 ret <4 x float> %call
1051 define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1052 ; X86-LABEL: test_mm256_i32gather_ps:
1054 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1055 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1056 ; X86-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
1057 ; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1058 ; X86-NEXT: vmovaps %ymm1, %ymm0
1061 ; X64-LABEL: test_mm256_i32gather_ps:
1063 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1064 ; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
1065 ; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1066 ; X64-NEXT: vmovaps %ymm1, %ymm0
1068 %arg0 = bitcast float *%a0 to i8*
1069 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1070 %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1071 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1072 ret <8 x float> %call
; AVX2 masked gather, i32 indices -> f32 elements, 256-bit (lowered to vgatherdps with ymm result above).
1074 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1076 define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1077 ; X86-LABEL: test_mm256_mask_i32gather_ps:
1079 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1080 ; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1083 ; X64-LABEL: test_mm256_mask_i32gather_ps:
1085 ; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1087 %arg1 = bitcast float *%a1 to i8*
1088 %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1089 %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1090 ret <8 x float> %call
1093 define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1094 ; X86-LABEL: test_mm_i64gather_epi32:
1096 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1097 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1098 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1099 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1100 ; X86-NEXT: vmovdqa %xmm1, %xmm0
1103 ; X64-LABEL: test_mm_i64gather_epi32:
1105 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1106 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1107 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1108 ; X64-NEXT: vmovdqa %xmm1, %xmm0
1110 %arg0 = bitcast i32 *%a0 to i8*
1111 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1112 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1113 %bc = bitcast <4 x i32> %call to <2 x i64>
; AVX2 masked gather, i64 indices -> i32 elements, 128-bit (lowered to vpgatherqd above).
1116 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1118 define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1119 ; X86-LABEL: test_mm_mask_i64gather_epi32:
1121 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1122 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1125 ; X64-LABEL: test_mm_mask_i64gather_epi32:
1127 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1129 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1130 %arg1 = bitcast i32 *%a1 to i8*
1131 %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1132 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1133 %bc = bitcast <4 x i32> %call to <2 x i64>
1137 define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1138 ; X86-LABEL: test_mm256_i64gather_epi32:
1140 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1141 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1142 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1143 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1144 ; X86-NEXT: vmovdqa %xmm1, %xmm0
1145 ; X86-NEXT: vzeroupper
1148 ; X64-LABEL: test_mm256_i64gather_epi32:
1150 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1151 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1152 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1153 ; X64-NEXT: vmovdqa %xmm1, %xmm0
1154 ; X64-NEXT: vzeroupper
1156 %arg0 = bitcast i32 *%a0 to i8*
1157 %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1158 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1159 %bc = bitcast <4 x i32> %call to <2 x i64>
; AVX2 masked gather, 4 x i64 indices (ymm) -> 4 x i32 elements (xmm result); lowered to vpgatherqd + vzeroupper above.
1162 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1164 define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1165 ; X86-LABEL: test_mm256_mask_i64gather_epi32:
1167 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1168 ; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1169 ; X86-NEXT: vzeroupper
1172 ; X64-LABEL: test_mm256_mask_i64gather_epi32:
1174 ; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1175 ; X64-NEXT: vzeroupper
1177 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1178 %arg1 = bitcast i32 *%a1 to i8*
1179 %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1180 %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1181 %bc = bitcast <4 x i32> %call to <2 x i64>
1185 define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1186 ; X86-LABEL: test_mm_i64gather_epi64:
1188 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1189 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1190 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1191 ; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1192 ; X86-NEXT: vmovdqa %xmm1, %xmm0
1195 ; X64-LABEL: test_mm_i64gather_epi64:
1197 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1198 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1199 ; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1200 ; X64-NEXT: vmovdqa %xmm1, %xmm0
1202 %arg0 = bitcast i64 *%a0 to i8*
1203 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
; AVX2 masked gather, i64 indices -> i64 elements, 128-bit (lowered to vpgatherqq above).
1206 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1208 define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1209 ; X86-LABEL: test_mm_mask_i64gather_epi64:
1211 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1212 ; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1215 ; X64-LABEL: test_mm_mask_i64gather_epi64:
1217 ; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1219 %arg1 = bitcast i64 *%a1 to i8*
1220 %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1224 define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1225 ; X86-LABEL: test_mm256_i64gather_epi64:
1227 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1228 ; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1229 ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
1230 ; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1231 ; X86-NEXT: vmovdqa %ymm1, %ymm0
1234 ; X64-LABEL: test_mm256_i64gather_epi64:
1236 ; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
1237 ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
1238 ; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1239 ; X64-NEXT: vmovdqa %ymm1, %ymm0
1241 %arg0 = bitcast i64 *%a0 to i8*
1242 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
; AVX2 masked gather, i64 indices -> i64 elements, 256-bit (lowered to vpgatherqq with ymm result above).
1245 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1247 define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1248 ; X86-LABEL: test_mm256_mask_i64gather_epi64:
1250 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1251 ; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1254 ; X64-LABEL: test_mm256_mask_i64gather_epi64:
1256 ; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1258 %arg1 = bitcast i64 *%a1 to i8*
1259 %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1263 define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1264 ; X86-LABEL: test_mm_i64gather_pd:
1266 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1267 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1268 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1269 ; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1270 ; X86-NEXT: vmovapd %xmm1, %xmm0
1273 ; X64-LABEL: test_mm_i64gather_pd:
1275 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1276 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1277 ; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1278 ; X64-NEXT: vmovapd %xmm1, %xmm0
1280 %arg0 = bitcast double *%a0 to i8*
1281 %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1282 %sext = sext <2 x i1> %cmp to <2 x i64>
1283 %mask = bitcast <2 x i64> %sext to <2 x double>
1284 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1285 ret <2 x double> %call
; AVX2 masked gather, i64 indices -> f64 elements, 128-bit (lowered to vgatherqpd above).
1287 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1289 define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1290 ; X86-LABEL: test_mm_mask_i64gather_pd:
1292 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1293 ; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1296 ; X64-LABEL: test_mm_mask_i64gather_pd:
1298 ; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1300 %arg1 = bitcast double *%a1 to i8*
1301 %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1302 ret <2 x double> %call
1305 define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
1306 ; X86-LABEL: test_mm256_i64gather_pd:
1308 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1309 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1310 ; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
1311 ; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1312 ; X86-NEXT: vmovapd %ymm1, %ymm0
1315 ; X64-LABEL: test_mm256_i64gather_pd:
1317 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1318 ; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
1319 ; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1320 ; X64-NEXT: vmovapd %ymm1, %ymm0
1322 %arg0 = bitcast double *%a0 to i8*
1323 %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1324 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1325 ret <4 x double> %call
; AVX2 masked gather, i64 indices -> f64 elements, 256-bit (lowered to vgatherqpd with ymm result above).
1327 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
1329 define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
1330 ; X86-LABEL: test_mm256_mask_i64gather_pd:
1332 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1333 ; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1336 ; X64-LABEL: test_mm256_mask_i64gather_pd:
1338 ; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1340 %arg1 = bitcast i64 *%a1 to i8*
1341 %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
1342 ret <4 x double> %call
1345 define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
1346 ; X86-LABEL: test_mm_i64gather_ps:
1348 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1349 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1350 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1351 ; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1352 ; X86-NEXT: vmovaps %xmm1, %xmm0
1355 ; X64-LABEL: test_mm_i64gather_ps:
1357 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1358 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1359 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1360 ; X64-NEXT: vmovaps %xmm1, %xmm0
1362 %arg0 = bitcast float *%a0 to i8*
1363 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1364 %sext = sext <4 x i1> %cmp to <4 x i32>
1365 %mask = bitcast <4 x i32> %sext to <4 x float>
1366 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
1367 ret <4 x float> %call
; AVX2 masked gather, i64 indices -> f32 elements, 128-bit (lowered to vgatherqps above).
1369 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
1371 define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1372 ; X86-LABEL: test_mm_mask_i64gather_ps:
1374 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1375 ; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1378 ; X64-LABEL: test_mm_mask_i64gather_ps:
1380 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1382 %arg1 = bitcast float *%a1 to i8*
1383 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
1384 ret <4 x float> %call
1387 define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
1388 ; X86-LABEL: test_mm256_i64gather_ps:
1390 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1391 ; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1392 ; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
1393 ; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1394 ; X86-NEXT: vmovaps %xmm1, %xmm0
1395 ; X86-NEXT: vzeroupper
1398 ; X64-LABEL: test_mm256_i64gather_ps:
1400 ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1401 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
1402 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1403 ; X64-NEXT: vmovaps %xmm1, %xmm0
1404 ; X64-NEXT: vzeroupper
1406 %arg0 = bitcast float *%a0 to i8*
1407 %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1408 %sext = sext <4 x i1> %cmp to <4 x i32>
1409 %mask = bitcast <4 x i32> %sext to <4 x float>
1410 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
1411 ret <4 x float> %call
; AVX2 masked gather, 4 x i64 indices (ymm) -> 4 x f32 elements (xmm result); lowered to vgatherqps + vzeroupper above.
1413 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
1415 define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
1416 ; X86-LABEL: test_mm256_mask_i64gather_ps:
1418 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1419 ; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1420 ; X86-NEXT: vzeroupper
1423 ; X64-LABEL: test_mm256_mask_i64gather_ps:
1425 ; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1426 ; X64-NEXT: vzeroupper
1428 %arg1 = bitcast float *%a1 to i8*
1429 %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
1430 ret <4 x float> %call
1433 define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1434 ; CHECK-LABEL: test0_mm256_inserti128_si256:
1436 ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
1437 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1438 ; CHECK-NEXT: ret{{[l|q]}}
1439 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1440 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1444 define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1445 ; CHECK-LABEL: test1_mm256_inserti128_si256:
1447 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1448 ; CHECK-NEXT: ret{{[l|q]}}
1449 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1450 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1454 define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1455 ; CHECK-LABEL: test_mm256_madd_epi16:
1457 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
1458 ; CHECK-NEXT: ret{{[l|q]}}
1459 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1460 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1461 %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1462 %bc = bitcast <8 x i32> %res to <4 x i64>
; AVX2 multiply-add of adjacent i16 pairs into i32 (lowered to vpmaddwd above).
1465 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1467 define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1468 ; CHECK-LABEL: test_mm256_maddubs_epi16:
1470 ; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
1471 ; CHECK-NEXT: ret{{[l|q]}}
1472 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1473 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1474 %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1475 %bc = bitcast <16 x i16> %res to <4 x i64>
; AVX2 u8*s8 multiply-add with saturation into i16 (lowered to vpmaddubsw above).
1478 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1480 define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
1481 ; X86-LABEL: test_mm_maskload_epi32:
1483 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1484 ; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
1487 ; X64-LABEL: test_mm_maskload_epi32:
1489 ; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
1491 %arg0 = bitcast i32* %a0 to i8*
1492 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1493 %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
1494 %bc = bitcast <4 x i32> %call to <2 x i64>
; AVX2 masked load of 4 x i32 (lowered to vpmaskmovd above): (base i8*, <4 x i32> mask).
1497 declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
1499 define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
1500 ; X86-LABEL: test_mm256_maskload_epi32:
1502 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1503 ; X86-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
1506 ; X64-LABEL: test_mm256_maskload_epi32:
1508 ; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
1510 %arg0 = bitcast i32* %a0 to i8*
1511 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1512 %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
1513 %bc = bitcast <8 x i32> %call to <4 x i64>
; AVX2 masked load of 8 x i32, 256-bit (lowered to vpmaskmovd with ymm operands above).
1516 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
1518 define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
1519 ; X86-LABEL: test_mm_maskload_epi64:
1521 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1522 ; X86-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
1525 ; X64-LABEL: test_mm_maskload_epi64:
1527 ; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
1529 %arg0 = bitcast i64* %a0 to i8*
1530 %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
; AVX2 masked load of 2 x i64 (lowered to vpmaskmovq above).
1533 declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
1535 define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
1536 ; X86-LABEL: test_mm256_maskload_epi64:
1538 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1539 ; X86-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
1542 ; X64-LABEL: test_mm256_maskload_epi64:
1544 ; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
1546 %arg0 = bitcast i64* %a0 to i8*
1547 %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
; AVX2 masked load of 4 x i64, 256-bit (lowered to vpmaskmovq with ymm operands above).
1550 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
1552 define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1553 ; X86-LABEL: test_mm_maskstore_epi32:
1555 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1556 ; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
1559 ; X64-LABEL: test_mm_maskstore_epi32:
1561 ; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
1563 %arg0 = bitcast float* %a0 to i8*
1564 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1565 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1566 call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
; AVX2 masked store of 4 x i32 (lowered to vpmaskmovd store form above): (base i8*, <4 x i32> mask, <4 x i32> data).
1569 declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
1571 define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1572 ; X86-LABEL: test_mm256_maskstore_epi32:
1574 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1575 ; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
1576 ; X86-NEXT: vzeroupper
1579 ; X64-LABEL: test_mm256_maskstore_epi32:
1581 ; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
1582 ; X64-NEXT: vzeroupper
1584 %arg0 = bitcast float* %a0 to i8*
1585 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1586 %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1587 call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
; AVX2 masked store of 8 x i32, 256-bit (lowered to vpmaskmovd with ymm operands above).
1590 declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
1592 define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1593 ; X86-LABEL: test_mm_maskstore_epi64:
1595 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1596 ; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
1599 ; X64-LABEL: test_mm_maskstore_epi64:
1601 ; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
1603 %arg0 = bitcast i64* %a0 to i8*
1604 call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
; AVX2 masked store of 2 x i64 (lowered to vpmaskmovq store form above).
1607 declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
1609 define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1610 ; X86-LABEL: test_mm256_maskstore_epi64:
1612 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1613 ; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
1614 ; X86-NEXT: vzeroupper
1617 ; X64-LABEL: test_mm256_maskstore_epi64:
1619 ; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
1620 ; X64-NEXT: vzeroupper
1622 %arg0 = bitcast i64* %a0 to i8*
1623 call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
; AVX2 masked store of 4 x i64, 256-bit (lowered to vpmaskmovq with ymm operands above).
1626 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
1628 define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1629 ; CHECK-LABEL: test_mm256_max_epi8:
1631 ; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
1632 ; CHECK-NEXT: ret{{[l|q]}}
1633 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1634 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1635 %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1636 %bc = bitcast <32 x i8> %sel to <4 x i64>
; Generic signed-max intrinsic on <32 x i8>; selected to vpmaxsb above.
1639 declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
1641 define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1642 ; CHECK-LABEL: test_mm256_max_epi16:
1644 ; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
1645 ; CHECK-NEXT: ret{{[l|q]}}
1646 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1647 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1648 %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1649 %bc = bitcast <16 x i16> %sel to <4 x i64>
; Generic signed-max intrinsic on <16 x i16>; selected to vpmaxsw above.
1652 declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
1654 define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1655 ; CHECK-LABEL: test_mm256_max_epi32:
1657 ; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
1658 ; CHECK-NEXT: ret{{[l|q]}}
1659 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1660 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1661 %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1662 %bc = bitcast <8 x i32> %sel to <4 x i64>
; Generic signed-max intrinsic on <8 x i32>; selected to vpmaxsd above.
1665 declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
1667 define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1668 ; CHECK-LABEL: test_mm256_max_epu8:
1670 ; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
1671 ; CHECK-NEXT: ret{{[l|q]}}
1672 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1673 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1674 %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1675 %bc = bitcast <32 x i8> %sel to <4 x i64>
; Generic unsigned-max intrinsic on <32 x i8>; selected to vpmaxub above.
1678 declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
1680 define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1681 ; CHECK-LABEL: test_mm256_max_epu16:
1683 ; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
1684 ; CHECK-NEXT: ret{{[l|q]}}
1685 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1686 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1687 %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1688 %bc = bitcast <16 x i16> %sel to <4 x i64>
; Generic unsigned-max intrinsic on <16 x i16>; selected to vpmaxuw above.
1691 declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
1693 define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1694 ; CHECK-LABEL: test_mm256_max_epu32:
1696 ; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
1697 ; CHECK-NEXT: ret{{[l|q]}}
1698 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1699 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1700 %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1701 %bc = bitcast <8 x i32> %sel to <4 x i64>
; Generic unsigned-max intrinsic on <8 x i32>; selected to vpmaxud above.
1704 declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
1706 define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1707 ; CHECK-LABEL: test_mm256_min_epi8:
1709 ; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
1710 ; CHECK-NEXT: ret{{[l|q]}}
1711 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1712 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1713 %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1714 %bc = bitcast <32 x i8> %sel to <4 x i64>
; Generic signed-min intrinsic on <32 x i8>; selected to vpminsb above.
1717 declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
1719 define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1720 ; CHECK-LABEL: test_mm256_min_epi16:
1722 ; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
1723 ; CHECK-NEXT: ret{{[l|q]}}
1724 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1725 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1726 %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1727 %bc = bitcast <16 x i16> %sel to <4 x i64>
1730 declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
1732 define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1733 ; CHECK-LABEL: test_mm256_min_epi32:
1735 ; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
1736 ; CHECK-NEXT: ret{{[l|q]}}
1737 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1738 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1739 %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1740 %bc = bitcast <8 x i32> %sel to <4 x i64>
1743 declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
1745 define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1746 ; CHECK-LABEL: test_mm256_min_epu8:
1748 ; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
1749 ; CHECK-NEXT: ret{{[l|q]}}
1750 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1751 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1752 %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1753 %bc = bitcast <32 x i8> %sel to <4 x i64>
1756 declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
1758 define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1759 ; CHECK-LABEL: test_mm256_min_epu16:
1761 ; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
1762 ; CHECK-NEXT: ret{{[l|q]}}
1763 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1764 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1765 %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1766 %bc = bitcast <16 x i16> %sel to <4 x i64>
1769 declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
1771 define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1772 ; CHECK-LABEL: test_mm256_min_epu32:
1774 ; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
1775 ; CHECK-NEXT: ret{{[l|q]}}
1776 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1777 %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1778 %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1779 %bc = bitcast <8 x i32> %sel to <4 x i64>
1782 declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntdqa (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> *%a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
; _mm256_unpackhi_epi8: interleave high bytes within each 128-bit lane (vpunpckhbw).
define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_unpackhi_epi16: interleave high words within each 128-bit lane (vpunpckhwd).
define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_unpackhi_epi32: interleave high dwords within each 128-bit lane
; (codegen picks the FP-domain vunpckhps form).
define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_unpackhi_epi64: interleave high qwords within each 128-bit lane
; (codegen picks the FP-domain vunpckhpd form).
define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}
; _mm256_unpacklo_epi8: interleave low bytes within each 128-bit lane (vpunpcklbw).
define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_unpacklo_epi16: interleave low words within each 128-bit lane (vpunpcklwd).
define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_unpacklo_epi32: interleave low dwords within each 128-bit lane
; (codegen picks the FP-domain vunpcklps form).
define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
; _mm256_unpacklo_epi64: interleave low qwords within each 128-bit lane
; (codegen picks the FP-domain vunpcklpd form).
define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}
; _mm256_xor_si256: bitwise xor of 256-bit vectors
; (codegen picks the FP-domain vxorps form).
define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
; Intrinsic declarations for AVX packed compares; presumably referenced by
; cmp-intrinsic tests elsewhere in this file (callers not visible in this chunk).
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone