; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
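
; Each test mirrors the IR clang emits for the matching AVX2 intrinsic, e.g.
; (a sketch, assuming the usual builtins-test shape):
;   __m256i test_mm256_abs_epi8(__m256i a) { return _mm256_abs_epi8(a); }
; The abs tests below express |x| as select(x > 0, x, 0 - x), which fast-isel
; is still expected to fold into a single vpabsb/vpabsw/vpabsd.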
define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %sub = sub <32 x i8> zeroinitializer, %arg
  %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  %res = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %sub = sub <16 x i16> zeroinitializer, %arg
  %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  %res = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %sub = sub <8 x i32> zeroinitializer, %arg
  %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  %res = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
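
; The saturating adds are expressed with the generic llvm.sadd.sat/llvm.uadd.sat
; intrinsics, which should select to vpaddsb/vpaddsw and vpaddusb/vpaddusw.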
define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
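
; _mm256_alignr_epi8 concatenates each pair of 128-bit lanes and extracts a
; byte-aligned window, so it is modelled as one 32-element shufflevector that
; should match to vpalignr.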
define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}
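
; Unsigned average uses the canonical pavg pattern: widen both operands, add
; them plus a rounding bias of 1, logical shift right by 1, and truncate back;
; this should fold to vpavgb/vpavgw.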
define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %zext0 = zext <32 x i8> %arg0 to <32 x i16>
  %zext1 = zext <32 x i8> %arg1 to <32 x i16>
  %add = add <32 x i16> %zext0, %zext1
  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <32 x i16> %lshr to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %zext0 = zext <16 x i16> %arg0 to <16 x i32>
  %zext1 = zext <16 x i16> %arg1 to <16 x i32>
  %add = add <16 x i32> %zext0, %zext1
  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = trunc <16 x i32> %lshr to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
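
; Immediate blends are plain shufflevectors choosing per element between the
; two sources; vpblendw/vblendps are expected here.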
define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
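
; The broadcast tests splat element 0 via a shufflevector with a
; zeroinitializer mask; depending on element type this should lower to
; vpbroadcastb/w, vbroadcastss/sd or vmovddup.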
define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
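
; Integer compares produce an <N x i1> that is sign-extended back to the
; element width, matching the all-ones/all-zeros results of vpcmpeq*/vpcmpgt*.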
define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}
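
; The cvtepi/cvtepu tests sign- or zero-extend the low source elements (using
; a shufflevector to take the low half where needed) and should map onto the
; vpmovsx*/vpmovzx* forms.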
define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}
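
; Horizontal add/sub have no generic IR equivalent, so these tests call the
; llvm.x86.avx2.phadd.*/phsub.* target intrinsics directly.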
define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
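
; The gather tests call the llvm.x86.avx2.gather.* intrinsics directly with a
; base pointer, an index vector, a per-element mask (all-ones gathers every
; element) and a scale immediate (2 in these tests).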
define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}

define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %res
}

define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
  ret <4 x double> %res
}

define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
  ret <8 x float> %call
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly

define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
  ret <8 x float> %call
}

define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %call
}
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %call
}

define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %call
}
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %call
}

define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
  ret <2 x double> %call
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
  ret <2 x double> %call
}

define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}
1468 define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1469 ; CHECK-LABEL: test_mm256_madd_epi16:
1471 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
1472 ; CHECK-NEXT: ret{{[l|q]}}
1473 %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1474 %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1475 %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1476 %bc = bitcast <8 x i32> %res to <4 x i64>
1479 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1481 define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1482 ; CHECK-LABEL: test_mm256_maddubs_epi16:
1484 ; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
1485 ; CHECK-NEXT: ret{{[l|q]}}
1486 %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1487 %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1488 %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1489 %bc = bitcast <16 x i16> %res to <4 x i64>
1492 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
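; The maskload/maskstore intrinsics take the address as a plain i8* and use
; only the sign bit of each mask element, so the IR below just wraps the calls
; in bitcasts.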
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast i32* %a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
%bc = bitcast <4 x i32> %call to <2 x i64>
ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast i32* %a0 to i8*
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
%bc = bitcast <8 x i32> %call to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast i64* %a0 to i8*
%res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast i64* %a0 to i8*
%res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to i8*
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%arg2 = bitcast <4 x i64> %a2 to <8 x i32>
call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast i64* %a0 to i8*
call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast i64* %a0 to i8*
call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
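; The max/min tests express each operation as a generic icmp+select pattern
; rather than a target intrinsic; instruction selection is expected to match
; these back to the vpmax*/vpmin* instructions.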
define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp sgt <32 x i8> %arg0, %arg1
%sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
%bc = bitcast <32 x i8> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp sgt <16 x i16> %arg0, %arg1
%sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
%bc = bitcast <16 x i16> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp sgt <8 x i32> %arg0, %arg1
%sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
%bc = bitcast <8 x i32> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp ugt <32 x i8> %arg0, %arg1
%sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
%bc = bitcast <32 x i8> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp ugt <16 x i16> %arg0, %arg1
%sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
%bc = bitcast <16 x i16> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp ugt <8 x i32> %arg0, %arg1
%sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
%bc = bitcast <8 x i32> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp slt <32 x i8> %arg0, %arg1
%sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
%bc = bitcast <32 x i8> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp slt <16 x i16> %arg0, %arg1
%sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
%bc = bitcast <16 x i16> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp slt <8 x i32> %arg0, %arg1
%sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
%bc = bitcast <8 x i32> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp ult <32 x i8> %arg0, %arg1
%sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
%bc = bitcast <32 x i8> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp ult <16 x i16> %arg0, %arg1
%sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
%bc = bitcast <16 x i16> %sel to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp ult <8 x i32> %arg0, %arg1
%sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
%bc = bitcast <8 x i32> %sel to <4 x i64>
ret <4 x i64> %bc
}

define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovmskb %ymm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
%bc = bitcast <16 x i16> %call to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
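; The widening 32x32->64 multiplies are likewise built from generic IR: shift
; pairs recover the sign-extended halves for vpmuldq, and an AND mask isolates
; the zero-extended halves for vpmuludq. The pmul.dq/pmulu.dq declarations are
; retained but uncalled.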
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
%A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
%B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
%B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
%res = mul nsw <4 x i64> %A1, %B1
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%res = mul nuw <4 x i64> %A, %B
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = mul <16 x i16> %arg0, %arg1
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = mul <8 x i32> %arg0, %arg1
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = or <4 x i64> %a0, %a1
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
%res = bitcast <32 x i8> %call to <4 x i64>
ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
%res = bitcast <16 x i16> %call to <4 x i64>
ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
%res = bitcast <32 x i8> %call to <4 x i64>
ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
%res = bitcast <16 x i16> %call to <4 x i64>
ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
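; The immediate-controlled permutes and shuffles below are expressed as
; shufflevectors with constant masks; the vperm2i128 declaration is retained
; but uncalled.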
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
%res = bitcast <8 x i32> %call to <4 x i64>
ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
%res = bitcast <8 x i32> %shuf to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
%res = bitcast <32 x i8> %shuf to <4 x i64>
ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
%res = bitcast <16 x i16> %shuf to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
%res = bitcast <16 x i16> %shuf to <4 x i64>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
%res = bitcast <32 x i8> %call to <4 x i64>
ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
%res = bitcast <16 x i16> %call to <4 x i64>
ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
%res = bitcast <8 x i32> %call to <4 x i64>
ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
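; The full-register byte shifts are expressed as shufflevectors that pull the
; vacated bytes from a zero vector, mirroring how vpslldq/vpsrldq operate on
; each 128-bit lane independently.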
define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
%res = bitcast <32 x i8> %shuf to <4 x i64>
ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
%bc = bitcast <4 x i32> %res to <2 x i64>
ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
%bc = bitcast <4 x i32> %res to <2 x i64>
ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
%res = bitcast <32 x i8> %shuf to <4 x i64>
ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
%bc = bitcast <4 x i32> %res to <2 x i64>
ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
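; test_mm256_stream_load_si256 maps directly onto the movntdqa intrinsic,
; which also takes its address operand as a plain i8*.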
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovntdqa (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64: # %bb.0:
; X64-NEXT: vmovntdqa (%rdi), %ymm0
; X64-NEXT: retq
%arg0 = bitcast <4 x i64> *%a0 to i8*
%res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = sub <32 x i8> %arg0, %arg1
%bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = sub <16 x i16> %arg0, %arg1
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = sub <8 x i32> %arg0, %arg1
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = sub <4 x i64> %a0, %a1
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
%bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
%bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
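; The 256-bit unpack operations interleave the high and low halves of each
; 128-bit lane independently, as the shufflevector masks below show.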
define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
%bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
%bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = xor <4 x i64> %a0, %a1
ret <4 x i64> %res
}

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone