; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
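
; Basic AVX2 integer arithmetic: plain 256-bit add/sub/mul should select a
; single VEX instruction, while the byte/qword multiplies and the
; multiply-by-constant tests below exercise the more involved lowerings.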
define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: test_vpaddq:
; X32:       # %bb.0:
; X32-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpaddq:
; X64:       # %bb.0:
; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = add <4 x i64> %i, %j
  ret <4 x i64> %x
}

define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: test_vpaddd:
; X32:       # %bb.0:
; X32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpaddd:
; X64:       # %bb.0:
; X64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = add <8 x i32> %i, %j
  ret <8 x i32> %x
}

define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: test_vpaddw:
; X32:       # %bb.0:
; X32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpaddw:
; X64:       # %bb.0:
; X64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = add <16 x i16> %i, %j
  ret <16 x i16> %x
}

define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: test_vpaddb:
; X32:       # %bb.0:
; X32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpaddb:
; X64:       # %bb.0:
; X64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = add <32 x i8> %i, %j
  ret <32 x i8> %x
}

define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: test_vpsubq:
; X32:       # %bb.0:
; X32-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpsubq:
; X64:       # %bb.0:
; X64-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = sub <4 x i64> %i, %j
  ret <4 x i64> %x
}

define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: test_vpsubd:
; X32:       # %bb.0:
; X32-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpsubd:
; X64:       # %bb.0:
; X64-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = sub <8 x i32> %i, %j
  ret <8 x i32> %x
}

define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: test_vpsubw:
; X32:       # %bb.0:
; X32-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpsubw:
; X64:       # %bb.0:
; X64-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = sub <16 x i16> %i, %j
  ret <16 x i16> %x
}

define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: test_vpsubb:
; X32:       # %bb.0:
; X32-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpsubb:
; X64:       # %bb.0:
; X64-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = sub <32 x i8> %i, %j
  ret <32 x i8> %x
}

define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: test_vpmulld:
; X32:       # %bb.0:
; X32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpmulld:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = mul <8 x i32> %i, %j
  ret <8 x i32> %x
}

define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: test_vpmullw:
; X32:       # %bb.0:
; X32-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpmullw:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = mul <16 x i16> %i, %j
  ret <16 x i16> %x
}
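
; AVX2 has no byte multiply, so v16i8 is widened: both operands are
; zero-extended to v16i16 (vpmovzxbw), multiplied with vpmullw, the low byte
; of each word is masked off, and the halves are packed back with vpackuswb.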
define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
; X32-LABEL: mul_v16i8:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; X32-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X32-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X32-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: mul_v16i8:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; X64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %x = mul <16 x i8> %i, %j
  ret <16 x i8> %x
}
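
; For v32i8 the same trick is applied in-register: high and low bytes are
; unpacked to words within each 128-bit lane, multiplied with vpmullw, masked
; to the low byte, and recombined per lane with vpackuswb.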
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: mul_v32i8:
; X32:       # %bb.0:
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; X32-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpand %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_v32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; X64-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; X64-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpand %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = mul <32 x i8> %i, %j
  ret <32 x i8> %x
}
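
; There is no packed 64-bit multiply before AVX-512DQ, so v4i64 is built from
; 32x32->64 partial products (vpmuludq): lo*lo plus the two cross terms
; shifted up by 32.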
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: mul_v4i64:
; X32:       # %bb.0:
; X32-NEXT:    vpsrlq $32, %ymm0, %ymm2
; X32-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; X32-NEXT:    vpsrlq $32, %ymm1, %ymm3
; X32-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; X32-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X32-NEXT:    vpsllq $32, %ymm2, %ymm2
; X32-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_v4i64:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlq $32, %ymm0, %ymm2
; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; X64-NEXT:    vpsrlq $32, %ymm1, %ymm3
; X64-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %x = mul <4 x i64> %i, %j
  ret <4 x i64> %x
}
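
; Multiply by a splat of 2 is strength-reduced to an add of the value with
; itself.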
define <8 x i32> @mul_const1(<8 x i32> %x) {
; X32-LABEL: mul_const1:
; X32:       # %bb.0:
; X32-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const1:
; X64:       # %bb.0:
; X64-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %y = mul <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %y
}
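
; Multiply by a splat of 4 becomes a shift left by 2.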
define <4 x i64> @mul_const2(<4 x i64> %x) {
; X32-LABEL: mul_const2:
; X32:       # %bb.0:
; X32-NEXT:    vpsllq $2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const2:
; X64:       # %bb.0:
; X64-NEXT:    vpsllq $2, %ymm0, %ymm0
; X64-NEXT:    retq
  %y = mul <4 x i64> %x, <i64 4, i64 4, i64 4, i64 4>
  ret <4 x i64> %y
}
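
; Multiply by a splat of 8 becomes a shift left by 3.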
define <16 x i16> @mul_const3(<16 x i16> %x) {
; X32-LABEL: mul_const3:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const3:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $3, %ymm0, %ymm0
; X64-NEXT:    retq
  %y = mul <16 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <16 x i16> %y
}
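
; Multiply by a splat of -1 is negation: subtract from zero.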
define <4 x i64> @mul_const4(<4 x i64> %x) {
; X32-LABEL: mul_const4:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const4:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %y = mul <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1>
  ret <4 x i64> %y
}
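
; Multiply by zero folds away entirely; only a zeroing idiom remains.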
define <8 x i32> @mul_const5(<8 x i32> %x) {
; X32-LABEL: mul_const5:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const5:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
  %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %y
}
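
; A non-uniform constant that is not expressible as a single shift stays a
; vpmulld against a constant-pool operand.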
define <8 x i32> @mul_const6(<8 x i32> %x) {
; X32-LABEL: mul_const6:
; X32:       # %bb.0:
; X32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const6:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
  ret <8 x i32> %y
}
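
; v8i64 does not fit in one ymm register, so legalization splits it across
; ymm0/ymm1 and each half gets the multiply-by-2 -> add treatment.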
define <8 x i64> @mul_const7(<8 x i64> %x) {
; X32-LABEL: mul_const7:
; X32:       # %bb.0:
; X32-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; X32-NEXT:    retl
;
; X64-LABEL: mul_const7:
; X64:       # %bb.0:
; X64-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; X64-NEXT:    retq
  %y = mul <8 x i64> %x, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  ret <8 x i64> %y
}
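
; Same shift strength reduction for a 128-bit vector.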
define <8 x i16> @mul_const8(<8 x i16> %x) {
; X32-LABEL: mul_const8:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $3, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $3, %xmm0, %xmm0
; X64-NEXT:    retq
  %y = mul <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %y
}
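
; Only the first lane is nonzero, so the constant loads as a single xmm value
; (the upper ymm bits are implicitly zeroed) feeding vpmulld.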
define <8 x i32> @mul_const9(<8 x i32> %x) {
; X32-LABEL: mul_const9:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
; X32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const9:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %y
}
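
; A 0x01010101 splat (16843009) is not profitable as shifts and adds, so it is
; broadcast once (vpbroadcastd) and multiplied with vpmulld.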
; %x * 0x01010101
define <4 x i32> @mul_const10(<4 x i32> %x) {
; X32-LABEL: mul_const10:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009]
; X32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const10:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009]
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
  ret <4 x i32> %m
}
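
; Same with the sign bit set in every byte (0x80808080); the low 32 bits of
; the product do not depend on signedness, so vpmulld still applies.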
; %x * 0x80808080
define <4 x i32> @mul_const11(<4 x i32> %x) {
; X32-LABEL: mul_const11:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152]
; X32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: mul_const11:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152]
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152>
  ret <4 x i32> %m
}