; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SHUF
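
; These tests check that shuffles selecting the even/odd (or otherwise
; pairwise-interleaved) elements of one or two vectors, feeding a vector add
; or sub, are lowered to the SSSE3/AVX horizontal ops phaddw/phaddd and
; phsubw/phsubd, and how the fast-hops and fast-variable-shuffle attributes
; change the preferred lowering.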
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

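; Same pairwise sums as phaddw1, but the pair elements are split irregularly
; between %a and %b and the second shuffle lists %y first, so the combiner has
; to recognize the pattern regardless of operand order.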
define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

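; The phaddd3-phaddd7 tests use a single source vector and leave some shuffle
; lanes undef; the lanes that are actually used still form pairwise sums of
; adjacent elements, so a phaddd of the vector with itself covers them.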
define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd4:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd5:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

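; phaddd6 only uses the low result element, so targets without fast-hops
; prefer a pshufd plus paddd over a horizontal add; with fast-hops the phaddd
; form is kept.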
define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd7:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

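; Horizontal subtract patterns. Subtraction is not commutative, so only the
; even-minus-odd element order can map to phsubw/phsubd.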
define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd2:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd3:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSSE3-SLOW-NEXT: psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SHUF-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

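; The *_reverse tests subtract in odd-minus-even order, which phsubw/phsubd
; cannot produce, so generic shuffles plus psubw/psubd are expected instead.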
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pshufb %xmm3, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm3, %xmm1
; SSSE3-NEXT: pshufb %xmm3, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: psubw %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT: psubd %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

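; The *_single_source tests build pairwise sums from a single input vector,
; with the results needed in only some lanes; the checks show when a
; horizontal add is still selected and when a shuffle plus add is preferred.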
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSSE3-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,4,5,6,7,10,11,8,9,12,13,14,15]
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pslld $16, %xmm1
; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %add = add <8 x i16> %l, %x
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

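; PR39936: a horizontal reduction of <8 x i32> down to a scalar, built from
; repeated pairwise-add shuffles.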
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movd %xmm1, %eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: vmovd %xmm0, %eax
; AVX2-SHUF-NEXT: vzeroupper
; AVX2-SHUF-NEXT: retq
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3
  %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = add <8 x i32> %5, %6
  %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = add <8 x i32> %8, %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}