1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
8 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
9 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
; phaddw1: %a gathers the even-indexed and %b the odd-indexed i16 lanes of
; %x followed by %y; the two are added lane-wise.
; The SSSE3 and AVX checks expect this to become one phaddw / vphaddw.
11 define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
12 ; SSSE3-LABEL: phaddw1:
14 ; SSSE3-NEXT: phaddw %xmm1, %xmm0
19 ; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
21 %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
22 %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
23 %r = add <8 x i16> %a, %b
; phaddw2: variant of phaddw1 where each shuffle mixes even and odd lanes.
; %a picks lanes 1,2,5,6,... and %b (operands given as %y, %x) picks the
; other lane of each adjacent pair, so every result lane is still the sum of
; one adjacent pair. The checks expect a single phaddw / vphaddw.
27 define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
28 ; SSSE3-LABEL: phaddw2:
30 ; SSSE3-NEXT: phaddw %xmm1, %xmm0
35 ; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
37 %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
38 %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
39 %r = add <8 x i16> %a, %b
; phaddd1: i32 counterpart of phaddw1. Even-indexed lanes (%a) plus
; odd-indexed lanes (%b) of %x followed by %y.
; The checks expect a single phaddd / vphaddd.
43 define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
44 ; SSSE3-LABEL: phaddd1:
46 ; SSSE3-NEXT: phaddd %xmm1, %xmm0
51 ; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
53 %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
54 %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
55 %r = add <4 x i32> %a, %b
; phaddd2: i32 counterpart of phaddw2. %a takes lanes 1,2,5,6 of %x,%y and
; %b (operands given as %y, %x) takes the complementary lane of each pair,
; so each result lane is the sum of one adjacent pair.
; The checks expect a single phaddd / vphaddd.
59 define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
60 ; SSSE3-LABEL: phaddd2:
62 ; SSSE3-NEXT: phaddd %xmm1, %xmm0
67 ; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
69 %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
70 %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
71 %r = add <4 x i32> %a, %b
; phaddd3: single-source form with an undef second shuffle operand.
; Lane 0 of both masks is undef and mask indices 4..7 select from the undef
; operand, so only lane 1 (x[2] + x[3]) is a defined sum.
; The checks expect phaddd / vphaddd of the register with itself.
75 define <4 x i32> @phaddd3(<4 x i32> %x) {
76 ; SSSE3-LABEL: phaddd3:
78 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
83 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
85 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
86 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
87 %r = add <4 x i32> %a, %b
; phaddd4: single source; lanes 0 and 1 hold x[0]+x[1] and x[2]+x[3], the
; upper two lanes are undef.
; The checks expect phaddd / vphaddd of the register with itself.
91 define <4 x i32> @phaddd4(<4 x i32> %x) {
92 ; SSSE3-LABEL: phaddd4:
94 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
99 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
101 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
102 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
103 %r = add <4 x i32> %a, %b
; phaddd5: like phaddd4, but the second pair is commuted between the two
; shuffles (lane 1 is x[3] + x[2]). Addition is commutative, and the checks
; still expect phaddd / vphaddd.
107 define <4 x i32> @phaddd5(<4 x i32> %x) {
108 ; SSSE3-LABEL: phaddd5:
110 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
113 ; AVX-LABEL: phaddd5:
115 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
117 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
118 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
119 %r = add <4 x i32> %a, %b
; phaddd6: only lane 0 (x[0] + x[1]) is defined; all other lanes are undef.
; The expected code depends on the run configuration: the fast-hops runs
; expect phaddd / vphaddd, while the slow-hops runs and the AVX2-SHUF runs
; expect a pshufd splat of element 1 followed by a plain vector add.
123 define <4 x i32> @phaddd6(<4 x i32> %x) {
124 ; SSSE3-SLOW-LABEL: phaddd6:
125 ; SSSE3-SLOW: # %bb.0:
126 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
127 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
128 ; SSSE3-SLOW-NEXT: retq
130 ; SSSE3-FAST-LABEL: phaddd6:
131 ; SSSE3-FAST: # %bb.0:
132 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
133 ; SSSE3-FAST-NEXT: retq
135 ; AVX-SLOW-LABEL: phaddd6:
137 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
138 ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
139 ; AVX-SLOW-NEXT: retq
141 ; AVX-FAST-LABEL: phaddd6:
143 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
144 ; AVX-FAST-NEXT: retq
146 ; AVX2-SHUF-LABEL: phaddd6:
147 ; AVX2-SHUF: # %bb.0:
148 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
149 ; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
150 ; AVX2-SHUF-NEXT: retq
151 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
152 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
153 %r = add <4 x i32> %a, %b
; phaddd7: only lane 1 is defined, as x[3] + x[2] (pair commuted).
; Unlike phaddd6, the common SSSE3 and AVX prefixes are used here, i.e. every
; run configuration is expected to emit phaddd / vphaddd.
157 define <4 x i32> @phaddd7(<4 x i32> %x) {
158 ; SSSE3-LABEL: phaddd7:
160 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
163 ; AVX-LABEL: phaddd7:
165 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
167 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
168 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
169 %r = add <4 x i32> %a, %b
; phsubw1: even-indexed i16 lanes of %x,%y (%a) minus the odd-indexed
; lanes (%b). The checks expect a single phsubw / vphsubw.
173 define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
174 ; SSSE3-LABEL: phsubw1:
176 ; SSSE3-NEXT: phsubw %xmm1, %xmm0
179 ; AVX-LABEL: phsubw1:
181 ; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
183 %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
184 %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
185 %r = sub <8 x i16> %a, %b
; phsubd1: i32 counterpart of phsubw1 (even lanes minus odd lanes).
; The checks expect a single phsubd / vphsubd.
189 define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
190 ; SSSE3-LABEL: phsubd1:
192 ; SSSE3-NEXT: phsubd %xmm1, %xmm0
195 ; AVX-LABEL: phsubd1:
197 ; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
199 %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
200 %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
201 %r = sub <4 x i32> %a, %b
; phsubd2: subtraction counterpart of phaddd3. Single source with an undef
; second operand; only lane 1 (x[2] - x[3]) selects defined elements.
; The checks expect phsubd / vphsubd of the register with itself.
205 define <4 x i32> @phsubd2(<4 x i32> %x) {
206 ; SSSE3-LABEL: phsubd2:
208 ; SSSE3-NEXT: phsubd %xmm0, %xmm0
211 ; AVX-LABEL: phsubd2:
213 ; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
215 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
216 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
217 %r = sub <4 x i32> %a, %b
; phsubd3: single source; lanes 0 and 1 hold x[0]-x[1] and x[2]-x[3], the
; upper two lanes are undef.
; The checks expect phsubd / vphsubd of the register with itself.
221 define <4 x i32> @phsubd3(<4 x i32> %x) {
222 ; SSSE3-LABEL: phsubd3:
224 ; SSSE3-NEXT: phsubd %xmm0, %xmm0
227 ; AVX-LABEL: phsubd3:
229 ; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
231 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
232 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
233 %r = sub <4 x i32> %a, %b
; phsubd4: only lane 0 (x[0] - x[1]) is defined.
; The fast-hops runs expect phsubd / vphsubd; the slow-hops runs and the
; AVX2-SHUF runs expect a pshufd splat of element 1 followed by psubd.
237 define <4 x i32> @phsubd4(<4 x i32> %x) {
238 ; SSSE3-SLOW-LABEL: phsubd4:
239 ; SSSE3-SLOW: # %bb.0:
240 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
241 ; SSSE3-SLOW-NEXT: psubd %xmm1, %xmm0
242 ; SSSE3-SLOW-NEXT: retq
244 ; SSSE3-FAST-LABEL: phsubd4:
245 ; SSSE3-FAST: # %bb.0:
246 ; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
247 ; SSSE3-FAST-NEXT: retq
249 ; AVX-SLOW-LABEL: phsubd4:
251 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
252 ; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
253 ; AVX-SLOW-NEXT: retq
255 ; AVX-FAST-LABEL: phsubd4:
257 ; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
258 ; AVX-FAST-NEXT: retq
260 ; AVX2-SHUF-LABEL: phsubd4:
261 ; AVX2-SHUF: # %bb.0:
262 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
263 ; AVX2-SHUF-NEXT: vpsubd %xmm1, %xmm0, %xmm0
264 ; AVX2-SHUF-NEXT: retq
265 %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
266 %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
267 %r = sub <4 x i32> %a, %b
; phsubw1_reverse: negative test. Here the odd-indexed lanes (%a) are the
; minuend and the even-indexed lanes (%b) the subtrahend, i.e. the operand
; order is the reverse of phsubw1. The checks expect no horizontal subtract:
; the two lane sets are materialized with separate shuffles and combined
; with a plain psubw / vpsubw.
271 define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
272 ; SSSE3-LABEL: phsubw1_reverse:
274 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
275 ; SSSE3-NEXT: movdqa %xmm1, %xmm4
276 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
277 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
278 ; SSSE3-NEXT: pshufb %xmm3, %xmm2
279 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
280 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
281 ; SSSE3-NEXT: pshufb %xmm3, %xmm1
282 ; SSSE3-NEXT: pshufb %xmm3, %xmm0
283 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
284 ; SSSE3-NEXT: psubw %xmm0, %xmm2
285 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
288 ; AVX-LABEL: phsubw1_reverse:
290 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
291 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
292 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
293 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
294 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
295 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
296 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
297 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
298 ; AVX-NEXT: vpsubw %xmm0, %xmm2, %xmm0
300 %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
301 %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
302 %r = sub <8 x i16> %a, %b
; phsubd1_reverse: i32 counterpart of phsubw1_reverse (odd lanes minus even
; lanes). The checks expect two shufps / vshufps that separate the odd and
; even elements, followed by a plain psubd / vpsubd, and no phsubd.
306 define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
307 ; SSSE3-LABEL: phsubd1_reverse:
309 ; SSSE3-NEXT: movaps %xmm0, %xmm2
310 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
311 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
312 ; SSSE3-NEXT: psubd %xmm0, %xmm2
313 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
316 ; AVX-LABEL: phsubd1_reverse:
318 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
319 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
320 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
322 %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
323 %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
324 %r = sub <4 x i32> %a, %b
; phaddd_single_source1: single source, with the pair sums x[0]+x[1] and
; x[2]+x[3] placed in the upper two lanes (2 and 3); lanes 0 and 1 are undef.
; The checks expect phaddd / vphaddd of the register with itself.
328 define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
329 ; SSSE3-LABEL: phaddd_single_source1:
331 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
334 ; AVX-LABEL: phaddd_single_source1:
336 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
338 %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
339 %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
340 %add = add <4 x i32> %l, %r
; phaddd_single_source2: same sums as phaddd_single_source1, followed by a
; shuffle that moves result lanes 3 and 2 (in that order) into lanes 0 and 1.
; The fast-hops runs expect phaddd plus one pshufd to swap the two sums; the
; slow-hops runs and the AVX2-SHUF runs expect two pshufd and a plain paddd.
344 define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
345 ; SSSE3-SLOW-LABEL: phaddd_single_source2:
346 ; SSSE3-SLOW: # %bb.0:
347 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
348 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
349 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
350 ; SSSE3-SLOW-NEXT: retq
352 ; SSSE3-FAST-LABEL: phaddd_single_source2:
353 ; SSSE3-FAST: # %bb.0:
354 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
355 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
356 ; SSSE3-FAST-NEXT: retq
358 ; AVX-SLOW-LABEL: phaddd_single_source2:
360 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
361 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
362 ; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
363 ; AVX-SLOW-NEXT: retq
365 ; AVX-FAST-LABEL: phaddd_single_source2:
367 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
368 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
369 ; AVX-FAST-NEXT: retq
371 ; AVX2-SHUF-LABEL: phaddd_single_source2:
372 ; AVX2-SHUF: # %bb.0:
373 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
374 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
375 ; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
376 ; AVX2-SHUF-NEXT: retq
377 %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
378 %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
379 %add = add <4 x i32> %l, %r
380 %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
381 ret <4 x i32> %shuffle2
; phaddd_single_source3: only lane 2 is defined, as x[0] + x[1].
; The common SSSE3 and AVX prefixes are used, so every run configuration is
; expected to emit phaddd / vphaddd.
384 define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
385 ; SSSE3-LABEL: phaddd_single_source3:
387 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
390 ; AVX-LABEL: phaddd_single_source3:
392 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
394 %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
395 %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
396 %add = add <4 x i32> %l, %r
; phaddd_single_source4: only one shuffle is used. %l carries x[2] in lane 3
; and is added to the unshuffled %x, so lane 3 of the result is x[2] + x[3].
; The fast-hops runs expect phaddd / vphaddd; the slow-hops runs and the
; AVX2-SHUF runs expect a pshufd splat of element 2 followed by paddd.
400 define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
401 ; SSSE3-SLOW-LABEL: phaddd_single_source4:
402 ; SSSE3-SLOW: # %bb.0:
403 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
404 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
405 ; SSSE3-SLOW-NEXT: retq
407 ; SSSE3-FAST-LABEL: phaddd_single_source4:
408 ; SSSE3-FAST: # %bb.0:
409 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
410 ; SSSE3-FAST-NEXT: retq
412 ; AVX-SLOW-LABEL: phaddd_single_source4:
414 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
415 ; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
416 ; AVX-SLOW-NEXT: retq
418 ; AVX-FAST-LABEL: phaddd_single_source4:
420 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
421 ; AVX-FAST-NEXT: retq
423 ; AVX2-SHUF-LABEL: phaddd_single_source4:
424 ; AVX2-SHUF: # %bb.0:
425 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
426 ; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
427 ; AVX2-SHUF-NEXT: retq
428 %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
429 %add = add <4 x i32> %l, %x
; phaddd_single_source5: same add as phaddd_single_source4 (lane 3 is
; x[2] + x[3]), followed by a shuffle that moves lane 3 of the sum to lane 0.
; The fast-hops runs expect phaddd plus a pshufd; the slow-hops runs and the
; AVX2-SHUF runs expect two pshufd splats (elements 2 and 3) and a paddd.
433 define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
434 ; SSSE3-SLOW-LABEL: phaddd_single_source5:
435 ; SSSE3-SLOW: # %bb.0:
436 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
437 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
438 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
439 ; SSSE3-SLOW-NEXT: retq
441 ; SSSE3-FAST-LABEL: phaddd_single_source5:
442 ; SSSE3-FAST: # %bb.0:
443 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
444 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
445 ; SSSE3-FAST-NEXT: retq
447 ; AVX-SLOW-LABEL: phaddd_single_source5:
449 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
450 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
451 ; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
452 ; AVX-SLOW-NEXT: retq
454 ; AVX-FAST-LABEL: phaddd_single_source5:
456 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
457 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
458 ; AVX-FAST-NEXT: retq
460 ; AVX2-SHUF-LABEL: phaddd_single_source5:
461 ; AVX2-SHUF: # %bb.0:
462 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
463 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
464 ; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
465 ; AVX2-SHUF-NEXT: retq
466 %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
467 %add = add <4 x i32> %l, %x
468 %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
469 ret <4 x i32> %shuffle2
; phaddd_single_source6: lane 2 of %add is x[0] + x[1]; the trailing shuffle
; moves that lane to lane 1 of the result.
; Every run configuration is expected to emit phaddd / vphaddd followed by a
; pshufd / vpshufd.
472 define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
473 ; SSSE3-LABEL: phaddd_single_source6:
475 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
476 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
479 ; AVX-LABEL: phaddd_single_source6:
481 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
482 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
484 %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
485 %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
486 %add = add <4 x i32> %l, %r
487 %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
488 ret <4 x i32> %shuffle2
; phaddw_single_source1: i16 counterpart of phaddd_single_source1. The four
; adjacent-pair sums of %x are placed in lanes 4..7; lanes 0..3 are undef.
; The checks expect phaddw / vphaddw of the register with itself.
491 define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
492 ; SSSE3-LABEL: phaddw_single_source1:
494 ; SSSE3-NEXT: phaddw %xmm0, %xmm0
497 ; AVX-LABEL: phaddw_single_source1:
499 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
501 %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
502 %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
503 %add = add <8 x i16> %l, %r
; phaddw_single_source2: same sums as phaddw_single_source1, followed by a
; shuffle that places result lanes 5,4,3,2 into lanes 0..3 (lanes 3 and 2 of
; %add are themselves undef).
; The fast-hops runs expect phaddw plus one pshuflw; the slow-hops runs and
; the AVX2-SHUF runs expect two pshuflw and a plain paddw.
507 define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
508 ; SSSE3-SLOW-LABEL: phaddw_single_source2:
509 ; SSSE3-SLOW: # %bb.0:
510 ; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
511 ; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
512 ; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
513 ; SSSE3-SLOW-NEXT: retq
515 ; SSSE3-FAST-LABEL: phaddw_single_source2:
516 ; SSSE3-FAST: # %bb.0:
517 ; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
518 ; SSSE3-FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
519 ; SSSE3-FAST-NEXT: retq
521 ; AVX-SLOW-LABEL: phaddw_single_source2:
523 ; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
524 ; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
525 ; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
526 ; AVX-SLOW-NEXT: retq
528 ; AVX-FAST-LABEL: phaddw_single_source2:
530 ; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
531 ; AVX-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
532 ; AVX-FAST-NEXT: retq
534 ; AVX2-SHUF-LABEL: phaddw_single_source2:
535 ; AVX2-SHUF: # %bb.0:
536 ; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
537 ; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
538 ; AVX2-SHUF-NEXT: vpaddw %xmm1, %xmm0, %xmm0
539 ; AVX2-SHUF-NEXT: retq
540 %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
541 %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
542 %add = add <8 x i16> %l, %r
543 %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
544 ret <8 x i16> %shuffle2
; phaddw_single_source3: only lanes 4 and 5 are defined, as x[0]+x[1] and
; x[2]+x[3]. Every run configuration is expected to emit phaddw / vphaddw.
547 define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
548 ; SSSE3-LABEL: phaddw_single_source3:
550 ; SSSE3-NEXT: phaddw %xmm0, %xmm0
553 ; AVX-LABEL: phaddw_single_source3:
555 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
557 %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
558 %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
559 %add = add <8 x i16> %l, %r
; phaddw_single_source4: i16 counterpart of phaddd_single_source4. %l carries
; x[6] in lane 7 and is added to the unshuffled %x, so lane 7 of the result
; is x[6] + x[7].
; The fast-hops runs expect phaddw / vphaddw; the slow-hops runs and the
; AVX2-SHUF runs expect a 16-bit left shift of each dword (pslld) followed by
; a plain paddw.
563 define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
564 ; SSSE3-SLOW-LABEL: phaddw_single_source4:
565 ; SSSE3-SLOW: # %bb.0:
566 ; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
567 ; SSSE3-SLOW-NEXT: pslld $16, %xmm1
568 ; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
569 ; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
570 ; SSSE3-SLOW-NEXT: retq
572 ; SSSE3-FAST-LABEL: phaddw_single_source4:
573 ; SSSE3-FAST: # %bb.0:
574 ; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
575 ; SSSE3-FAST-NEXT: retq
577 ; AVX-SLOW-LABEL: phaddw_single_source4:
579 ; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1
580 ; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
581 ; AVX-SLOW-NEXT: retq
583 ; AVX-FAST-LABEL: phaddw_single_source4:
585 ; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
586 ; AVX-FAST-NEXT: retq
588 ; AVX2-SHUF-LABEL: phaddw_single_source4:
589 ; AVX2-SHUF: # %bb.0:
590 ; AVX2-SHUF-NEXT: vpslld $16, %xmm0, %xmm1
591 ; AVX2-SHUF-NEXT: vpaddw %xmm0, %xmm1, %xmm0
592 ; AVX2-SHUF-NEXT: retq
593 %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
594 %add = add <8 x i16> %l, %x
; phaddw_single_source6: lane 4 of %add is x[0] + x[1]; the trailing shuffle
; moves that lane to lane 1 of the result.
; Every run configuration is expected to emit phaddw / vphaddw followed by a
; byte shift right (psrldq / vpsrldq) that repositions the sum.
598 define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
599 ; SSSE3-LABEL: phaddw_single_source6:
601 ; SSSE3-NEXT: phaddw %xmm0, %xmm0
602 ; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
605 ; AVX-LABEL: phaddw_single_source6:
607 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
608 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
610 %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
611 %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
612 %add = add <8 x i16> %l, %r
613 %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
614 ret <8 x i16> %shuffle2
; PR39936_v8i32: horizontal reduction of an <8 x i32> argument down to the
; scalar in lane 0 (presumably a regression test for bug PR39936, judging by
; the name - confirm against the bug tracker).
; The IR does two even/odd shuffle+add rounds (%2..%4, then %5..%7) and a
; final add of lane 1 into lane 0 (%8, %9) before extracting lane 0.
; All runs expect the first two rounds as phaddd; the last step is expected
; as a third phaddd on the fast-hops runs and as pshufd + paddd on the
; slow-hops runs and the AVX2-SHUF runs. The AVX runs first extract the upper
; 128-bit half of the ymm argument.
618 define i32 @PR39936_v8i32(<8 x i32>) {
619 ; SSSE3-SLOW-LABEL: PR39936_v8i32:
620 ; SSSE3-SLOW: # %bb.0:
621 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
622 ; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
623 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
624 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
625 ; SSSE3-SLOW-NEXT: movd %xmm1, %eax
626 ; SSSE3-SLOW-NEXT: retq
628 ; SSSE3-FAST-LABEL: PR39936_v8i32:
629 ; SSSE3-FAST: # %bb.0:
630 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
631 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
632 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
633 ; SSSE3-FAST-NEXT: movd %xmm0, %eax
634 ; SSSE3-FAST-NEXT: retq
636 ; AVX1-SLOW-LABEL: PR39936_v8i32:
637 ; AVX1-SLOW: # %bb.0:
638 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
639 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
640 ; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
641 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
642 ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
643 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
644 ; AVX1-SLOW-NEXT: vzeroupper
645 ; AVX1-SLOW-NEXT: retq
647 ; AVX1-FAST-LABEL: PR39936_v8i32:
648 ; AVX1-FAST: # %bb.0:
649 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
650 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
651 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
652 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
653 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
654 ; AVX1-FAST-NEXT: vzeroupper
655 ; AVX1-FAST-NEXT: retq
657 ; AVX2-SLOW-LABEL: PR39936_v8i32:
658 ; AVX2-SLOW: # %bb.0:
659 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
660 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
661 ; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
662 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
663 ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
664 ; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
665 ; AVX2-SLOW-NEXT: vzeroupper
666 ; AVX2-SLOW-NEXT: retq
668 ; AVX2-FAST-LABEL: PR39936_v8i32:
669 ; AVX2-FAST: # %bb.0:
670 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
671 ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
672 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
673 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
674 ; AVX2-FAST-NEXT: vmovd %xmm0, %eax
675 ; AVX2-FAST-NEXT: vzeroupper
676 ; AVX2-FAST-NEXT: retq
678 ; AVX2-SHUF-LABEL: PR39936_v8i32:
679 ; AVX2-SHUF: # %bb.0:
680 ; AVX2-SHUF-NEXT: vextracti128 $1, %ymm0, %xmm1
681 ; AVX2-SHUF-NEXT: vphaddd %xmm1, %xmm0, %xmm0
682 ; AVX2-SHUF-NEXT: vphaddd %xmm0, %xmm0, %xmm0
683 ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
684 ; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
685 ; AVX2-SHUF-NEXT: vmovd %xmm0, %eax
686 ; AVX2-SHUF-NEXT: vzeroupper
687 ; AVX2-SHUF-NEXT: retq
688 %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
689 %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
690 %4 = add <8 x i32> %2, %3
691 %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
692 %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
693 %7 = add <8 x i32> %5, %6
694 %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
695 %9 = add <8 x i32> %8, %7
696 %10 = extractelement <8 x i32> %9, i32 0