; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF

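; Verify that interleaving shuffles feeding a vector add/sub are matched to the
; SSSE3/AVX horizontal add/sub instructions (phaddw, phaddd, phsubw, phsubd),
; both with and without the fast-hops tuning attribute.
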
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd4:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd5:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd7:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd2:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd3:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

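; The *_reverse tests subtract in the swapped (even minus odd) order, which is
; not matched to a horizontal subtract and must lower through explicit shuffles.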
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm3, %xmm1
; SSSE3-NEXT: pshufb %xmm3, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: psubw %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
; AVX-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT: psubd %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

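; The *_single_source tests build both shuffle operands from the same input
; vector, so the horizontal op should still be formed where profitable.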
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source2:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source2:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd_single_source2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd_single_source2:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  ret <4 x i32> %add
}

define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source2:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddw_single_source2:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SHUF-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pslld $16, %xmm1
; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %add = add <8 x i16> %l, %x
  ret <8 x i16> %add
}

define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

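; PR39936 - a full horizontal add reduction of a <8 x i32> vector down to a
; single i32 result.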
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movd %xmm1, %eax
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: vmovd %xmm0, %eax
; AVX2-SHUF-NEXT: vzeroupper
; AVX2-SHUF-NEXT: retq
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3
  %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = add <8 x i32> %5, %6
  %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = add <8 x i32> %8, %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}