; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX512
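; The fast-hops attribute models subtargets where horizontal ops are cheap; the
; SLOW/FAST check-prefix pairs split coverage accordingly. The AVX2 and
; AVX512VL runs share a single prefix each because their output is the same
; with and without fast-hops.
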
; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
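;
; In test14_undef below, only result lanes 0 and 3 are defined, and both sums
; come from the low 128-bit halves of %a and %b, so a single xmm phaddd covers
; them: phaddd %xmm2, %xmm0 computes
; [xmm0[0]+xmm0[1], xmm0[2]+xmm0[3], xmm2[0]+xmm2[1], xmm2[2]+xmm2[3]].
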
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; Verify that we use packed integer horizontal adds instead of two scalar adds
; followed by vector inserts whenever that is profitable.
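;
; The %b sum here uses elements 4 and 5, which sit in the high 128-bit half of
; the second ymm argument, so AVX1, whose vphaddd is xmm-only, must extract
; that half first, while AVX2 and AVX512VL can use a single 256-bit vphaddd.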
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-SLOW-LABEL: test15_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movd %xmm0, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-SLOW-NEXT:    movd %xmm0, %ecx
; SSE-SLOW-NEXT:    addl %eax, %ecx
; SSE-SLOW-NEXT:    movd %xmm3, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE-SLOW-NEXT:    movd %xmm0, %edx
; SSE-SLOW-NEXT:    addl %eax, %edx
; SSE-SLOW-NEXT:    movd %ecx, %xmm0
; SSE-SLOW-NEXT:    movd %edx, %xmm1
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test15_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movdqa %xmm3, %xmm1
; SSE-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSE-FAST-NEXT:    phaddd %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: test15_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-SLOW-NEXT:    addl %eax, %ecx
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %edx
; AVX1-SLOW-NEXT:    addl %eax, %edx
; AVX1-SLOW-NEXT:    vmovd %ecx, %xmm0
; AVX1-SLOW-NEXT:    vmovd %edx, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test15_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test15_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test15_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}

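; PR40243_alt: both sums land in the high half of the result (lanes 4 and 7),
; so under SSE the whole computation happens in the high registers of each
; argument pair (xmm1 for %a, xmm3 for %b) and xmm0 is free to stay undefined.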
define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: PR40243_alt:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR40243_alt:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR40243_alt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR40243_alt:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %add4 = add i32 %a4, %a5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %add7 = add i32 %b6, %b7
  %r4 = insertelement <8 x i32> undef, i32 %add4, i32 4
  %r = insertelement <8 x i32> %r4, i32 %add7, i32 7
  ret <8 x i32> %r
}

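; test16_undef needs only result lanes 0 and 1, both pairwise sums from the
; low half of %a, so a single phaddd of that register with itself is enough
; and the rest of the return value stays undefined.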
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}

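; The same pattern widened to v16i32: the two defined lanes still come from
; the lowest 128 bits, so the generated code is identical to the v8i32 case
; above.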
define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_v16i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}

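; test17_undef defines result lanes 0 through 3 from all eight elements of %a,
; so the AVX targets extract the high half and combine the two halves with a
; single xmm vphaddd, leaving the upper half of the ymm result undefined.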
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}

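; Again the v16i32 widening of the previous test; the defined lanes all fit in
; one xmm register, so the assembly matches the v8i32 version.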
define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}