; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
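
; Each target runs twice: with default tuning (the -SLOW prefixes) and with
; the fast-hops attribute (the -FAST prefixes), which marks horizontal ops as
; cheap. Where the two differ (e.g. test15_undef), -FAST expects phaddd while
; -SLOW expects the scalar extract/add/insert expansion.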

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.
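
; The sums feed result lanes 0 and 3 only, so one 128-bit phaddd of the low
; halves of %a and %b produces both; the remaining lanes may stay undef.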
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test14_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test14_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test14_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}

; Verify that the following sequence is folded into integer horizontal adds
; instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-SLOW-LABEL: test15_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movd %xmm0, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-SLOW-NEXT:    movd %xmm0, %ecx
; SSE-SLOW-NEXT:    addl %eax, %ecx
; SSE-SLOW-NEXT:    movd %xmm3, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-SLOW-NEXT:    movd %xmm0, %edx
; SSE-SLOW-NEXT:    addl %eax, %edx
; SSE-SLOW-NEXT:    movd %ecx, %xmm0
; SSE-SLOW-NEXT:    movd %edx, %xmm1
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test15_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSE-FAST-NEXT:    phaddd %xmm3, %xmm3
; SSE-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: test15_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-SLOW-NEXT:    addl %eax, %ecx
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %edx
; AVX1-SLOW-NEXT:    addl %eax, %edx
; AVX1-SLOW-NEXT:    vmovd %ecx, %xmm0
; AVX1-SLOW-NEXT:    vmovd %edx, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test15_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test15_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test15_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}
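
; Both sums target the high 128-bit half of the result (lanes 4 and 7), so a
; horizontal add of the high halves of %a and %b covers them; on AVX2/AVX512 a
; single 256-bit vphaddd suffices.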
define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: PR40243_alt:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR40243_alt:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR40243_alt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR40243_alt:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %add4 = add i32 %a4, %a5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %add7 = add i32 %b6, %b7
  %r4 = insertelement <8 x i32> undef, i32 %add4, i32 4
  %r = insertelement <8 x i32> %r4, i32 %add7, i32 7
  ret <8 x i32> %r
}
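
; Both sums read only elements 0-3 of %a and land in lanes 0 and 1, so a
; single horizontal add of %a with itself is enough; %b is dead.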
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}
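
; Same pattern as test16_undef, widened to v16i32; only the low lanes are
; live, so the expected code does not change.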
define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test16_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test16_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test16_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}
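
; All eight elements of %a are pairwise summed into lanes 0-3: a full
; horizontal reduction of the vector, done with one phaddd of the high half
; against the low half.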
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}
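
; Same reduction as test17_undef, widened to v16i32; lanes 4-15 of the result
; are undef, so the 128-bit sequence remains sufficient.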
define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}