; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.
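;
; As a reminder of the underlying operation: phaddd concatenates pairwise sums,
; e.g. "phaddd %xmm1, %xmm0" computes
;   xmm0 = [ xmm0[0]+xmm0[1], xmm0[2]+xmm0[3], xmm1[0]+xmm1[1], xmm1[2]+xmm1[3] ]
; and the 256-bit vphaddd applies the same operation within each 128-bit lane.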
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 2
  %vecext3 = extractelement <8 x i32> %b, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
  ret <8 x i32> %vecinit5
}
; On AVX2, the following sequence can be folded into a single horizontal add.
; If the subtarget doesn't support AVX2, slow-hop targets emit two scalar adds
; followed by vector inserts instead of two packed integer horizontal adds,
; while fast-hop targets still use phaddd.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-SLOW-LABEL: test15_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movd %xmm0, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-SLOW-NEXT:    movd %xmm0, %ecx
; SSE-SLOW-NEXT:    addl %eax, %ecx
; SSE-SLOW-NEXT:    movd %xmm3, %eax
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE-SLOW-NEXT:    movd %xmm0, %edx
; SSE-SLOW-NEXT:    addl %eax, %edx
; SSE-SLOW-NEXT:    movd %ecx, %xmm0
; SSE-SLOW-NEXT:    movd %edx, %xmm1
; SSE-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test15_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSE-FAST-NEXT:    phaddd %xmm3, %xmm3
; SSE-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: test15_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-SLOW-NEXT:    addl %eax, %ecx
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vpextrd $1, %xmm0, %edx
; AVX1-SLOW-NEXT:    addl %eax, %edx
; AVX1-SLOW-NEXT:    vmovd %ecx, %xmm0
; AVX1-SLOW-NEXT:    vmovd %edx, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test15_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test15_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test15_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %b, i32 4
  %vecext3 = extractelement <8 x i32> %b, i32 5
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
  ret <8 x i32> %vecinit5
}
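
; PR40243: the sums land only in the upper 128-bit half of the result (lanes 4
; and 7), so the fold should still produce a single phaddd of the high halves
; (or one wide vphaddd on AVX2/AVX512).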
define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: PR40243_alt:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR40243_alt:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR40243_alt:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR40243_alt:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <8 x i32> %a, i32 4
  %a5 = extractelement <8 x i32> %a, i32 5
  %add4 = add i32 %a4, %a5
  %b6 = extractelement <8 x i32> %b, i32 6
  %b7 = extractelement <8 x i32> %b, i32 7
  %add7 = add i32 %b6, %b7
  %r4 = insertelement <8 x i32> undef, i32 %add4, i32 4
  %r = insertelement <8 x i32> %r4, i32 %add7, i32 7
  ret <8 x i32> %r
}
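
; Both sums target adjacent low lanes (0 and 1), so a single 128-bit phaddd of
; %a with itself suffices; the remaining lanes of the result are undef.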
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  ret <8 x i32> %vecinit5
}
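
; Same pattern as test16_undef, widened to a 512-bit result type.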
define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test16_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16_v16i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <16 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <16 x i32> %vecinit, i32 %add4, i32 1
  ret <16 x i32> %vecinit5
}
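
; All four pairwise sums of %a fill the low 128 bits of the result, so one
; phaddd of the two halves of %a is enough.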
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <8 x i32> %a, i32 4
  %vecext5 = extractelement <8 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <8 x i32> %a, i32 6
  %vecext7 = extractelement <8 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
  ret <8 x i32> %vecinit4
}
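
; Same pattern as test17_undef, widened to a 512-bit result type.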
define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: test17_v16i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17_v16i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test17_v16i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test17_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x i32> %a, i32 0
  %vecext1 = extractelement <16 x i32> %a, i32 1
  %add1 = add i32 %vecext, %vecext1
  %vecinit1 = insertelement <16 x i32> undef, i32 %add1, i32 0
  %vecext2 = extractelement <16 x i32> %a, i32 2
  %vecext3 = extractelement <16 x i32> %a, i32 3
  %add2 = add i32 %vecext2, %vecext3
  %vecinit2 = insertelement <16 x i32> %vecinit1, i32 %add2, i32 1
  %vecext4 = extractelement <16 x i32> %a, i32 4
  %vecext5 = extractelement <16 x i32> %a, i32 5
  %add3 = add i32 %vecext4, %vecext5
  %vecinit3 = insertelement <16 x i32> %vecinit2, i32 %add3, i32 2
  %vecext6 = extractelement <16 x i32> %a, i32 6
  %vecext7 = extractelement <16 x i32> %a, i32 7
  %add4 = add i32 %vecext6, %vecext7
  %vecinit4 = insertelement <16 x i32> %vecinit3, i32 %add4, i32 3
  ret <16 x i32> %vecinit4
}