1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
9 ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
; Lanes 0/1 hold adjacent-pair sums of %a and lane 3 holds b[2]+b[3] (lane 2
; undef) — this matches the (v)haddps lane layout, so a single hadd suffices.
11 define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
12 ; SSE-LABEL: test1_undef:
14 ; SSE-NEXT: haddps %xmm1, %xmm0
17 ; AVX-LABEL: test1_undef:
19 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
21 %vecext = extractelement <4 x float> %a, i32 0
22 %vecext1 = extractelement <4 x float> %a, i32 1
23 %add = fadd float %vecext, %vecext1
24 %vecinit = insertelement <4 x float> undef, float %add, i32 0
25 %vecext2 = extractelement <4 x float> %a, i32 2
26 %vecext3 = extractelement <4 x float> %a, i32 3
27 %add4 = fadd float %vecext2, %vecext3
28 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
29 %vecext10 = extractelement <4 x float> %b, i32 2
30 %vecext11 = extractelement <4 x float> %b, i32 3
31 %add12 = fadd float %vecext10, %vecext11
32 %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
33 ret <4 x float> %vecinit13
; Lane 1 is left undef; lanes 0/2/3 are adjacent-pair sums of %a and %b in
; hadd order, so the whole expression still folds to one (v)haddps.
36 define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
37 ; SSE-LABEL: test2_undef:
39 ; SSE-NEXT: haddps %xmm1, %xmm0
42 ; AVX-LABEL: test2_undef:
44 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
46 %vecext = extractelement <4 x float> %a, i32 0
47 %vecext1 = extractelement <4 x float> %a, i32 1
48 %add = fadd float %vecext, %vecext1
49 %vecinit = insertelement <4 x float> undef, float %add, i32 0
50 %vecext6 = extractelement <4 x float> %b, i32 0
51 %vecext7 = extractelement <4 x float> %b, i32 1
52 %add8 = fadd float %vecext6, %vecext7
53 %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
54 %vecext10 = extractelement <4 x float> %b, i32 2
55 %vecext11 = extractelement <4 x float> %b, i32 3
56 %add12 = fadd float %vecext10, %vecext11
57 %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
58 ret <4 x float> %vecinit13
; Same as test2 but with lane 3 undef instead of lane 1: lanes 0/1 from %a,
; lane 2 from b[0]+b[1]. Still one (v)haddps.
61 define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
62 ; SSE-LABEL: test3_undef:
64 ; SSE-NEXT: haddps %xmm1, %xmm0
67 ; AVX-LABEL: test3_undef:
69 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
71 %vecext = extractelement <4 x float> %a, i32 0
72 %vecext1 = extractelement <4 x float> %a, i32 1
73 %add = fadd float %vecext, %vecext1
74 %vecinit = insertelement <4 x float> undef, float %add, i32 0
75 %vecext2 = extractelement <4 x float> %a, i32 2
76 %vecext3 = extractelement <4 x float> %a, i32 3
77 %add4 = fadd float %vecext2, %vecext3
78 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
79 %vecext6 = extractelement <4 x float> %b, i32 0
80 %vecext7 = extractelement <4 x float> %b, i32 1
81 %add8 = fadd float %vecext6, %vecext7
82 %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
83 ret <4 x float> %vecinit9
; Only lane 0 is defined (a[0]+a[1]). Without fast-hops the backend prefers a
; scalar movshdup+addss; with fast-hops it emits a self hadd.
86 define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
87 ; SSE-SLOW-LABEL: test4_undef:
89 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
90 ; SSE-SLOW-NEXT: addss %xmm1, %xmm0
93 ; SSE-FAST-LABEL: test4_undef:
95 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
98 ; AVX-SLOW-LABEL: test4_undef:
100 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
101 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
102 ; AVX-SLOW-NEXT: retq
104 ; AVX-FAST-LABEL: test4_undef:
106 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
107 ; AVX-FAST-NEXT: retq
108 %vecext = extractelement <4 x float> %a, i32 0
109 %vecext1 = extractelement <4 x float> %a, i32 1
110 %add = fadd float %vecext, %vecext1
111 %vecinit = insertelement <4 x float> undef, float %add, i32 0
112 ret <4 x float> %vecinit
; f64 variant of test4: a[0]+a[1] into lane 0 only. Slow-hop targets use an
; unpack/permilpd plus addsd; fast-hops folds to (v)haddpd.
115 define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
116 ; SSE-SLOW-LABEL: test5_undef:
118 ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
119 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
120 ; SSE-SLOW-NEXT: addsd %xmm0, %xmm1
121 ; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
122 ; SSE-SLOW-NEXT: retq
124 ; SSE-FAST-LABEL: test5_undef:
126 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
127 ; SSE-FAST-NEXT: retq
129 ; AVX-SLOW-LABEL: test5_undef:
131 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
132 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
133 ; AVX-SLOW-NEXT: retq
135 ; AVX-FAST-LABEL: test5_undef:
137 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
138 ; AVX-FAST-NEXT: retq
139 %vecext = extractelement <2 x double> %a, i32 0
140 %vecext1 = extractelement <2 x double> %a, i32 1
141 %add = fadd double %vecext, %vecext1
142 %vecinit = insertelement <2 x double> undef, double %add, i32 0
143 ret <2 x double> %vecinit
; Both pair-sums come from %a (lanes 0 and 1); %b is unused, so the hadd uses
; xmm0 for both operands.
146 define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
147 ; SSE-LABEL: test6_undef:
149 ; SSE-NEXT: haddps %xmm0, %xmm0
152 ; AVX-LABEL: test6_undef:
154 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
156 %vecext = extractelement <4 x float> %a, i32 0
157 %vecext1 = extractelement <4 x float> %a, i32 1
158 %add = fadd float %vecext, %vecext1
159 %vecinit = insertelement <4 x float> undef, float %add, i32 0
160 %vecext2 = extractelement <4 x float> %a, i32 2
161 %vecext3 = extractelement <4 x float> %a, i32 3
162 %add4 = fadd float %vecext2, %vecext3
163 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
164 ret <4 x float> %vecinit5
; Mirror of test6: both pair-sums come from %b and land in the high lanes
; (2 and 3), which is the hadd layout for the second source operand.
167 define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
168 ; SSE-LABEL: test7_undef:
170 ; SSE-NEXT: haddps %xmm1, %xmm0
173 ; AVX-LABEL: test7_undef:
175 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
177 %vecext = extractelement <4 x float> %b, i32 0
178 %vecext1 = extractelement <4 x float> %b, i32 1
179 %add = fadd float %vecext, %vecext1
180 %vecinit = insertelement <4 x float> undef, float %add, i32 2
181 %vecext2 = extractelement <4 x float> %b, i32 2
182 %vecext3 = extractelement <4 x float> %b, i32 3
183 %add4 = fadd float %vecext2, %vecext3
184 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
185 ret <4 x float> %vecinit5
; Negative-layout case: the two pair-sums of %a land in lanes 0 and 2, which
; is NOT the hadd lane order. Slow-hop targets lower to scalar shuffles+addss;
; fast-hops targets still hadd but need a fixup shuffle afterwards.
188 define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
189 ; SSE-SLOW-LABEL: test8_undef:
191 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
192 ; SSE-SLOW-NEXT: addss %xmm0, %xmm1
193 ; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
194 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
195 ; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
196 ; SSE-SLOW-NEXT: addss %xmm2, %xmm0
197 ; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
198 ; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
199 ; SSE-SLOW-NEXT: retq
201 ; SSE-FAST-LABEL: test8_undef:
203 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
204 ; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
205 ; SSE-FAST-NEXT: retq
207 ; AVX-SLOW-LABEL: test8_undef:
209 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
210 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
211 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
212 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
213 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
214 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
215 ; AVX-SLOW-NEXT: retq
217 ; AVX-FAST-LABEL: test8_undef:
219 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
220 ; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
221 ; AVX-FAST-NEXT: retq
222 %vecext = extractelement <4 x float> %a, i32 0
223 %vecext1 = extractelement <4 x float> %a, i32 1
224 %add = fadd float %vecext, %vecext1
225 %vecinit = insertelement <4 x float> undef, float %add, i32 0
226 %vecext2 = extractelement <4 x float> %a, i32 2
227 %vecext3 = extractelement <4 x float> %a, i32 3
228 %add4 = fadd float %vecext2, %vecext3
229 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
230 ret <4 x float> %vecinit5
; Lane 0 is a[0]+a[1], lane 3 is b[2]+b[3], lanes 1/2 undef — compatible with
; the hadd layout, so one (v)haddps covers both sums.
233 define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
234 ; SSE-LABEL: test9_undef:
236 ; SSE-NEXT: haddps %xmm1, %xmm0
239 ; AVX-LABEL: test9_undef:
241 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
243 %vecext = extractelement <4 x float> %a, i32 0
244 %vecext1 = extractelement <4 x float> %a, i32 1
245 %add = fadd float %vecext, %vecext1
246 %vecinit = insertelement <4 x float> undef, float %add, i32 0
247 %vecext2 = extractelement <4 x float> %b, i32 2
248 %vecext3 = extractelement <4 x float> %b, i32 3
249 %add4 = fadd float %vecext2, %vecext3
250 %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
251 ret <4 x float> %vecinit5
; 256-bit result but both defined lanes (0 and 3) live in the low 128-bit
; half, so lowering only needs a 128-bit hadd (xmm, not ymm).
254 define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
255 ; SSE-LABEL: test10_undef:
257 ; SSE-NEXT: haddps %xmm2, %xmm0
260 ; AVX-LABEL: test10_undef:
262 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
264 %vecext = extractelement <8 x float> %a, i32 0
265 %vecext1 = extractelement <8 x float> %a, i32 1
266 %add = fadd float %vecext, %vecext1
267 %vecinit = insertelement <8 x float> undef, float %add, i32 0
268 %vecext2 = extractelement <8 x float> %b, i32 2
269 %vecext3 = extractelement <8 x float> %b, i32 3
270 %add4 = fadd float %vecext2, %vecext3
271 %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
272 ret <8 x float> %vecinit5
; Defined lanes span both 128-bit halves (lane 0 from %a, lane 6 from the
; upper half of %b). AVX can use a single 256-bit vhaddps; SSE works per-xmm.
275 define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
276 ; SSE-SLOW-LABEL: test11_undef:
278 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
279 ; SSE-SLOW-NEXT: addss %xmm1, %xmm0
280 ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
281 ; SSE-SLOW-NEXT: addss %xmm3, %xmm1
282 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
283 ; SSE-SLOW-NEXT: retq
285 ; SSE-FAST-LABEL: test11_undef:
287 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
288 ; SSE-FAST-NEXT: haddps %xmm3, %xmm3
289 ; SSE-FAST-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
290 ; SSE-FAST-NEXT: retq
292 ; AVX-LABEL: test11_undef:
294 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
296 %vecext = extractelement <8 x float> %a, i32 0
297 %vecext1 = extractelement <8 x float> %a, i32 1
298 %add = fadd float %vecext, %vecext1
299 %vecinit = insertelement <8 x float> undef, float %add, i32 0
300 %vecext2 = extractelement <8 x float> %b, i32 4
301 %vecext3 = extractelement <8 x float> %b, i32 5
302 %add4 = fadd float %vecext2, %vecext3
303 %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
304 ret <8 x float> %vecinit5
; Only lanes 0/1 are defined and both sums come from the low half of %a, so a
; 128-bit self hadd is enough even though the type is <8 x float>.
307 define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
308 ; SSE-LABEL: test12_undef:
310 ; SSE-NEXT: haddps %xmm0, %xmm0
313 ; AVX-LABEL: test12_undef:
315 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
317 %vecext = extractelement <8 x float> %a, i32 0
318 %vecext1 = extractelement <8 x float> %a, i32 1
319 %add = fadd float %vecext, %vecext1
320 %vecinit = insertelement <8 x float> undef, float %add, i32 0
321 %vecext2 = extractelement <8 x float> %a, i32 2
322 %vecext3 = extractelement <8 x float> %a, i32 3
323 %add4 = fadd float %vecext2, %vecext3
324 %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
325 ret <8 x float> %vecinit5
; All four pair-sums of %a go into lanes 0-3. AVX extracts %a's high half and
; folds everything into one 128-bit vhaddps; SSE hadds the two xmm halves.
328 define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
329 ; SSE-LABEL: test13_undef:
331 ; SSE-NEXT: haddps %xmm1, %xmm0
334 ; AVX-LABEL: test13_undef:
336 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
337 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
339 %vecext = extractelement <8 x float> %a, i32 0
340 %vecext1 = extractelement <8 x float> %a, i32 1
341 %add1 = fadd float %vecext, %vecext1
342 %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
343 %vecext2 = extractelement <8 x float> %a, i32 2
344 %vecext3 = extractelement <8 x float> %a, i32 3
345 %add2 = fadd float %vecext2, %vecext3
346 %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
347 %vecext4 = extractelement <8 x float> %a, i32 4
348 %vecext5 = extractelement <8 x float> %a, i32 5
349 %add3 = fadd float %vecext4, %vecext5
350 %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
351 %vecext6 = extractelement <8 x float> %a, i32 6
352 %vecext7 = extractelement <8 x float> %a, i32 7
353 %add4 = fadd float %vecext6, %vecext7
354 %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
355 ret <8 x float> %vecinit4
; 512-bit variant of test13: the four sums still fit in one xmm. AVX1 folds
; to extract+vhaddps; AVX512 without fast-hops lowers to scalar shuffle/addss
; sequences (no 512-bit hadd instruction exists).
358 define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
359 ; SSE-LABEL: test13_v16f32_undef:
361 ; SSE-NEXT: haddps %xmm1, %xmm0
364 ; AVX1-SLOW-LABEL: test13_v16f32_undef:
365 ; AVX1-SLOW: # %bb.0:
366 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
367 ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
368 ; AVX1-SLOW-NEXT: retq
370 ; AVX-FAST-LABEL: test13_v16f32_undef:
372 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
373 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
374 ; AVX-FAST-NEXT: retq
376 ; AVX512-SLOW-LABEL: test13_v16f32_undef:
377 ; AVX512-SLOW: # %bb.0:
378 ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
379 ; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
380 ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
381 ; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
382 ; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
383 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
384 ; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
385 ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
386 ; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2
387 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
388 ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
389 ; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
390 ; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
391 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
392 ; AVX512-SLOW-NEXT: retq
393 %vecext = extractelement <16 x float> %a, i32 0
394 %vecext1 = extractelement <16 x float> %a, i32 1
395 %add1 = fadd float %vecext, %vecext1
396 %vecinit1 = insertelement <16 x float> undef, float %add1, i32 0
397 %vecext2 = extractelement <16 x float> %a, i32 2
398 %vecext3 = extractelement <16 x float> %a, i32 3
399 %add2 = fadd float %vecext2, %vecext3
400 %vecinit2 = insertelement <16 x float> %vecinit1, float %add2, i32 1
401 %vecext4 = extractelement <16 x float> %a, i32 4
402 %vecext5 = extractelement <16 x float> %a, i32 5
403 %add3 = fadd float %vecext4, %vecext5
404 %vecinit3 = insertelement <16 x float> %vecinit2, float %add3, i32 2
405 %vecext6 = extractelement <16 x float> %a, i32 6
406 %vecext7 = extractelement <16 x float> %a, i32 7
407 %add4 = fadd float %vecext6, %vecext7
408 %vecinit4 = insertelement <16 x float> %vecinit3, float %add4, i32 3
409 ret <16 x float> %vecinit4
; Shuffle-form input: fadd of %x with a <undef,0> shuffle of itself. Fast-hops
; recognizes this as haddpd; slow targets keep movddup+addpd.
411 define <2 x double> @add_pd_003(<2 x double> %x) {
412 ; SSE-SLOW-LABEL: add_pd_003:
414 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
415 ; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
416 ; SSE-SLOW-NEXT: retq
418 ; SSE-FAST-LABEL: add_pd_003:
420 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
421 ; SSE-FAST-NEXT: retq
423 ; AVX-SLOW-LABEL: add_pd_003:
425 ; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
426 ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
427 ; AVX-SLOW-NEXT: retq
429 ; AVX-FAST-LABEL: add_pd_003:
431 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
432 ; AVX-FAST-NEXT: retq
433 %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
434 %add = fadd <2 x double> %l, %x
435 ret <2 x double> %add
438 ; Change shuffle mask - no undefs.
; Same as add_pd_003 but with the full <1,0> swap mask (no undef lanes);
; fast-hops still folds to haddpd.
440 define <2 x double> @add_pd_003_2(<2 x double> %x) {
441 ; SSE-SLOW-LABEL: add_pd_003_2:
443 ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
444 ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
445 ; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
446 ; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
447 ; SSE-SLOW-NEXT: retq
449 ; SSE-FAST-LABEL: add_pd_003_2:
451 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
452 ; SSE-FAST-NEXT: retq
454 ; AVX-SLOW-LABEL: add_pd_003_2:
456 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
457 ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
458 ; AVX-SLOW-NEXT: retq
460 ; AVX-FAST-LABEL: add_pd_003_2:
462 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
463 ; AVX-FAST-NEXT: retq
464 %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
465 %add = fadd <2 x double> %l, %x
466 ret <2 x double> %add
; Like add_pd_003 but with a trailing <1,undef> shuffle of the sum; the fixup
; permilpd remains after the hadd on both slow and fast paths.
469 define <2 x double> @add_pd_010(<2 x double> %x) {
470 ; SSE-SLOW-LABEL: add_pd_010:
472 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
473 ; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
474 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
475 ; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
476 ; SSE-SLOW-NEXT: retq
478 ; SSE-FAST-LABEL: add_pd_010:
480 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
481 ; SSE-FAST-NEXT: retq
483 ; AVX-SLOW-LABEL: add_pd_010:
485 ; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
486 ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
487 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
488 ; AVX-SLOW-NEXT: retq
490 ; AVX-FAST-LABEL: add_pd_010:
492 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
493 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
494 ; AVX-FAST-NEXT: retq
495 %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
496 %add = fadd <2 x double> %l, %x
497 %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
498 ret <2 x double> %shuffle2
; Two shuffles select even (%l) and odd (%r) elements of %x into lanes 2/3;
; the fadd of the pair is recognized as a haddps pattern.
501 define <4 x float> @add_ps_007(<4 x float> %x) {
502 ; SSE-LABEL: add_ps_007:
504 ; SSE-NEXT: haddps %xmm0, %xmm0
507 ; AVX-LABEL: add_ps_007:
509 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
511 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
512 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
513 %add = fadd <4 x float> %l, %r
; add_ps_007 followed by a <3,2,undef,undef> shuffle of the sum; a trailing
; lane shuffle is expected after the hadd.
517 define <4 x float> @add_ps_030(<4 x float> %x) {
518 ; SSE-LABEL: add_ps_030:
520 ; SSE-NEXT: haddps %xmm0, %xmm0
521 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
524 ; AVX-LABEL: add_ps_030:
526 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
527 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
529 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
530 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
531 %add = fadd <4 x float> %l, %r
532 %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
533 ret <4 x float> %shuffle2
; Variant of add_ps_007 with only lane 2 defined in both shuffles; still folds
; to a self hadd.
536 define <4 x float> @add_ps_007_2(<4 x float> %x) {
537 ; SSE-LABEL: add_ps_007_2:
539 ; SSE-NEXT: haddps %xmm0, %xmm0
542 ; AVX-LABEL: add_ps_007_2:
544 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
546 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
547 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
548 %add = fadd <4 x float> %l, %r
; fadd of %x with a mostly-undef shuffle (only lane 3 <- element 2). Slow-hop
; targets use movsldup+addps; fast-hops folds to a self haddps.
552 define <4 x float> @add_ps_008(<4 x float> %x) {
553 ; SSE-SLOW-LABEL: add_ps_008:
555 ; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
556 ; SSE-SLOW-NEXT: addps %xmm1, %xmm0
557 ; SSE-SLOW-NEXT: retq
559 ; SSE-FAST-LABEL: add_ps_008:
561 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
562 ; SSE-FAST-NEXT: retq
564 ; AVX-SLOW-LABEL: add_ps_008:
566 ; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
567 ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
568 ; AVX-SLOW-NEXT: retq
570 ; AVX-FAST-LABEL: add_ps_008:
572 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
573 ; AVX-FAST-NEXT: retq
574 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
575 %add = fadd <4 x float> %l, %x
; add_ps_008 followed by a shuffle extracting lane 3 into lane 0; the trailing
; [3,1,2,3] shuffle survives after the (h)add on both paths.
579 define <4 x float> @add_ps_017(<4 x float> %x) {
580 ; SSE-SLOW-LABEL: add_ps_017:
582 ; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
583 ; SSE-SLOW-NEXT: addps %xmm0, %xmm1
584 ; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
585 ; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
586 ; SSE-SLOW-NEXT: retq
588 ; SSE-FAST-LABEL: add_ps_017:
590 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0
591 ; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
592 ; SSE-FAST-NEXT: retq
594 ; AVX-SLOW-LABEL: add_ps_017:
596 ; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
597 ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
598 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
599 ; AVX-SLOW-NEXT: retq
601 ; AVX-FAST-LABEL: add_ps_017:
603 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
604 ; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
605 ; AVX-FAST-NEXT: retq
606 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
607 %add = fadd <4 x float> %l, %x
608 %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
609 ret <4 x float> %shuffle2
; add_ps_007_2 followed by a shuffle placing sum lane 2 into result lane 1;
; hadd plus a [0,2,2,3] fixup shuffle.
612 define <4 x float> @add_ps_018(<4 x float> %x) {
613 ; SSE-LABEL: add_ps_018:
615 ; SSE-NEXT: haddps %xmm0, %xmm0
616 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
619 ; AVX-LABEL: add_ps_018:
621 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
622 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
624 %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
625 %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
626 %add = fadd <4 x float> %l, %r
627 %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
628 ret <4 x float> %shuffle2
; Mixed-width case: 256-bit inputs, 128-bit result using only the low-half
; elements of each; lowers to a 128-bit hadd (AVX adds vzeroupper).
631 define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %b) {
632 ; SSE-LABEL: v8f32_inputs_v4f32_output_0101:
634 ; SSE-NEXT: haddps %xmm2, %xmm0
637 ; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
639 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
640 ; AVX-NEXT: vzeroupper
642 %a0 = extractelement <8 x float> %a, i32 0
643 %a1 = extractelement <8 x float> %a, i32 1
644 %b0 = extractelement <8 x float> %b, i32 0
645 %b1 = extractelement <8 x float> %b, i32 1
646 %add0 = fadd float %a0, %a1
647 %add2 = fadd float %b0, %b1
648 %r0 = insertelement <4 x float> undef, float %add0, i32 0
649 %r = insertelement <4 x float> %r0, float %add2, i32 2
; One 256-bit and one 128-bit input feeding a 128-bit result; the fold still
; applies across the mismatched source widths.
653 define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %b) {
654 ; SSE-LABEL: v8f32_input0_v4f32_output_0123:
656 ; SSE-NEXT: haddps %xmm2, %xmm0
659 ; AVX-LABEL: v8f32_input0_v4f32_output_0123:
661 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
662 ; AVX-NEXT: vzeroupper
664 %a0 = extractelement <8 x float> %a, i32 0
665 %a1 = extractelement <8 x float> %a, i32 1
666 %b2 = extractelement <4 x float> %b, i32 2
667 %b3 = extractelement <4 x float> %b, i32 3
668 %add0 = fadd float %a0, %a1
669 %add3 = fadd float %b2, %b3
670 %r0 = insertelement <4 x float> undef, float %add0, i32 0
671 %r = insertelement <4 x float> %r0, float %add3, i32 3
; Mirror of the previous test: the 256-bit vector is the second input, and the
; results land in lanes 1 and 2.
675 define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %b) {
676 ; SSE-LABEL: v8f32_input1_v4f32_output_2301:
678 ; SSE-NEXT: haddps %xmm1, %xmm0
681 ; AVX-LABEL: v8f32_input1_v4f32_output_2301:
683 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
684 ; AVX-NEXT: vzeroupper
686 %a2 = extractelement <4 x float> %a, i32 2
687 %a3 = extractelement <4 x float> %a, i32 3
688 %b0 = extractelement <8 x float> %b, i32 0
689 %b1 = extractelement <8 x float> %b, i32 1
690 %add1 = fadd float %a2, %a3
691 %add2 = fadd float %b0, %b1
692 %r1 = insertelement <4 x float> undef, float %add1, i32 1
693 %r = insertelement <4 x float> %r1, float %add2, i32 2
; Both 256-bit inputs contribute their low-half elements 2/3; sums go to
; result lanes 1 and 3 — still a single 128-bit hadd.
697 define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %b) {
698 ; SSE-LABEL: v8f32_inputs_v4f32_output_2323:
700 ; SSE-NEXT: haddps %xmm2, %xmm0
703 ; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
705 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
706 ; AVX-NEXT: vzeroupper
708 %a2 = extractelement <8 x float> %a, i32 2
709 %a3 = extractelement <8 x float> %a, i32 3
710 %b2 = extractelement <8 x float> %b, i32 2
711 %b3 = extractelement <8 x float> %b, i32 3
712 %add1 = fadd float %a2, %a3
713 %add3 = fadd float %b2, %b3
714 %r1 = insertelement <4 x float> undef, float %add1, i32 1
715 %r = insertelement <4 x float> %r1, float %add3, i32 3
; 512-bit inputs, 128-bit output. Register allocation differs by target (xmm2
; when 512-bit values are split across ymm pairs on AVX1, xmm1 on AVX512).
719 define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float> %b) {
720 ; SSE-LABEL: v16f32_inputs_v4f32_output_0123:
722 ; SSE-NEXT: haddps %xmm4, %xmm0
725 ; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
726 ; AVX1-SLOW: # %bb.0:
727 ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm0, %xmm0
728 ; AVX1-SLOW-NEXT: vzeroupper
729 ; AVX1-SLOW-NEXT: retq
731 ; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
732 ; AVX1-FAST: # %bb.0:
733 ; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
734 ; AVX1-FAST-NEXT: vzeroupper
735 ; AVX1-FAST-NEXT: retq
737 ; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
739 ; AVX512-NEXT: vhaddps %xmm1, %xmm0, %xmm0
740 ; AVX512-NEXT: vzeroupper
742 %a0 = extractelement <16 x float> %a, i32 0
743 %a1 = extractelement <16 x float> %a, i32 1
744 %b2 = extractelement <16 x float> %b, i32 2
745 %b3 = extractelement <16 x float> %b, i32 3
746 %add0 = fadd float %a0, %a1
747 %add3 = fadd float %b2, %b3
748 %r0 = insertelement <4 x float> undef, float %add0, i32 0
749 %r = insertelement <4 x float> %r0, float %add3, i32 3
; 512-bit inputs with the sums landing in the upper half of a 256-bit result;
; AVX targets use a full-width ymm vhaddps, SSE works on the second xmm pair.
753 define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float> %b) {
754 ; SSE-LABEL: v16f32_inputs_v8f32_output_4567:
756 ; SSE-NEXT: haddps %xmm5, %xmm1
759 ; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
760 ; AVX1-SLOW: # %bb.0:
761 ; AVX1-SLOW-NEXT: vhaddps %ymm2, %ymm0, %ymm0
762 ; AVX1-SLOW-NEXT: retq
764 ; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
765 ; AVX1-FAST: # %bb.0:
766 ; AVX1-FAST-NEXT: vhaddps %ymm2, %ymm0, %ymm0
767 ; AVX1-FAST-NEXT: retq
769 ; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
771 ; AVX512-NEXT: vhaddps %ymm1, %ymm0, %ymm0
773 %a4 = extractelement <16 x float> %a, i32 4
774 %a5 = extractelement <16 x float> %a, i32 5
775 %b6 = extractelement <16 x float> %b, i32 6
776 %b7 = extractelement <16 x float> %b, i32 7
777 %add4 = fadd float %a4, %a5
778 %add7 = fadd float %b6, %b7
779 %r4 = insertelement <8 x float> undef, float %add4, i32 4
780 %r = insertelement <8 x float> %r4, float %add7, i32 7
784 define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
785 ; SSE-LABEL: PR40243:
787 ; SSE-NEXT: haddps %xmm3, %xmm1
790 ; AVX-LABEL: PR40243:
792 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
794 %a4 = extractelement <8 x float> %a, i32 4
795 %a5 = extractelement <8 x float> %a, i32 5
796 %add4 = fadd float %a4, %a5
797 %b6 = extractelement <8 x float> %b, i32 6
798 %b7 = extractelement <8 x float> %b, i32 7
799 %add7 = fadd float %b6, %b7
800 %r4 = insertelement <8 x float> undef, float %add4, i32 4
801 %r = insertelement <8 x float> %r4, float %add7, i32 7