; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.
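;
; A quick reference, as an explanatory comment only (not checked output):
; in AT&T syntax, "haddps %xmm1, %xmm0" computes
;   xmm0 = { xmm0[0]+xmm0[1], xmm0[2]+xmm0[3], xmm1[0]+xmm1[1], xmm1[2]+xmm1[3] }
; so an extractelement/fadd/insertelement chain that fills only some of those
; lanes (leaving the others undef) can still be matched to a single hadd.
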
define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ret <4 x float> %vecinit9
}

define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test4_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test4_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test4_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test4_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %vecinit
}

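; Note (explanatory, not checked): test4 and test5 are the first cases where
; the default tuning and +fast-hops diverge. When only one lane of the result
; is used, the default ("slow" hops) lowering prefers a shuffle plus a scalar
; add, while +fast-hops still selects the horizontal-add instruction.
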
define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-SLOW-LABEL: test5_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test5_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test5_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test5_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <2 x double> %a, i32 0
  %vecext1 = extractelement <2 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ret <2 x double> %vecinit
}

define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ret <4 x float> %vecinit5
}

define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test7_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %b, i32 0
  %vecext1 = extractelement <4 x float> %b, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 2
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test8_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    addss %xmm2, %xmm0
; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
  ret <4 x float> %vecinit5
}

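; Note (explanatory, not checked): the two sums in test8 land in lanes 0 and 2.
; A single "haddps %xmm0, %xmm0" would produce { a0+a1, a2+a3, a0+a1, a2+a3 },
; i.e. a0+a1 rather than the required a2+a3 in lane 2, so no single horizontal
; add matches and the pattern is left as scalar adds.
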
define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 2
  %vecext3 = extractelement <8 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
  ret <8 x float> %vecinit5
}

define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-SLOW-LABEL: test11_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm3, %xmm1
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test11_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    haddps %xmm3, %xmm3
; SSE-FAST-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSE-FAST-NEXT:    retq
;
; AVX-LABEL: test11_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 4
  %vecext3 = extractelement <8 x float> %b, i32 5
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
  ret <8 x float> %vecinit5
}

define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test12_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  ret <8 x float> %vecinit5
}

define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test13_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <8 x float> %a, i32 4
  %vecext5 = extractelement <8 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <8 x float> %a, i32 6
  %vecext7 = extractelement <8 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
  ret <8 x float> %vecinit4
}

define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: test13_v16f32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test13_v16f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test13_v16f32_undef:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: test13_v16f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT:    retq
  %vecext = extractelement <16 x float> %a, i32 0
  %vecext1 = extractelement <16 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <16 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <16 x float> %a, i32 2
  %vecext3 = extractelement <16 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <16 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <16 x float> %a, i32 4
  %vecext5 = extractelement <16 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <16 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <16 x float> %a, i32 6
  %vecext7 = extractelement <16 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <16 x float> %vecinit3, float %add4, i32 3
  ret <16 x float> %vecinit4
}

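; The tests below build the horizontal add from shufflevector + fadd instead
; of extractelement/insertelement chains. For example, in add_pd_003 below,
;   %l   = <undef, x0>           (shuffle of %x with mask <undef, 0>)
;   %add = fadd %l, %x = <undef, x0+x1>
; lane 1 holds x0+x1, which is exactly a haddpd lane, so the fast-hops runs
; can select haddpd.
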
define <2 x double> @add_pd_003(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_003:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_003:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_003:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

; Change shuffle mask - no undefs.

define <2 x double> @add_pd_003_2(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003_2:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE-SLOW-NEXT:    addpd %xmm0, %xmm1
; SSE-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_003_2:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_003_2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_003_2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

define <2 x double> @add_pd_010(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_010:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT:    addpd %xmm0, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_010:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_010:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_010:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  ret <2 x double> %shuffle2
}

define <4 x float> @add_ps_007(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_007:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_007:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_007:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_007:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

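; Note (explanatory, not checked): in add_ps_007 the masks select the even
; elements into %l and the odd elements into %r, so lanes 2 and 3 of the sum
; are x0+x1 and x2+x3, exactly what "haddps %xmm0, %xmm0" produces in those
; lanes, which is why the fast-hops runs collapse to one haddps.
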
define <4 x float> @add_ps_030(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_030:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_030:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_030:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_030:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_007_2(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_007_2:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_007_2:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: add_ps_007_2:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_007_2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX512-SLOW-LABEL: add_ps_007_2:
; AVX512-SLOW:       # %bb.0:
; AVX512-SLOW-NEXT:    vbroadcastss %xmm0, %xmm1
; AVX512-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX512-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-SLOW-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_008(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_008:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_008:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_008:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_008:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  ret <4 x float> %add
}

define <4 x float> @add_ps_017(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_017:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-SLOW-NEXT:    addps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_017:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_017:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_017:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_018(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_018:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_018:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: add_ps_018:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_018:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-FAST-NEXT:    retq
;
; AVX512-SLOW-LABEL: add_ps_018:
; AVX512-SLOW:       # %bb.0:
; AVX512-SLOW-NEXT:    vbroadcastss %xmm0, %xmm1
; AVX512-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX512-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-SLOW-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

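; Note (explanatory, not checked): AVX512 splats lane 0 with vbroadcastss
; rather than vmovddup, which is why add_ps_007_2 and add_ps_018 need
; separate AVX512-SLOW check blocks.
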
define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_0101:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add0 = fadd float %a0, %a1
  %add2 = fadd float %b0, %b1
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %b) {
; SSE-LABEL: v8f32_input0_v4f32_output_0123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_input0_v4f32_output_0123:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_input1_v4f32_output_2301:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_input1_v4f32_output_2301:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add1 = fadd float %a2, %a3
  %add2 = fadd float %b0, %b1
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_2323:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a2 = extractelement <8 x float> %a, i32 2
  %a3 = extractelement <8 x float> %a, i32 3
  %b2 = extractelement <8 x float> %b, i32 2
  %b3 = extractelement <8 x float> %b, i32 3
  %add1 = fadd float %a2, %a3
  %add3 = fadd float %b2, %b3
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v4f32_output_0123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %a0 = extractelement <16 x float> %a, i32 0
  %a1 = extractelement <16 x float> %a, i32 1
  %b2 = extractelement <16 x float> %b, i32 2
  %b3 = extractelement <16 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v8f32_output_4567:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm5, %xmm1
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <16 x float> %a, i32 4
  %a5 = extractelement <16 x float> %a, i32 5
  %b6 = extractelement <16 x float> %b, i32 6
  %b7 = extractelement <16 x float> %b, i32 7
  %add4 = fadd float %a4, %a5
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

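; PR40243: the sums are inserted only into the upper 128-bit half of the
; result (lanes 4 and 7 of the <8 x float>), and we should still form a
; single 256-bit vhaddps rather than scalarizing.
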
define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR40243:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40243:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a4 = extractelement <8 x float> %a, i32 4
  %a5 = extractelement <8 x float> %a, i32 5
  %add4 = fadd float %a4, %a5
  %b6 = extractelement <8 x float> %b, i32 6
  %b7 = extractelement <8 x float> %b, i32 7
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}