; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.
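; For reference, a minimal sketch of the fold being exercised (mirroring
; test6_undef below): two scalar fadds of adjacent lanes of %a,
;   %lo = fadd float %a0, %a1    ; %a0/%a1 are lanes 0/1 of %a (illustrative names)
;   %hi = fadd float %a2, %a3    ; %a2/%a3 are lanes 2/3 of %a
; inserted into lanes 0 and 1 of an otherwise undef <4 x float> should fold
; to a single 'haddps %xmm0, %xmm0' (SSE3) / 'vhaddps %xmm0, %xmm0, %xmm0'
; (AVX), with the remaining undef lanes left unconstrained.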

define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test3_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ret <4 x float> %vecinit9
}

define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test4_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test4_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: test4_undef:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test4_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %vecinit
}

define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-SLOW-LABEL: test5_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test5_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: test5_undef:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test5_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %vecext = extractelement <2 x double> %a, i32 0
  %vecext1 = extractelement <2 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ret <2 x double> %vecinit
}

define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test6_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ret <4 x float> %vecinit5
}

define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test7_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %b, i32 0
  %vecext1 = extractelement <4 x float> %b, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 2
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test8_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm0, %xmm1
; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT: addss %xmm2, %xmm0
; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test8_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,1]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: test8_undef:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test8_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX-FAST-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
  ret <4 x float> %vecinit5
}

define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test10_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 2
  %vecext3 = extractelement <8 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
  ret <8 x float> %vecinit5
}

define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-SLOW-LABEL: test11_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm1, %xmm0
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm3, %xmm1
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test11_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movaps %xmm3, %xmm1
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: haddps %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-LABEL: test11_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 4
  %vecext3 = extractelement <8 x float> %b, i32 5
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
  ret <8 x float> %vecinit5
}

define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test12_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  ret <8 x float> %vecinit5
}

define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test13_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <8 x float> %a, i32 4
  %vecext5 = extractelement <8 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <8 x float> %a, i32 6
  %vecext7 = extractelement <8 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
  ret <8 x float> %vecinit4
}

define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: test13_v16f32_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test13_v16f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test13_v16f32_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX512-SLOW-LABEL: test13_v16f32_undef:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-SLOW-NEXT: retq
  %vecext = extractelement <16 x float> %a, i32 0
  %vecext1 = extractelement <16 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <16 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <16 x float> %a, i32 2
  %vecext3 = extractelement <16 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <16 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <16 x float> %a, i32 4
  %vecext5 = extractelement <16 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <16 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <16 x float> %a, i32 6
  %vecext7 = extractelement <16 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <16 x float> %vecinit3, float %add4, i32 3
  ret <16 x float> %vecinit4
}

define <2 x double> @add_pd_003(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_003:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_pd_003:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_pd_003:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

; Change shuffle mask - no undefs.

define <2 x double> @add_pd_003_2(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003_2:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_003_2:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_pd_003_2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_pd_003_2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

define <2 x double> @add_pd_010(<2 x double> %x) {
; SSE-LABEL: add_pd_010:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-SLOW-LABEL: add_pd_010:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_pd_010:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  ret <2 x double> %shuffle2
}

define <4 x float> @add_ps_007(<4 x float> %x) {
; SSE-LABEL: add_ps_007:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: add_ps_007:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_030(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_030:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_ps_030:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_030:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_030:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_007_2(<4 x float> %x) {
; SSE-LABEL: add_ps_007_2:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: add_ps_007_2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_008(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_008:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_ps_008:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_008:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_008:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  ret <4 x float> %add
}

define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: add_ps_016:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: add_ps_016:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,3]
; AVX-NEXT: retq
  %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 0, i32 6>
  %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 1, i32 7>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 6, i32 1, i32 2, i32 undef>
  ret <4 x float> %9
}

define <4 x float> @add_ps_017(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_017:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_ps_017:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_017:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_017:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_018(<4 x float> %x) {
; SSE-LABEL: add_ps_018:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: add_ps_018:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: add_ps_018:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: add_ps_018:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
; AVX512-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: add_pd_011:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm2, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT: movapd %xmm0, %xmm3
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE-SLOW-NEXT: addpd %xmm0, %xmm3
; SSE-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE-SLOW-NEXT: movapd %xmm3, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_011:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: haddpd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: add_pd_011:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: add_pd_011:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: add_pd_011:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX512-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 undef, i32 5, i32 undef>
  %5 = fadd <4 x double> %3, %4
  %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  ret <4 x double> %6
}

define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_0101:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add0 = fadd float %a0, %a1
  %add2 = fadd float %b0, %b1
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %b) {
; SSE-LABEL: v8f32_input0_v4f32_output_0123:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_input0_v4f32_output_0123:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_input1_v4f32_output_2301:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_input1_v4f32_output_2301:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add1 = fadd float %a2, %a3
  %add2 = fadd float %b0, %b1
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_2323:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a2 = extractelement <8 x float> %a, i32 2
  %a3 = extractelement <8 x float> %a, i32 3
  %b2 = extractelement <8 x float> %b, i32 2
  %b3 = extractelement <8 x float> %b, i32 3
  %add1 = fadd float %a2, %a3
  %add3 = fadd float %b2, %b3
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v4f32_output_0123:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %a0 = extractelement <16 x float> %a, i32 0
  %a1 = extractelement <16 x float> %a, i32 1
  %b2 = extractelement <16 x float> %b, i32 2
  %b3 = extractelement <16 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v8f32_output_4567:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %a4 = extractelement <16 x float> %a, i32 4
  %a5 = extractelement <16 x float> %a, i32 5
  %b6 = extractelement <16 x float> %b, i32 6
  %b7 = extractelement <16 x float> %b, i32 7
  %add4 = fadd float %a4, %a5
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR40243:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: PR40243:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a4 = extractelement <8 x float> %a, i32 4
  %a5 = extractelement <8 x float> %a, i32 5
  %add4 = fadd float %a4, %a5
  %b6 = extractelement <8 x float> %b, i32 6
  %b7 = extractelement <8 x float> %b, i32 7
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR44694:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT: haddpd %xmm3, %xmm2
; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE-SLOW-NEXT: movapd %xmm2, %xmm1
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR44694:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm2
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR44694:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR44694:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: PR44694:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %5 = fadd <4 x double> %3, %4
  ret <4 x double> %5
}

define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_1:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR45747_1:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR45747_1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR45747_1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %a
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_2:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR45747_2:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm1, %xmm1
; SSE-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR45747_2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR45747_2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %b
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_u123:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR34724_add_v4f32_u123:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-SLOW-NEXT: addps %xmm2, %xmm0
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_0u23:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm1, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %3, %0
  %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %6 = fadd <4 x float> %5, %1
  %7 = shufflevector <4 x float> %4, <4 x float> %6, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %8 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %9 = fadd <4 x float> %8, %1
  %10 = shufflevector <4 x float> %7, <4 x float> %9, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
  ret <4 x float> %10
}

define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_01u3:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddps %xmm0, %xmm0
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_01u3:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm1, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_01u3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_01u3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 undef, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_012u(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_012u:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddps %xmm0, %xmm0
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_012u:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm1, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_012u:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_012u:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  ret <4 x float> %9
}

define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_u123:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm2, %xmm1
; SSE-SLOW-NEXT: movapd %xmm3, %xmm2
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT: addsd %xmm3, %xmm2
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_u123:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm2
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_u123:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR34724_add_v4f64_u123:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-FAST-LABEL: PR34724_add_v4f64_u123:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX512-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 1
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_0u23:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm2, %xmm0
; SSE-SLOW-NEXT: movapd %xmm3, %xmm2
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT: addsd %xmm3, %xmm2
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_0u23:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: haddpd %xmm2, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_0u23:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_01u3:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT: movapd %xmm3, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE-SLOW-NEXT: addsd %xmm3, %xmm1
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_01u3:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm3
; SSE-FAST-NEXT: movapd %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; AVX512-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_012u:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT: movapd %xmm2, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT: addsd %xmm2, %xmm1
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_012u:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm2, %xmm2
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_012u:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_012u:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 0
  %11 = extractelement <4 x double> %1, i32 1
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 2
  ret <4 x double> %13
}