; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
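
; The *-SLOW prefixes cover runs where horizontal ops are considered slow
; (no fast-hops), so partial horizontal adds are expected to be expanded into
; shuffles plus vector adds; the *-FAST prefixes cover the same targets with
; +fast-hops, where the haddps/haddpd forms should be kept.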
; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.

define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ret <4 x float> %vecinit9
}

define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test4_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test4_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test4_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test4_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %vecinit
}

define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-SLOW-LABEL: test5_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test5_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test5_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test5_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <2 x double> %a, i32 0
  %vecext1 = extractelement <2 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ret <2 x double> %vecinit
}

define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ret <4 x float> %vecinit5
}

define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test7_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %b, i32 0
  %vecext1 = extractelement <4 x float> %b, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 2
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test8_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm0, %xmm1
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT:    addss %xmm2, %xmm0
; SSE-SLOW-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test8_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,1]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: test8_undef:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test8_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX-FAST-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
  ret <4 x float> %vecinit5
}

define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 2
  %vecext3 = extractelement <8 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
  ret <8 x float> %vecinit5
}

define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-SLOW-LABEL: test11_undef:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE-SLOW-NEXT:    addss %xmm3, %xmm1
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: test11_undef:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movaps %xmm3, %xmm1
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    haddps %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-LABEL: test11_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 4
  %vecext3 = extractelement <8 x float> %b, i32 5
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
  ret <8 x float> %vecinit5
}

define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test12_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  ret <8 x float> %vecinit5
}

define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test13_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <8 x float> %a, i32 4
  %vecext5 = extractelement <8 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <8 x float> %a, i32 6
  %vecext7 = extractelement <8 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
  ret <8 x float> %vecinit4
}

define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: test13_v16f32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test13_v16f32_undef:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: test13_v16f32_undef:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX512-SLOW-LABEL: test13_v16f32_undef:
; AVX512-SLOW:       # %bb.0:
; AVX512-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX512-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm2, %xmm0, %xmm2
; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-SLOW-NEXT:    retq
  %vecext = extractelement <16 x float> %a, i32 0
  %vecext1 = extractelement <16 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <16 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <16 x float> %a, i32 2
  %vecext3 = extractelement <16 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <16 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <16 x float> %a, i32 4
  %vecext5 = extractelement <16 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <16 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <16 x float> %a, i32 6
  %vecext7 = extractelement <16 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <16 x float> %vecinit3, float %add4, i32 3
  ret <16 x float> %vecinit4
}

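; The tests below build the horizontal add from shufflevector + fadd instead
; of scalar extractelement/insertelement chains.
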
define <2 x double> @add_pd_003(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_003:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_003:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_003:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

; Same as add_pd_003, but the shuffle mask contains no undefs.

define <2 x double> @add_pd_003_2(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003_2:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_003_2:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_003_2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_003_2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

define <2 x double> @add_pd_010(<2 x double> %x) {
; SSE-LABEL: add_pd_010:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-SLOW-LABEL: add_pd_010:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_pd_010:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  ret <2 x double> %shuffle2
}

define <4 x float> @add_ps_007(<4 x float> %x) {
; SSE-LABEL: add_ps_007:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_ps_007:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_030(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_030:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_030:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_030:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_030:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_007_2(<4 x float> %x) {
; SSE-LABEL: add_ps_007_2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_ps_007_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_008(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_008:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_008:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_008:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_008:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  ret <4 x float> %add
}

define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: add_ps_016:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_ps_016:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3]
; AVX-NEXT:    retq
  %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 0, i32 6>
  %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 1, i32 7>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 6, i32 1, i32 2, i32 undef>
  ret <4 x float> %9
}

define <4 x float> @add_ps_017(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_017:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_ps_017:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: add_ps_017:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: add_ps_017:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_018(<4 x float> %x) {
; SSE-LABEL: add_ps_018:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: add_ps_018:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: add_ps_018:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: add_ps_018:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: add_pd_011:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movapd %xmm2, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm3
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE-SLOW-NEXT:    addpd %xmm3, %xmm0
; SSE-SLOW-NEXT:    addpd %xmm2, %xmm1
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: add_pd_011:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: add_pd_011:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: add_pd_011:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: add_pd_011:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX512-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 undef, i32 5, i32 undef>
  %5 = fadd <4 x double> %3, %4
  %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  ret <4 x double> %6
}

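; The v8f32/v16f32 input tests check that wide inputs whose horizontal sums
; land in a narrower result still select a single hadd on the relevant
; subvectors (with vzeroupper where the wide input leaves dirty upper state).
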
define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_0101:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add0 = fadd float %a0, %a1
  %add2 = fadd float %b0, %b1
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %b) {
; SSE-LABEL: v8f32_input0_v4f32_output_0123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_input0_v4f32_output_0123:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_input1_v4f32_output_2301:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_input1_v4f32_output_2301:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add1 = fadd float %a2, %a3
  %add2 = fadd float %b0, %b1
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_2323:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a2 = extractelement <8 x float> %a, i32 2
  %a3 = extractelement <8 x float> %a, i32 3
  %b2 = extractelement <8 x float> %b, i32 2
  %b3 = extractelement <8 x float> %b, i32 3
  %add1 = fadd float %a2, %a3
  %add3 = fadd float %b2, %b3
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v4f32_output_0123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %a0 = extractelement <16 x float> %a, i32 0
  %a1 = extractelement <16 x float> %a, i32 1
  %b2 = extractelement <16 x float> %b, i32 2
  %b3 = extractelement <16 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v8f32_output_4567:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm5, %xmm1
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %a4 = extractelement <16 x float> %a, i32 4
  %a5 = extractelement <16 x float> %a, i32 5
  %b6 = extractelement <16 x float> %b, i32 6
  %b7 = extractelement <16 x float> %b, i32 7
  %add4 = fadd float %a4, %a5
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

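; The remaining tests are reduced from bug reports (PR40243, PR44694,
; PR45747, PR34724) where only part of the horizontal-add result is used.
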
define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR40243:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40243:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a4 = extractelement <8 x float> %a, i32 4
  %a5 = extractelement <8 x float> %a, i32 5
  %add4 = fadd float %a4, %a5
  %b6 = extractelement <8 x float> %b, i32 6
  %b7 = extractelement <8 x float> %b, i32 7
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR44694:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT:    haddpd %xmm3, %xmm2
; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm2, %xmm1
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR44694:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm2
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR44694:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR44694:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-LABEL: PR44694:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %5 = fadd <4 x double> %3, %4
  ret <4 x double> %5
}

define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_1:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR45747_1:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR45747_1:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR45747_1:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %a
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_2:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR45747_2:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR45747_2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,1,1]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR45747_2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT:    retq
  %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %b
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_u123:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR34724_add_v4f32_u123:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE-SLOW-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-SLOW-NEXT:    addps %xmm2, %xmm0
; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm2
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_0u23:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm1, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %3, %0
  %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %6 = fadd <4 x float> %5, %1
  %7 = shufflevector <4 x float> %4, <4 x float> %6, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %8 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %9 = fadd <4 x float> %8, %1
  %10 = shufflevector <4 x float> %7, <4 x float> %9, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
  ret <4 x float> %10
}

define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_01u3:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddps %xmm0, %xmm0
; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm2
; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_01u3:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm1, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_01u3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_01u3:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 undef, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_012u(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_012u:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddps %xmm0, %xmm0
; SSE-SLOW-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-SLOW-NEXT:    addps %xmm1, %xmm2
; SSE-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_012u:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddps %xmm1, %xmm0
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_012u:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_012u:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  ret <4 x float> %9
}

define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_u123:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm2, %xmm1
; SSE-SLOW-NEXT:    movapd %xmm3, %xmm2
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT:    addsd %xmm3, %xmm2
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_u123:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm2
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_u123:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_u123:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-FAST-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 1
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_0u23:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm2, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm3, %xmm2
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT:    addsd %xmm3, %xmm2
; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_0u23:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    haddpd %xmm2, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_0u23:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_01u3:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm3, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE-SLOW-NEXT:    addsd %xmm3, %xmm1
; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_01u3:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm3, %xmm3
; SSE-FAST-NEXT:    movapd %xmm3, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-FAST-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT:    retq
;
; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; AVX512-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_012u:
; SSE-SLOW:       # %bb.0:
; SSE-SLOW-NEXT:    haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT:    movapd %xmm2, %xmm1
; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT:    addsd %xmm2, %xmm1
; SSE-SLOW-NEXT:    retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_012u:
; SSE-FAST:       # %bb.0:
; SSE-FAST-NEXT:    haddpd %xmm1, %xmm0
; SSE-FAST-NEXT:    haddpd %xmm2, %xmm2
; SSE-FAST-NEXT:    movapd %xmm2, %xmm1
; SSE-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_012u:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_012u:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT:    retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 0
  %11 = extractelement <4 x double> %1, i32 1
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 2
  ret <4 x double> %13
}