; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps4:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps5:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps7:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm2, %xmm0
; SSE3-NEXT: hsubps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: hsubps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm2, %xmm0
; SSE3-NEXT: haddpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm2, %xmm0
; SSE3-NEXT: hsubpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: subss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 4
  %x1 = extractelement <8 x float> %x, i32 5
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; Negative test...or get hoppy and negate?

define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; Negative test...or get hoppy and negate?

define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 512-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; Check output when 1 or both extracts have extra uses.

define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movss %xmm0, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: movss %xmm1, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) {
; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; SSE3: # %bb.0:
; SSE3-NEXT: movss %xmm0, (%rdi)
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: movss %xmm1, (%rsi)
; SSE3-NEXT: addss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; AVX: # %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss %xmm1, (%rsi)
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p1
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p2
  %x01 = fadd float %x0, %x1
  ret float %x01
}

; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971

declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)

define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: addss %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %r
}

define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2
; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2
; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2
; SSE3-FAST-NEXT: addsd %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %r
}
1718 define float @PR39936_v8f32(<8 x float>) {
; SSE3-SLOW-LABEL: PR39936_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: PR39936_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR39936_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR39936_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
%3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%4 = fadd <8 x float> %2, %3
%5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%7 = fadd <8 x float> %5, %6
%8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%9 = fadd <8 x float> %7, %8
%10 = extractelement <8 x float> %9, i32 0
ret float %10
}

define float @hadd32_4(<4 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%x227 = fadd <4 x float> %x225, %x226
%x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%x229 = fadd <4 x float> %x227, %x228
%x230 = extractelement <4 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_8(<8 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_8:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_8:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_8:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_8:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <8 x float> %x225, %x226
%x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <8 x float> %x227, %x228
%x230 = extractelement <8 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_16(<16 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <16 x float> %x225, %x226
%x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <16 x float> %x227, %x228
%x230 = extractelement <16 x float> %x229, i32 0
ret float %x230
}
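
; Repeat the partial reductions with optsize: forming the final haddps saves
; code size even on slow-hop targets, so the SLOW and FAST outputs converge
; into shared SSE3/AVX checks below.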

define float @hadd32_4_optsize(<4 x float> %x225) optsize {
; SSE3-LABEL: hadd32_4_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm0, %xmm1
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%x227 = fadd <4 x float> %x225, %x226
%x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%x229 = fadd <4 x float> %x227, %x228
%x230 = extractelement <4 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_8_optsize(<8 x float> %x225) optsize {
; SSE3-LABEL: hadd32_8_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm0, %xmm1
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <8 x float> %x225, %x226
%x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <8 x float> %x227, %x228
%x230 = extractelement <8 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_16_optsize(<16 x float> %x225) optsize {
; SSE3-LABEL: hadd32_16_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm0, %xmm1
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <16 x float> %x225, %x226
%x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <16 x float> %x227, %x228
%x230 = extractelement <16 x float> %x229, i32 0
ret float %x230
}

define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd <8 x float> %x, %x23
%x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd nsz reassoc <8 x float> %x0213, %x13
%r = extractelement <8 x float> %x0123, i32 0
ret float %r
}

; Negative test - only the fast-math flags on the final fadd in the
; sequence determine whether we can transform to horizontal ops.
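; For example (illustrative sketch of the IR below): in
;   %x0213 = fadd fast <8 x float> %x, %x23            ; flags here do not gate the fold
;   %x0123 = fadd ninf nnan <8 x float> %x0213, %x13   ; missing reassoc+nsz
; the final fadd lacks 'reassoc' and 'nsz', so slow-hop targets keep the
; shuffle+add lowering instead of forming haddps.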

define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd fast <8 x float> %x, %x23
%x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd ninf nnan <8 x float> %x0213, %x13
%r = extractelement <8 x float> %x0123, i32 0
ret float %r
}

define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd <16 x float> %x, %x23
%x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd reassoc nsz <16 x float> %x0213, %x13
%r = extractelement <16 x float> %x0123, i32 0
ret float %r
}