1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
8 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
9 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
11 define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
12 ; SSE3-LABEL: haddpd1:
14 ; SSE3-NEXT: haddpd %xmm1, %xmm0
19 ; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
21 %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
22 %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
23 %r = fadd <2 x double> %a, %b
27 define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
28 ; SSE3-LABEL: haddpd2:
30 ; SSE3-NEXT: haddpd %xmm1, %xmm0
35 ; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
37 %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
38 %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
39 %r = fadd <2 x double> %a, %b
43 define <2 x double> @haddpd3(<2 x double> %x) {
44 ; SSE3-SLOW-LABEL: haddpd3:
46 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
47 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
48 ; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
49 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
50 ; SSE3-SLOW-NEXT: retq
52 ; SSE3-FAST-LABEL: haddpd3:
54 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
55 ; SSE3-FAST-NEXT: retq
57 ; AVX-SLOW-LABEL: haddpd3:
59 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
60 ; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
63 ; AVX-FAST-LABEL: haddpd3:
65 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
67 %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
68 %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
69 %r = fadd <2 x double> %a, %b
73 define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
74 ; SSE3-LABEL: haddps1:
76 ; SSE3-NEXT: haddps %xmm1, %xmm0
81 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
83 %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
84 %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
85 %r = fadd <4 x float> %a, %b
89 define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
90 ; SSE3-LABEL: haddps2:
92 ; SSE3-NEXT: haddps %xmm1, %xmm0
97 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
99 %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
100 %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
101 %r = fadd <4 x float> %a, %b
105 define <4 x float> @haddps3(<4 x float> %x) {
106 ; SSE3-LABEL: haddps3:
108 ; SSE3-NEXT: haddps %xmm0, %xmm0
111 ; AVX-LABEL: haddps3:
113 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
115 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
116 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
117 %r = fadd <4 x float> %a, %b
121 define <4 x float> @haddps4(<4 x float> %x) {
122 ; SSE3-LABEL: haddps4:
124 ; SSE3-NEXT: haddps %xmm0, %xmm0
127 ; AVX-LABEL: haddps4:
129 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
131 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
132 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
133 %r = fadd <4 x float> %a, %b
137 define <4 x float> @haddps5(<4 x float> %x) {
138 ; SSE3-LABEL: haddps5:
140 ; SSE3-NEXT: haddps %xmm0, %xmm0
143 ; AVX-LABEL: haddps5:
145 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
147 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
148 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
149 %r = fadd <4 x float> %a, %b
153 define <4 x float> @haddps6(<4 x float> %x) {
154 ; SSE3-SLOW-LABEL: haddps6:
155 ; SSE3-SLOW: # %bb.0:
156 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
157 ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
158 ; SSE3-SLOW-NEXT: retq
160 ; SSE3-FAST-LABEL: haddps6:
161 ; SSE3-FAST: # %bb.0:
162 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
163 ; SSE3-FAST-NEXT: retq
165 ; AVX-SLOW-LABEL: haddps6:
167 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
168 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
169 ; AVX-SLOW-NEXT: retq
171 ; AVX-FAST-LABEL: haddps6:
173 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
174 ; AVX-FAST-NEXT: retq
175 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
176 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
177 %r = fadd <4 x float> %a, %b
181 define <4 x float> @haddps7(<4 x float> %x) {
182 ; SSE3-LABEL: haddps7:
184 ; SSE3-NEXT: haddps %xmm0, %xmm0
187 ; AVX-LABEL: haddps7:
189 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
191 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
192 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
193 %r = fadd <4 x float> %a, %b
197 define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
198 ; SSE3-LABEL: hsubpd1:
200 ; SSE3-NEXT: hsubpd %xmm1, %xmm0
203 ; AVX-LABEL: hsubpd1:
205 ; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
207 %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
208 %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
209 %r = fsub <2 x double> %a, %b
213 define <2 x double> @hsubpd2(<2 x double> %x) {
214 ; SSE3-SLOW-LABEL: hsubpd2:
215 ; SSE3-SLOW: # %bb.0:
216 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
217 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
218 ; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
219 ; SSE3-SLOW-NEXT: retq
221 ; SSE3-FAST-LABEL: hsubpd2:
222 ; SSE3-FAST: # %bb.0:
223 ; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
224 ; SSE3-FAST-NEXT: retq
226 ; AVX-SLOW-LABEL: hsubpd2:
228 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
229 ; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
230 ; AVX-SLOW-NEXT: retq
232 ; AVX-FAST-LABEL: hsubpd2:
234 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
235 ; AVX-FAST-NEXT: retq
236 %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
237 %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
238 %r = fsub <2 x double> %a, %b
242 define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
243 ; SSE3-LABEL: hsubps1:
245 ; SSE3-NEXT: hsubps %xmm1, %xmm0
248 ; AVX-LABEL: hsubps1:
250 ; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
252 %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
253 %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
254 %r = fsub <4 x float> %a, %b
258 define <4 x float> @hsubps2(<4 x float> %x) {
259 ; SSE3-LABEL: hsubps2:
261 ; SSE3-NEXT: hsubps %xmm0, %xmm0
264 ; AVX-LABEL: hsubps2:
266 ; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
268 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
269 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
270 %r = fsub <4 x float> %a, %b
274 define <4 x float> @hsubps3(<4 x float> %x) {
275 ; SSE3-LABEL: hsubps3:
277 ; SSE3-NEXT: hsubps %xmm0, %xmm0
280 ; AVX-LABEL: hsubps3:
282 ; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
284 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
285 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
286 %r = fsub <4 x float> %a, %b
290 define <4 x float> @hsubps4(<4 x float> %x) {
291 ; SSE3-SLOW-LABEL: hsubps4:
292 ; SSE3-SLOW: # %bb.0:
293 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
294 ; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
295 ; SSE3-SLOW-NEXT: retq
297 ; SSE3-FAST-LABEL: hsubps4:
298 ; SSE3-FAST: # %bb.0:
299 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
300 ; SSE3-FAST-NEXT: retq
302 ; AVX-SLOW-LABEL: hsubps4:
304 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
305 ; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
306 ; AVX-SLOW-NEXT: retq
308 ; AVX-FAST-LABEL: hsubps4:
310 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
311 ; AVX-FAST-NEXT: retq
312 %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
313 %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
314 %r = fsub <4 x float> %a, %b
318 define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
319 ; SSE3-LABEL: vhaddps1:
321 ; SSE3-NEXT: haddps %xmm2, %xmm0
322 ; SSE3-NEXT: haddps %xmm3, %xmm1
325 ; AVX-LABEL: vhaddps1:
327 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
329 %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
330 %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
331 %r = fadd <8 x float> %a, %b
335 define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
336 ; SSE3-LABEL: vhaddps2:
338 ; SSE3-NEXT: haddps %xmm2, %xmm0
339 ; SSE3-NEXT: haddps %xmm3, %xmm1
342 ; AVX-LABEL: vhaddps2:
344 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
346 %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
347 %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
348 %r = fadd <8 x float> %a, %b
352 define <8 x float> @vhaddps3(<8 x float> %x) {
353 ; SSE3-LABEL: vhaddps3:
355 ; SSE3-NEXT: haddps %xmm0, %xmm0
356 ; SSE3-NEXT: haddps %xmm1, %xmm1
359 ; AVX-LABEL: vhaddps3:
361 ; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
363 %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
364 %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
365 %r = fadd <8 x float> %a, %b
369 define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
370 ; SSE3-LABEL: vhsubps1:
372 ; SSE3-NEXT: hsubps %xmm2, %xmm0
373 ; SSE3-NEXT: hsubps %xmm3, %xmm1
376 ; AVX-LABEL: vhsubps1:
378 ; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
380 %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
381 %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
382 %r = fsub <8 x float> %a, %b
386 define <8 x float> @vhsubps3(<8 x float> %x) {
387 ; SSE3-LABEL: vhsubps3:
389 ; SSE3-NEXT: hsubps %xmm0, %xmm0
390 ; SSE3-NEXT: hsubps %xmm1, %xmm1
393 ; AVX-LABEL: vhsubps3:
395 ; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
397 %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
398 %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
399 %r = fsub <8 x float> %a, %b
403 define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
404 ; SSE3-LABEL: vhaddpd1:
406 ; SSE3-NEXT: haddpd %xmm2, %xmm0
407 ; SSE3-NEXT: haddpd %xmm3, %xmm1
410 ; AVX-LABEL: vhaddpd1:
412 ; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
414 %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
415 %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
416 %r = fadd <4 x double> %a, %b
420 define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
421 ; SSE3-LABEL: vhsubpd1:
423 ; SSE3-NEXT: hsubpd %xmm2, %xmm0
424 ; SSE3-NEXT: hsubpd %xmm3, %xmm1
427 ; AVX-LABEL: vhsubpd1:
429 ; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
431 %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
432 %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
433 %r = fsub <4 x double> %a, %b
437 define <2 x float> @haddps_v2f32(<4 x float> %v0) {
438 ; SSE3-LABEL: haddps_v2f32:
440 ; SSE3-NEXT: haddps %xmm0, %xmm0
443 ; AVX-LABEL: haddps_v2f32:
445 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
447 %v0.0 = extractelement <4 x float> %v0, i32 0
448 %v0.1 = extractelement <4 x float> %v0, i32 1
449 %v0.2 = extractelement <4 x float> %v0, i32 2
450 %v0.3 = extractelement <4 x float> %v0, i32 3
451 %op0 = fadd float %v0.0, %v0.1
452 %op1 = fadd float %v0.2, %v0.3
453 %res0 = insertelement <2 x float> undef, float %op0, i32 0
454 %res1 = insertelement <2 x float> %res0, float %op1, i32 1
455 ret <2 x float> %res1
458 ; 128-bit vectors, float/double, fadd/fsub
460 define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
461 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
462 ; SSE3-SLOW: # %bb.0:
463 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
464 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
465 ; SSE3-SLOW-NEXT: retq
467 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
468 ; SSE3-FAST: # %bb.0:
469 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
470 ; SSE3-FAST-NEXT: retq
472 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
474 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
475 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
476 ; AVX-SLOW-NEXT: retq
478 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
480 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
481 ; AVX-FAST-NEXT: retq
482 %x0 = extractelement <4 x float> %x, i32 0
483 %x1 = extractelement <4 x float> %x, i32 1
484 %x01 = fadd float %x0, %x1
488 define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
489 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
490 ; SSE3-SLOW: # %bb.0:
491 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
492 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
493 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
494 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
495 ; SSE3-SLOW-NEXT: retq
497 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
498 ; SSE3-FAST: # %bb.0:
499 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
500 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
501 ; SSE3-FAST-NEXT: retq
503 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
505 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
506 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
507 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
508 ; AVX-SLOW-NEXT: retq
510 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
512 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
513 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
514 ; AVX-FAST-NEXT: retq
515 %x0 = extractelement <4 x float> %x, i32 2
516 %x1 = extractelement <4 x float> %x, i32 3
517 %x01 = fadd float %x0, %x1
521 define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
522 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
523 ; SSE3-SLOW: # %bb.0:
524 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
525 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
526 ; SSE3-SLOW-NEXT: retq
528 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
529 ; SSE3-FAST: # %bb.0:
530 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
531 ; SSE3-FAST-NEXT: retq
533 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
535 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
536 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
537 ; AVX-SLOW-NEXT: retq
539 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
541 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
542 ; AVX-FAST-NEXT: retq
543 %x0 = extractelement <4 x float> %x, i32 0
544 %x1 = extractelement <4 x float> %x, i32 1
545 %x01 = fadd float %x1, %x0
549 define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
550 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
551 ; SSE3-SLOW: # %bb.0:
552 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
553 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
554 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
555 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
556 ; SSE3-SLOW-NEXT: retq
558 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
559 ; SSE3-FAST: # %bb.0:
560 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
561 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
562 ; SSE3-FAST-NEXT: retq
564 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
566 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
567 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
568 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
569 ; AVX-SLOW-NEXT: retq
571 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
573 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
574 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
575 ; AVX-FAST-NEXT: retq
576 %x0 = extractelement <4 x float> %x, i32 2
577 %x1 = extractelement <4 x float> %x, i32 3
578 %x01 = fadd float %x1, %x0
582 define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
583 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
584 ; SSE3-SLOW: # %bb.0:
585 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
586 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
587 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
588 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
589 ; SSE3-SLOW-NEXT: retq
591 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
592 ; SSE3-FAST: # %bb.0:
593 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
594 ; SSE3-FAST-NEXT: retq
596 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
598 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
599 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
600 ; AVX-SLOW-NEXT: retq
602 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
604 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
605 ; AVX-FAST-NEXT: retq
606 %x0 = extractelement <2 x double> %x, i32 0
607 %x1 = extractelement <2 x double> %x, i32 1
608 %x01 = fadd double %x0, %x1
612 define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
613 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
614 ; SSE3-SLOW: # %bb.0:
615 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
616 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
617 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
618 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
619 ; SSE3-SLOW-NEXT: retq
621 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
622 ; SSE3-FAST: # %bb.0:
623 ; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
624 ; SSE3-FAST-NEXT: retq
626 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
628 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
629 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
630 ; AVX-SLOW-NEXT: retq
632 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
634 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
635 ; AVX-FAST-NEXT: retq
636 %x0 = extractelement <2 x double> %x, i32 0
637 %x1 = extractelement <2 x double> %x, i32 1
638 %x01 = fadd double %x1, %x0
642 define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
643 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
644 ; SSE3-SLOW: # %bb.0:
645 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
646 ; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
647 ; SSE3-SLOW-NEXT: retq
649 ; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
650 ; SSE3-FAST: # %bb.0:
651 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
652 ; SSE3-FAST-NEXT: retq
654 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
656 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
657 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
658 ; AVX-SLOW-NEXT: retq
660 ; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
662 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
663 ; AVX-FAST-NEXT: retq
664 %x0 = extractelement <4 x float> %x, i32 0
665 %x1 = extractelement <4 x float> %x, i32 1
666 %x01 = fsub float %x0, %x1
670 define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
671 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
672 ; SSE3-SLOW: # %bb.0:
673 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
674 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
675 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
676 ; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
677 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
678 ; SSE3-SLOW-NEXT: retq
680 ; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
681 ; SSE3-FAST: # %bb.0:
682 ; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
683 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
684 ; SSE3-FAST-NEXT: retq
686 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
688 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
689 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
690 ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
691 ; AVX-SLOW-NEXT: retq
693 ; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
695 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
696 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
697 ; AVX-FAST-NEXT: retq
698 %x0 = extractelement <4 x float> %x, i32 2
699 %x1 = extractelement <4 x float> %x, i32 3
700 %x01 = fsub float %x0, %x1
704 define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
705 ; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
707 ; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
708 ; SSE3-NEXT: subss %xmm0, %xmm1
709 ; SSE3-NEXT: movaps %xmm1, %xmm0
712 ; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
714 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
715 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
717 %x0 = extractelement <4 x float> %x, i32 0
718 %x1 = extractelement <4 x float> %x, i32 1
719 %x01 = fsub float %x1, %x0
723 define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
724 ; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
726 ; SSE3-NEXT: movaps %xmm0, %xmm1
727 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
728 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
729 ; SSE3-NEXT: subss %xmm1, %xmm0
732 ; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
734 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
735 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
736 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
738 %x0 = extractelement <4 x float> %x, i32 2
739 %x1 = extractelement <4 x float> %x, i32 3
740 %x01 = fsub float %x1, %x0
744 define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
745 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
746 ; SSE3-SLOW: # %bb.0:
747 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
748 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
749 ; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
750 ; SSE3-SLOW-NEXT: retq
752 ; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
753 ; SSE3-FAST: # %bb.0:
754 ; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
755 ; SSE3-FAST-NEXT: retq
757 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
759 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
760 ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
761 ; AVX-SLOW-NEXT: retq
763 ; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
765 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
766 ; AVX-FAST-NEXT: retq
767 %x0 = extractelement <2 x double> %x, i32 0
768 %x1 = extractelement <2 x double> %x, i32 1
769 %x01 = fsub double %x0, %x1
773 define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
774 ; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
776 ; SSE3-NEXT: movapd %xmm0, %xmm1
777 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
778 ; SSE3-NEXT: subsd %xmm0, %xmm1
779 ; SSE3-NEXT: movapd %xmm1, %xmm0
782 ; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
784 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
785 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
787 %x0 = extractelement <2 x double> %x, i32 0
788 %x1 = extractelement <2 x double> %x, i32 1
789 %x01 = fsub double %x1, %x0
793 ; 256-bit vectors, float/double, fadd/fsub
795 define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
796 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
797 ; SSE3-SLOW: # %bb.0:
798 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
799 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
800 ; SSE3-SLOW-NEXT: retq
802 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
803 ; SSE3-FAST: # %bb.0:
804 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
805 ; SSE3-FAST-NEXT: retq
807 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
809 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
810 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
811 ; AVX-SLOW-NEXT: vzeroupper
812 ; AVX-SLOW-NEXT: retq
814 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
816 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
817 ; AVX-FAST-NEXT: vzeroupper
818 ; AVX-FAST-NEXT: retq
819 %x0 = extractelement <8 x float> %x, i32 0
820 %x1 = extractelement <8 x float> %x, i32 1
821 %x01 = fadd float %x0, %x1
825 define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
826 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
827 ; SSE3-SLOW: # %bb.0:
828 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
829 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
830 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
831 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
832 ; SSE3-SLOW-NEXT: retq
834 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
835 ; SSE3-FAST: # %bb.0:
836 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
837 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
838 ; SSE3-FAST-NEXT: retq
840 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
842 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
843 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
844 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
845 ; AVX-SLOW-NEXT: vzeroupper
846 ; AVX-SLOW-NEXT: retq
848 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
850 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
851 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
852 ; AVX-FAST-NEXT: vzeroupper
853 ; AVX-FAST-NEXT: retq
854 %x0 = extractelement <8 x float> %x, i32 2
855 %x1 = extractelement <8 x float> %x, i32 3
856 %x01 = fadd float %x0, %x1
860 define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
861 ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
862 ; SSE3-SLOW: # %bb.0:
863 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
864 ; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
865 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
866 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
867 ; SSE3-SLOW-NEXT: retq
869 ; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
870 ; SSE3-FAST: # %bb.0:
871 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
872 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
873 ; SSE3-FAST-NEXT: retq
875 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
877 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
878 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
879 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
880 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
881 ; AVX-SLOW-NEXT: vzeroupper
882 ; AVX-SLOW-NEXT: retq
884 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
886 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
887 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
888 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
889 ; AVX-FAST-NEXT: vzeroupper
890 ; AVX-FAST-NEXT: retq
891 %x0 = extractelement <8 x float> %x, i32 6
892 %x1 = extractelement <8 x float> %x, i32 7
893 %x01 = fadd float %x0, %x1
897 define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
898 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
899 ; SSE3-SLOW: # %bb.0:
900 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
901 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
902 ; SSE3-SLOW-NEXT: retq
904 ; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
905 ; SSE3-FAST: # %bb.0:
906 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
907 ; SSE3-FAST-NEXT: retq
909 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
911 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
912 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
913 ; AVX-SLOW-NEXT: vzeroupper
914 ; AVX-SLOW-NEXT: retq
916 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
918 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
919 ; AVX-FAST-NEXT: vzeroupper
920 ; AVX-FAST-NEXT: retq
921 %x0 = extractelement <8 x float> %x, i32 0
922 %x1 = extractelement <8 x float> %x, i32 1
923 %x01 = fadd float %x1, %x0
927 define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
928 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
929 ; SSE3-SLOW: # %bb.0:
930 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
931 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
932 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
933 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
934 ; SSE3-SLOW-NEXT: retq
936 ; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
937 ; SSE3-FAST: # %bb.0:
938 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
939 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
940 ; SSE3-FAST-NEXT: retq
942 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
944 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
945 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
946 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
947 ; AVX-SLOW-NEXT: vzeroupper
948 ; AVX-SLOW-NEXT: retq
950 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
952 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
953 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
954 ; AVX-FAST-NEXT: vzeroupper
955 ; AVX-FAST-NEXT: retq
956 %x0 = extractelement <8 x float> %x, i32 2
957 %x1 = extractelement <8 x float> %x, i32 3
958 %x01 = fadd float %x1, %x0
962 define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
963 ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
964 ; SSE3-SLOW: # %bb.0:
965 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
966 ; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
967 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
968 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
969 ; SSE3-SLOW-NEXT: retq
971 ; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
972 ; SSE3-FAST: # %bb.0:
973 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
974 ; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
975 ; SSE3-FAST-NEXT: retq
977 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
979 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
980 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
981 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
982 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
983 ; AVX-SLOW-NEXT: vzeroupper
984 ; AVX-SLOW-NEXT: retq
986 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
988 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
989 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
990 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
991 ; AVX-FAST-NEXT: vzeroupper
992 ; AVX-FAST-NEXT: retq
993 %x0 = extractelement <8 x float> %x, i32 6
994 %x1 = extractelement <8 x float> %x, i32 7
995 %x01 = fadd float %x1, %x0
; x0 + x1 from the low 128 bits of a v4f64; fast-hops targets fold the
; extract/extract/fadd into a single (v)haddpd on the low xmm half.
define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 0
%x1 = extractelement <4 x double> %x, i32 1
%x01 = fadd double %x0, %x1
; x0 + x1 from the high 128 bits (lanes 2,3) of a v4f64; AVX extracts the
; upper half first, then fast-hops targets use (v)haddpd.
define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x0, %x1
; Same as extract_extract01_v4f64_fadd_f64 with the fadd operands commuted
; (x1 + x0); fadd is commutative, so (v)haddpd is still formed on fast-hops.
define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 0
%x1 = extractelement <4 x double> %x, i32 1
%x01 = fadd double %x1, %x0
; Commuted add (x1 + x0) of the high-half f64 lanes 2 and 3 of a v4f64;
; still folds to (v)haddpd on fast-hops targets after the high-half extract.
define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x1, %x0
; x0 - x1 from the low f32 lanes 0 and 1 of a v8f32; the subtract order
; matches horizontal-sub semantics, so fast-hops targets form (v)hsubps.
define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 0
%x1 = extractelement <8 x float> %x, i32 1
%x01 = fsub float %x0, %x1
; x0 - x1 from f32 lanes 2 and 3 (still the low 128 bits) of a v8f32;
; fast-hops targets use (v)hsubps and then move the second hsub result down.
define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 2
%x1 = extractelement <8 x float> %x, i32 3
%x01 = fsub float %x0, %x1
; x0 - x1 from the high-half f32 lanes 4 and 5 of a v8f32; AVX must extract
; the upper 128 bits before (v)hsubps on fast-hops targets.
define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 4
%x1 = extractelement <8 x float> %x, i32 5
%x01 = fsub float %x0, %x1
; Negative test: a commuted fsub (x1 - x0) does not match horizontal-sub
; semantics, so no hsub is formed. (It could be lowered as hsub + negate.)
; x1 - x0 of v8f32 lanes 0 and 1: all prefixes emit a plain shuffle + subss,
; never hsubps, because the operand order is reversed from hsub semantics.
define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
%x0 = extractelement <8 x float> %x, i32 0
%x1 = extractelement <8 x float> %x, i32 1
%x01 = fsub float %x1, %x0
; x0 - x1 from the low f64 lanes 0 and 1 of a v4f64; fast-hops targets fold
; to a single (v)hsubpd.
define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 0
%x1 = extractelement <4 x double> %x, i32 1
%x01 = fsub double %x0, %x1
; Negative test: a commuted fsub (x1 - x0) does not match horizontal-sub
; semantics, so no hsub is formed. (It could be lowered as hsub + negate.)
; x1 - x0 of v4f64 lanes 0 and 1: all prefixes emit shuffle + subsd, never
; hsubpd, because the operand order is reversed from hsub semantics.
define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
%x0 = extractelement <4 x double> %x, i32 0
%x1 = extractelement <4 x double> %x, i32 1
%x01 = fsub double %x1, %x0
1307 ; 512-bit vectors, float/double, fadd/fsub
; 512-bit source: x0 + x1 from lanes 0 and 1 of a v16f32. Only the low
; 128 bits are used, so lowering matches the 128-bit case ((v)haddps on
; fast-hops targets).
define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <16 x float> %x, i32 0
%x1 = extractelement <16 x float> %x, i32 1
%x01 = fadd float %x0, %x1
; Commuted (x1 + x0) variant of the v16f32 lane-0/1 add; fadd commutativity
; still allows (v)haddps on fast-hops targets.
define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <16 x float> %x, i32 0
%x1 = extractelement <16 x float> %x, i32 1
%x01 = fadd float %x1, %x0
; 512-bit source: x0 + x1 from lanes 0 and 1 of a v8f64; only the low
; 128 bits are used, so fast-hops targets fold to (v)haddpd.
define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x double> %x, i32 0
%x1 = extractelement <8 x double> %x, i32 1
%x01 = fadd double %x0, %x1
; Commuted (x1 + x0) variant of the v8f64 lane-0/1 add; (v)haddpd is still
; formed on fast-hops targets.
define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x double> %x, i32 0
%x1 = extractelement <8 x double> %x, i32 1
%x01 = fadd double %x1, %x0
; 512-bit source: x0 - x1 from lanes 0 and 1 of a v16f32; lowering matches
; the 128-bit case ((v)hsubps on fast-hops targets).
define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <16 x float> %x, i32 0
%x1 = extractelement <16 x float> %x, i32 1
%x01 = fsub float %x0, %x1
; Negative test: x1 - x0 of v16f32 lanes 0 and 1 is reversed from hsub
; semantics, so no hsubps is formed on any prefix.
define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
%x0 = extractelement <16 x float> %x, i32 0
%x1 = extractelement <16 x float> %x, i32 1
%x01 = fsub float %x1, %x0
; 512-bit source: x0 - x1 from lanes 0 and 1 of a v8f64; fast-hops targets
; fold to (v)hsubpd, matching the 128-bit case.
define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x double> %x, i32 0
%x1 = extractelement <8 x double> %x, i32 1
%x01 = fsub double %x0, %x1
; Negative test: x1 - x0 of v8f64 lanes 0 and 1 is reversed from hsub
; semantics, so no hsubpd is formed on any prefix.
define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
%x0 = extractelement <8 x double> %x, i32 0
%x1 = extractelement <8 x double> %x, i32 1
%x01 = fsub double %x1, %x0
1535 ; Check output when 1 or both extracts have extra uses.
; Extra-use test: lane 0 is also stored to %p. The extra use of x0 does not
; block hadd formation - fast-hops targets still emit (v)haddps after the
; scalar store.
define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movss %xmm0, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x float> %x, i32 0
store float %x0, float* %p
%x1 = extractelement <4 x float> %x, i32 1
%x01 = fadd float %x0, %x1
; Extra-use test: lane 1 is also stored to %p. hadd is still formed on
; fast-hops targets; AVX-FAST stores the lane directly with vextractps.
define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: movss %xmm1, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x float> %x, i32 0
%x1 = extractelement <4 x float> %x, i32 1
store float %x1, float* %p
%x01 = fadd float %x0, %x1
; Extra-use test: both extracted lanes are stored. With extra uses on both
; operands no hadd is formed on any prefix - all targets emit shuffle + addss.
define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) {
; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; SSE3-NEXT: movss %xmm0, (%rdi)
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: movss %xmm1, (%rsi)
; SSE3-NEXT: addss %xmm1, %xmm0
; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss %xmm1, (%rsi)
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
%x0 = extractelement <4 x float> %x, i32 0
store float %x0, float* %p1
%x1 = extractelement <4 x float> %x, i32 1
store float %x1, float* %p2
%x01 = fadd float %x0, %x1
1628 ; Repeat tests from general reductions to verify output for hoppy targets:
1629 ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
1631 declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
1632 declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>)
; Fast fadd reduction of a v8f32 plus accumulator %a0 (PR38971). Fast-hops
; targets use a single (v)haddps only for the final pairwise step; the wider
; steps are plain vector adds.
define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: addps %xmm2, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm2
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: addss %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1)
; Fast fadd reduction of a v4f64 plus accumulator %a0 (PR38971). Fast-hops
; targets replace the final shuffle+add with a single (v)haddpd.
define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2
; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: addpd %xmm2, %xmm1
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm1
; SSE3-FAST-NEXT: addsd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1)
1721 define float @PR39936_v8f32(<8 x float>) {
1722 ; SSSE3-SLOW-LABEL: PR39936_v8f32:
1723 ; SSSE3-SLOW: # %bb.0:
1724 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
1725 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
1726 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
1727 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
1728 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
1729 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1730 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
1731 ; SSSE3-SLOW-NEXT: retq
1733 ; SSSE3-FAST-LABEL: PR39936_v8f32:
1734 ; SSSE3-FAST: # %bb.0:
1735 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
1736 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
1737 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
1738 ; SSSE3-FAST-NEXT: retq
1740 ; SSE3-SLOW-LABEL: PR39936_v8f32:
1741 ; SSE3-SLOW: # %bb.0:
1742 ; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0
1743 ; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0
1744 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1745 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
1746 ; SSE3-SLOW-NEXT: retq
1748 ; SSE3-FAST-LABEL: PR39936_v8f32:
1749 ; SSE3-FAST: # %bb.0:
1750 ; SSE3-FAST-NEXT: haddps %xmm1, %xmm0
1751 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1752 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
1753 ; SSE3-FAST-NEXT: retq
1755 ; AVX-SLOW-LABEL: PR39936_v8f32:
1756 ; AVX-SLOW: # %bb.0:
1757 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1758 ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1759 ; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1760 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1761 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1762 ; AVX-SLOW-NEXT: vzeroupper
1763 ; AVX-SLOW-NEXT: retq
1765 ; AVX-FAST-LABEL: PR39936_v8f32:
1766 ; AVX-FAST: # %bb.0:
1767 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
1768 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1769 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1770 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1771 ; AVX-FAST-NEXT: vzeroupper
1772 ; AVX-FAST-NEXT: retq
1773 %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
1774 %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
1775 %4 = fadd <8 x float> %2, %3
1776 %5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1777 %6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1778 %7 = fadd <8 x float> %5, %6
1779 %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1780 %9 = fadd <8 x float> %7, %8
1781 %10 = extractelement <8 x float> %9, i32 0