; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
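
; Note: the -SLOW/-FAST check-prefix pairs distinguish the runs without and
; with the fast-hops attribute; on fast-hops targets, horizontal add/sub
; instructions are treated as cheap and are preferred over the
; shuffle-plus-full-width-add sequences that generic targets get.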
define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}
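
; Note: in haddpd1, %a gathers the even-index elements and %b the odd-index
; elements of the concatenated inputs, so the fadd computes
; { x[0]+x[1], y[0]+y[1] }, which is exactly the semantics of (v)haddpd.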

define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}
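
; Note: haddpd3 is the first point where the -SLOW and -FAST prefixes
; diverge: with a single source vector, targets without fast-hops lower the
; pattern to a shuffle plus a full-width add, while fast-hops targets keep
; the single haddpd.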

define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps4:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps5:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps7:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}
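
; Note: AVX horizontal ops work within each 128-bit lane, so the 256-bit
; shuffle masks above interleave per lane (0,2,8,10 in the low lane and
; 4,6,12,14 in the high lane) rather than across the whole vector; SSE3
; simply splits the operation into two 128-bit haddps instructions.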

define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm2, %xmm0
; SSE3-NEXT: hsubps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: hsubps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm2, %xmm0
; SSE3-NEXT: haddpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm2, %xmm0
; SSE3-NEXT: hsubpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT: subss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x0, %x1
  ret float %x01
}
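
; Note: for elements 6 and 7 the pair lives in the upper 128-bit half of the
; ymm register, so the AVX versions first pull it down with vextractf128
; (the SSE3 versions simply read %xmm1) before doing the horizontal add.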

define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 4
  %x1 = extractelement <8 x float> %x, i32 5
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; Negative test...or get hoppy and negate?
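; (hsubps computes the left element minus the right element of each pair, so
; the commuted subtraction x1 - x0 below cannot map to a plain hsub; it would
; additionally need a negate, hence the shuffle + subss lowering.)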
define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; Negative test...or get hoppy and negate?

define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 512-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; Check output when 1 or both extracts have extra uses.

define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, ptr %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movss %xmm0, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, ptr %p
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, ptr %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: movss %xmm1, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, ptr %p
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, ptr %p1, ptr %p2) {
; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; SSE3: # %bb.0:
; SSE3-NEXT: movss %xmm0, (%rdi)
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: movss %xmm1, (%rsi)
; SSE3-NEXT: addss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; AVX: # %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss %xmm1, (%rsi)
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, ptr %p1
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, ptr %p2
  %x01 = fadd float %x0, %x1
  ret float %x01
}
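
; Note: in uses1 and uses2 the hadd is still formed on fast-hops targets even
; though one extracted element has an extra use, but in uses3, where both
; elements are stored, the plain scalar addss is kept on all targets.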

; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971

declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
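
; Note: the reductions below use 'call fast'; the fast-math flags permit
; reassociation, which is what allows the strictly ordered reduction to be
; lowered as a log2(n) tree of full-width and horizontal adds.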

define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: addss %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %r
}

define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2
; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2
; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2
; SSE3-FAST-NEXT: addsd %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %r
}

define float @PR39936_v8f32(<8 x float>) {
; SSE3-SLOW-LABEL: PR39936_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: PR39936_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR39936_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR39936_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
%3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%4 = fadd <8 x float> %2, %3
%5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%7 = fadd <8 x float> %5, %6
%8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%9 = fadd <8 x float> %7, %8
%10 = extractelement <8 x float> %9, i32 0
ret float %10
}

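; The hadd32_* tests reduce only the low 4 elements of the source, so the wider
; inputs produce the same asm with the upper lanes ignored.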
define float @hadd32_4(<4 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%x227 = fadd <4 x float> %x225, %x226
%x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%x229 = fadd <4 x float> %x227, %x228
%x230 = extractelement <4 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_8(<8 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_8:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_8:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_8:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_8:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <8 x float> %x225, %x226
%x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <8 x float> %x227, %x228
%x230 = extractelement <8 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_16(<16 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <16 x float> %x225, %x226
%x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <16 x float> %x227, %x228
%x230 = extractelement <16 x float> %x229, i32 0
ret float %x230
}

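; With optsize the SLOW and FAST asm converge: the final two-element step becomes
; a single (v)haddps, which is smaller than the shuffle+addss pair used above.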
define float @hadd32_4_optsize(<4 x float> %x225) optsize {
; SSE3-LABEL: hadd32_4_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%x227 = fadd <4 x float> %x225, %x226
%x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%x229 = fadd <4 x float> %x227, %x228
%x230 = extractelement <4 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_8_optsize(<8 x float> %x225) optsize {
; SSE3-LABEL: hadd32_8_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <8 x float> %x225, %x226
%x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <8 x float> %x227, %x228
%x230 = extractelement <8 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_16_optsize(<16 x float> %x225) optsize {
; SSE3-LABEL: hadd32_16_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <16 x float> %x225, %x226
%x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <16 x float> %x227, %x228
%x230 = extractelement <16 x float> %x229, i32 0
ret float %x230
}

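; The pgso variants attach !prof !14 (function_entry_count 0), so profile-guided
; size optimization treats them as cold and lowers them like the optsize versions.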
define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_4_pgso:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%x227 = fadd <4 x float> %x225, %x226
%x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%x229 = fadd <4 x float> %x227, %x228
%x230 = extractelement <4 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_8_pgso:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <8 x float> %x225, %x226
%x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <8 x float> %x227, %x228
%x230 = extractelement <8 x float> %x229, i32 0
ret float %x230
}

define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_16_pgso:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = fadd <16 x float> %x225, %x226
%x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x229 = fadd <16 x float> %x227, %x228
%x230 = extractelement <16 x float> %x229, i32 0
ret float %x230
}

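; reassoc and nsz on the final fadd are enough to allow the horizontal-op
; transform; contrast with the wrong-flags negative test below.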
define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd <8 x float> %x, %x23
%x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd nsz reassoc <8 x float> %x0213, %x13
%r = extractelement <8 x float> %x0123, i32 0
ret float %r
}

; Negative test - only the flags on the final math op in the
; sequence determine whether we can transform to horizontal ops.

define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd fast <8 x float> %x, %x23
%x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd ninf nnan <8 x float> %x0213, %x13
%r = extractelement <8 x float> %x0123, i32 0
ret float %r
}

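; Same partial reduction from a <16 x float> source; only the low 4 elements
; feed the result.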
define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = fadd <16 x float> %x, %x23
%x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0123 = fadd reassoc nsz <16 x float> %x0213, %x13
%r = extractelement <16 x float> %x0123, i32 0
ret float %r
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}