; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512-FAST
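
; These tests check that horizontal add/sub instructions (haddps/haddpd,
; hsubps/hsubpd and their AVX vh* forms) are formed from shuffle+fadd/fsub
; and extractelement+fadd/fsub patterns. The -SLOW/-FAST check prefixes
; correspond to targets without and with the fast-hops attribute.
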
define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps5:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,2,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps5:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps5:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps5:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps7:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps7:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps7:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps7:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE3-SLOW-NEXT: subps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT: subps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-SLOW-LABEL: vhaddps3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT: addps %xmm3, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: vhaddps3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: vhaddps3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX-SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: vhaddps3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm2, %xmm0
; SSE3-NEXT: hsubps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-SLOW-LABEL: vhsubps3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE3-SLOW-NEXT: subps %xmm1, %xmm2
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT: subps %xmm0, %xmm3
; SSE3-SLOW-NEXT: movaps %xmm3, %xmm0
; SSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: vhsubps3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm1
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: vhsubps3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX-SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: vhsubps3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm2, %xmm0
; SSE3-NEXT: haddpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm2, %xmm0
; SSE3-NEXT: hsubpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub

define float @extract_extract_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v2f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v2f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v2f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v2f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v2f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v2f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v2f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v2f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v2f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v2f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract_v2f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract_v2f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub

define float @extract_extract_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; Negative test...or get hoppy and negate?
; (hsubps subtracts the odd element from the even one, so the commuted
; x1 - x0 would need an extra negation to use a horizontal op.)

define float @extract_extract_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract_v8f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract_v8f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; Negative test...or get hoppy and negate?
; (Same as above: the commuted fsub would need a negate after hsubpd.)

define double @extract_extract_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract_v4f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract_v4f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 512-bit vectors, float/double, fadd/fsub

define float @extract_extract_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v16f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v16f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v16f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v16f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v16f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v16f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v16f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v16f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v8f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v8f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v8f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v8f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v16f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v16f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v16f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v16f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract_v16f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract_v16f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v8f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v8f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v8f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract_v8f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract_v8f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; Check output when 1 or both extracts have extra uses.

define float @extract_extract_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movss %xmm0, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: movss %xmm1, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) {
; SSE3-LABEL: extract_extract_v4f32_fadd_f32_uses3:
; SSE3: # %bb.0:
; SSE3-NEXT: movss %xmm0, (%rdi)
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: movss %xmm1, (%rsi)
; SSE3-NEXT: addss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract_v4f32_fadd_f32_uses3:
; AVX: # %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss %xmm1, (%rsi)
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p1
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p2
  %x01 = fadd float %x0, %x1
  ret float %x01
}

; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971

declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)

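; The reduction calls below carry the 'fast' flag, so the backend may
; reassociate them and, on fast-hops targets, lower the final pairwise
; step to haddps/haddpd as checked in the -FAST runs.
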
define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: addps %xmm2, %xmm1
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %r
}

define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: addpd %xmm2, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-FAST-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %r
}