; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX

; Verify that we correctly generate 'addsub' instructions from
; a sequence of vector extracts + float add/sub + vector inserts.
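; (An 'addsub' subtracts in the even lanes and adds in the odd lanes.)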

define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
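
; test2: only lane 2 (fsub) and lane 3 (fadd) are written; the other lanes
; are left undef.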
define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}
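
; test3: only lane 0 (fsub) and lane 3 (fadd) are written.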
define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add = fadd float %4, %3
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
  ret <4 x float> %vecinsert2
}
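
; test4: only lane 2 (fsub) and lane 1 (fadd) are written.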
define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
  ret <4 x float> %vecinsert2
}
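
; test5: only lane 0 (fsub) and lane 1 (fadd) are written.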
define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
  ret <4 x float> %vecinsert2
}
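
; test6: the same full pattern as test1; it should likewise become a
; single addsub.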
define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
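
; test7: the <4 x double> variant: two addsubpd on SSE, a single 256-bit
; vaddsubpd on AVX.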
define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm2, %xmm0
; SSE-NEXT:    addsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x double> %A, i32 0
  %2 = extractelement <4 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <4 x double> %A, i32 2
  %4 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <4 x double> %A, i32 1
  %6 = extractelement <4 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <4 x double> %A, i32 3
  %8 = extractelement <4 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
  ret <4 x double> %vecinsert4
}
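
; test8: the <2 x double> variant: fsub in lane 0, fadd in lane 1.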
define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %A, i32 0
  %2 = extractelement <2 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <2 x double> %A, i32 1
  %4 = extractelement <2 x double> %B, i32 1
  %add = fadd double %3, %4
  %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
  ret <2 x double> %vecinsert2
}
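
; test9: the <8 x float> variant: two addsubps on SSE, a single 256-bit
; vaddsubps on AVX.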
define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm2, %xmm0
; SSE-NEXT:    addsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <8 x float> %A, i32 0
  %2 = extractelement <8 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <8 x float> %A, i32 2
  %4 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <8 x float> %A, i32 1
  %6 = extractelement <8 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <8 x float> %A, i32 3
  %8 = extractelement <8 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <8 x float> %A, i32 4
  %10 = extractelement <8 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <8 x float> %A, i32 6
  %12 = extractelement <8 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <8 x float> %A, i32 5
  %14 = extractelement <8 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <8 x float> %A, i32 7
  %16 = extractelement <8 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
  ret <8 x float> %vecinsert8
}

; Verify that we don't generate 'addsub' instructions for the following
; functions.
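; Each of them only partially matches the pattern: a single sub or add lane
; (test10-test13), subs without adds or adds without subs (test14, test15),
; or undef operands (test16).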

define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  ret <4 x float> %vecinsert1
}
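
; test11: a single fsub on lane 2 is not enough to form an addsub.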
define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: test11:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  ret <4 x float> %vecinsert1
}
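
; test12: a single fadd on lane 1 is not enough to form an addsub.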
define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  ret <4 x float> %vecinsert1
}
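
; test13: a single fadd on lane 3 is not enough to form an addsub.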
define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test13:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 3
  %2 = extractelement <4 x float> %B, i32 3
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
  ret <4 x float> %vecinsert1
}
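
; test14: two fsubs (lanes 0 and 2) without any fadd.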
define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
  ret <4 x float> %vecinsert2
}
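
; test15: two fadds (lanes 1 and 3) without any fsub.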
define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test15:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}
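
; test16: the test1 pattern, except that the fsub on lane 0 and the fadd
; on lane 1 take an undef operand.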
define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    subss %xmm4, %xmm3
; SSE-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, undef
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, undef
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
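
; Verify that the pattern is also recognized for <2 x float>, which is
; widened to <4 x float>.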
define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
; SSE-LABEL: test_v2f32:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v2 = extractelement <2 x float> %v0, i32 0
  %v3 = extractelement <2 x float> %v1, i32 0
  %v4 = extractelement <2 x float> %v0, i32 1
  %v5 = extractelement <2 x float> %v1, i32 1
  %sub = fsub float %v2, %v3
  %add = fadd float %v5, %v4
  %res0 = insertelement <2 x float> undef, float %sub, i32 0
  %res1 = insertelement <2 x float> %res0, float %add, i32 1
  ret <2 x float> %res1
}