; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512

; Verify that we correctly generate 'addsub' instructions from
; a sequence of vector extracts + float add/sub + vector inserts.
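
; The addsub instructions (addsubps/addsubpd and their AVX forms) subtract in
; the even lanes and add in the odd lanes:
;   result[2i]   = A[2i]   - B[2i]
;   result[2i+1] = A[2i+1] + B[2i+1]
; The positive tests below build exactly that lane pattern out of scalar ops.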

define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
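
; Tests 2 through 5 write only a subset of the lanes; the unwritten lanes are
; undef, so the even-sub/odd-add pattern still matches and a full addsub is
; still emitted.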

define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}

define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add = fadd float %4, %3
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
  ret <4 x float> %vecinsert2
}

define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
  ret <4 x float> %vecinsert2
}

define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
  ret <4 x float> %vecinsert2
}

define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
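
; 256-bit case: on SSE the <4 x double> is split into two 128-bit halves and
; each half gets its own addsubpd; on AVX a single 256-bit vaddsubpd suffices.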

define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm2, %xmm0
; SSE-NEXT:    addsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x double> %A, i32 0
  %2 = extractelement <4 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <4 x double> %A, i32 2
  %4 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <4 x double> %A, i32 1
  %6 = extractelement <4 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <4 x double> %A, i32 3
  %8 = extractelement <4 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
  ret <4 x double> %vecinsert4
}

define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %A, i32 0
  %2 = extractelement <2 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <2 x double> %A, i32 1
  %4 = extractelement <2 x double> %B, i32 1
  %add = fadd double %3, %4
  %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
  ret <2 x double> %vecinsert2
}
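
; Same splitting for <8 x float>: two addsubps on SSE, one 256-bit vaddsubps
; on AVX.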

define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm2, %xmm0
; SSE-NEXT:    addsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <8 x float> %A, i32 0
  %2 = extractelement <8 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <8 x float> %A, i32 2
  %4 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <8 x float> %A, i32 1
  %6 = extractelement <8 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <8 x float> %A, i32 3
  %8 = extractelement <8 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <8 x float> %A, i32 4
  %10 = extractelement <8 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <8 x float> %A, i32 6
  %12 = extractelement <8 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <8 x float> %A, i32 5
  %14 = extractelement <8 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <8 x float> %A, i32 7
  %16 = extractelement <8 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
  ret <8 x float> %vecinsert8
}

; Verify that we don't generate addsub instructions for the following
; functions.
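; test10 writes a single fsub result into even lane 0, so a scalar subss is
; cheaper than a full addsub.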

define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  ret <4 x float> %vecinsert1
}
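
; A lone fsub in lane 2 (test11) or a lone fadd in lane 1 or 3 (test12 and
; test13) is likewise not worth an addsub; a scalar op plus shuffles is kept.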

define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test11:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  ret <4 x float> %vecinsert1
}

define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  ret <4 x float> %vecinsert1
}

define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test13:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test13:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test13:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 3
  %2 = extractelement <4 x float> %B, i32 3
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
  ret <4 x float> %vecinsert1
}
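
; Two fsubs and no fadd (test14), or two fadds and no fsub (test15), do not
; form the alternating sub/add pattern, so no addsub is generated.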

define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
  ret <4 x float> %vecinsert2
}

define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test15:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test15:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm2, %xmm1
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}
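
; In test16 the extracted B elements for lanes 0 and 1 are replaced by the
; constant 42.0 (the extracts %2 and %6 are dead), so those lanes no longer
; compute A - B and A + B, and the addsub cannot be formed.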

define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test16:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT:    subss %xmm5, %xmm4
; SSE-NEXT:    movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT:    vsubss %xmm2, %xmm0, %xmm3
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm5, %xmm4, %xmm4
; AVX-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm5, %xmm2
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, 42.0
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, 42.0
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
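
; <2 x float> is widened to <4 x float> during type legalization; the undef
; upper lanes do not block the addsub.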

define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
; SSE-LABEL: test_v2f32:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v2 = extractelement <2 x float> %v0, i32 0
  %v3 = extractelement <2 x float> %v1, i32 0
  %v4 = extractelement <2 x float> %v0, i32 1
  %v5 = extractelement <2 x float> %v1, i32 1
  %sub = fsub float %v2, %v3
  %add = fadd float %v5, %v4
  %res0 = insertelement <2 x float> undef, float %sub, i32 0
  %res1 = insertelement <2 x float> %res0, float %add, i32 1
  ret <2 x float> %res1
}
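
; 512-bit vectors: AVX512F provides no addsub instruction, so the pattern is
; lowered as a full vsubps plus a vaddps that is merge-masked into the odd
; lanes (mask 0xAAAA = 0b1010101010101010 selects lanes 1,3,...,15).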

define <16 x float> @test17(<16 x float> %A, <16 x float> %B) {
; SSE-LABEL: test17:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm4, %xmm0
; SSE-NEXT:    addsubps %xmm5, %xmm1
; SSE-NEXT:    addsubps %xmm6, %xmm2
; SSE-NEXT:    addsubps %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddsubps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vaddsubps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test17:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubps %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; AVX512-NEXT:    vmovaps %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = extractelement <16 x float> %A, i32 0
  %2 = extractelement <16 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <16 x float> %A, i32 2
  %4 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <16 x float> %A, i32 1
  %6 = extractelement <16 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <16 x float> %A, i32 3
  %8 = extractelement <16 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <16 x float> %A, i32 4
  %10 = extractelement <16 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <16 x float> %A, i32 6
  %12 = extractelement <16 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <16 x float> %A, i32 5
  %14 = extractelement <16 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <16 x float> %A, i32 7
  %16 = extractelement <16 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %17 = extractelement <16 x float> %A, i32 8
  %18 = extractelement <16 x float> %B, i32 8
  %sub5 = fsub float %17, %18
  %19 = extractelement <16 x float> %A, i32 10
  %20 = extractelement <16 x float> %B, i32 10
  %sub6 = fsub float %19, %20
  %21 = extractelement <16 x float> %A, i32 9
  %22 = extractelement <16 x float> %B, i32 9
  %add5 = fadd float %21, %22
  %23 = extractelement <16 x float> %A, i32 11
  %24 = extractelement <16 x float> %B, i32 11
  %add6 = fadd float %23, %24
  %25 = extractelement <16 x float> %A, i32 12
  %26 = extractelement <16 x float> %B, i32 12
  %sub7 = fsub float %25, %26
  %27 = extractelement <16 x float> %A, i32 14
  %28 = extractelement <16 x float> %B, i32 14
  %sub8 = fsub float %27, %28
  %29 = extractelement <16 x float> %A, i32 13
  %30 = extractelement <16 x float> %B, i32 13
  %add7 = fadd float %29, %30
  %31 = extractelement <16 x float> %A, i32 15
  %32 = extractelement <16 x float> %B, i32 15
  %add8 = fadd float %31, %32
  %vecinsert1 = insertelement <16 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10
  %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13
  %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14
  ret <16 x float> %vecinsert16
}
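
; For <8 x double> the AVX512 lowering instead uses separate vaddpd and
; vsubpd, then a vshufpd that interleaves the even lanes of the sub with the
; odd lanes of the add.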

define <8 x double> @test18(<8 x double> %A, <8 x double> %B) {
; SSE-LABEL: test18:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm4, %xmm0
; SSE-NEXT:    addsubpd %xmm5, %xmm1
; SSE-NEXT:    addsubpd %xmm6, %xmm2
; SSE-NEXT:    addsubpd %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test18:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddsubpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vaddsubpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test18:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vsubpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm2[1],zmm0[2],zmm2[3],zmm0[4],zmm2[5],zmm0[6],zmm2[7]
; AVX512-NEXT:    retq
  %1 = extractelement <8 x double> %A, i32 0
  %2 = extractelement <8 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <8 x double> %A, i32 2
  %4 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <8 x double> %A, i32 1
  %6 = extractelement <8 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <8 x double> %A, i32 3
  %8 = extractelement <8 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %9 = extractelement <8 x double> %A, i32 4
  %10 = extractelement <8 x double> %B, i32 4
  %sub3 = fsub double %9, %10
  %11 = extractelement <8 x double> %A, i32 6
  %12 = extractelement <8 x double> %B, i32 6
  %sub4 = fsub double %11, %12
  %13 = extractelement <8 x double> %A, i32 5
  %14 = extractelement <8 x double> %B, i32 5
  %add3 = fadd double %13, %14
  %15 = extractelement <8 x double> %A, i32 7
  %16 = extractelement <8 x double> %B, i32 7
  %add4 = fadd double %15, %16
  %vecinsert1 = insertelement <8 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5
  %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7
  %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6
  ret <8 x double> %vecinsert8
}