1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
6 ; Verify that we correctly generate 'addsub' instructions from
7 ; a sequence of vector extracts + float add/sub + vector inserts.
; test1: full <4 x float> pattern. Lanes 0 and 2 are A-B (fsub), lanes 1 and 3
; are A+B (fadd), and all four results are inserted back into one vector.
; The recorded assertions expect a single addsubps / vaddsubps.
9 define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
12 ; SSE-NEXT: addsubps %xmm1, %xmm0
17 ; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
19 %1 = extractelement <4 x float> %A, i32 0
20 %2 = extractelement <4 x float> %B, i32 0
21 %sub = fsub float %1, %2
22 %3 = extractelement <4 x float> %A, i32 2
23 %4 = extractelement <4 x float> %B, i32 2
24 %sub2 = fsub float %3, %4
25 %5 = extractelement <4 x float> %A, i32 1
26 %6 = extractelement <4 x float> %B, i32 1
27 %add = fadd float %5, %6
28 %7 = extractelement <4 x float> %A, i32 3
29 %8 = extractelement <4 x float> %B, i32 3
30 %add2 = fadd float %7, %8
31 %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
32 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
33 %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
34 %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
35 ret <4 x float> %vecinsert4
; test2: partial pattern. Only lane 2 (fsub) and lane 3 (fadd) are written;
; lanes 0 and 1 stay undef. The recorded assertions still expect one
; addsubps / vaddsubps.
38 define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
41 ; SSE-NEXT: addsubps %xmm1, %xmm0
46 ; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
48 %1 = extractelement <4 x float> %A, i32 2
49 %2 = extractelement <4 x float> %B, i32 2
50 %sub2 = fsub float %1, %2
51 %3 = extractelement <4 x float> %A, i32 3
52 %4 = extractelement <4 x float> %B, i32 3
53 %add2 = fadd float %3, %4
54 %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
55 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
56 ret <4 x float> %vecinsert2
; test3: partial pattern. Lane 0 is an fsub and lane 3 is an fadd; lanes 1 and 2
; stay undef. Note the fadd takes its operands in (B, A) order, unlike the
; other tests. The recorded assertions expect one addsubps / vaddsubps.
59 define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
62 ; SSE-NEXT: addsubps %xmm1, %xmm0
67 ; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
69 %1 = extractelement <4 x float> %A, i32 0
70 %2 = extractelement <4 x float> %B, i32 0
71 %sub = fsub float %1, %2
72 %3 = extractelement <4 x float> %A, i32 3
73 %4 = extractelement <4 x float> %B, i32 3
74 %add = fadd float %4, %3
75 %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
76 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
77 ret <4 x float> %vecinsert2
; test4: partial pattern. Lane 2 is an fsub and lane 1 is an fadd; lanes 0 and 3
; stay undef. The recorded assertions expect one addsubps / vaddsubps.
80 define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
83 ; SSE-NEXT: addsubps %xmm1, %xmm0
88 ; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
90 %1 = extractelement <4 x float> %A, i32 2
91 %2 = extractelement <4 x float> %B, i32 2
92 %sub = fsub float %1, %2
93 %3 = extractelement <4 x float> %A, i32 1
94 %4 = extractelement <4 x float> %B, i32 1
95 %add = fadd float %3, %4
96 %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
97 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
98 ret <4 x float> %vecinsert2
; test5: partial pattern. Lane 0 is an fsub and lane 1 is an fadd; lanes 2 and 3
; stay undef. The recorded assertions expect one addsubps / vaddsubps.
101 define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
104 ; SSE-NEXT: addsubps %xmm1, %xmm0
109 ; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
111 %1 = extractelement <4 x float> %A, i32 0
112 %2 = extractelement <4 x float> %B, i32 0
113 %sub2 = fsub float %1, %2
114 %3 = extractelement <4 x float> %A, i32 1
115 %4 = extractelement <4 x float> %B, i32 1
116 %add2 = fadd float %3, %4
117 %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
118 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
119 ret <4 x float> %vecinsert2
; test6: full <4 x float> pattern (fsub on lanes 0 and 2, fadd on lanes 1 and 3).
; NOTE(review): the visible IR body is the same as @test1 apart from the
; function name; confirm whether a distinct variation was intended.
122 define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
125 ; SSE-NEXT: addsubps %xmm1, %xmm0
130 ; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
132 %1 = extractelement <4 x float> %A, i32 0
133 %2 = extractelement <4 x float> %B, i32 0
134 %sub = fsub float %1, %2
135 %3 = extractelement <4 x float> %A, i32 2
136 %4 = extractelement <4 x float> %B, i32 2
137 %sub2 = fsub float %3, %4
138 %5 = extractelement <4 x float> %A, i32 1
139 %6 = extractelement <4 x float> %B, i32 1
140 %add = fadd float %5, %6
141 %7 = extractelement <4 x float> %A, i32 3
142 %8 = extractelement <4 x float> %B, i32 3
143 %add2 = fadd float %7, %8
144 %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
145 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
146 %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
147 %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
148 ret <4 x float> %vecinsert4
; test7: full <4 x double> pattern (fsub on lanes 0 and 2, fadd on lanes 1 and 3).
; The recorded assertions expect two xmm addsubpd for the sse3 run and a single
; ymm vaddsubpd for the avx runs.
151 define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
154 ; SSE-NEXT: addsubpd %xmm2, %xmm0
155 ; SSE-NEXT: addsubpd %xmm3, %xmm1
160 ; AVX-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
162 %1 = extractelement <4 x double> %A, i32 0
163 %2 = extractelement <4 x double> %B, i32 0
164 %sub = fsub double %1, %2
165 %3 = extractelement <4 x double> %A, i32 2
166 %4 = extractelement <4 x double> %B, i32 2
167 %sub2 = fsub double %3, %4
168 %5 = extractelement <4 x double> %A, i32 1
169 %6 = extractelement <4 x double> %B, i32 1
170 %add = fadd double %5, %6
171 %7 = extractelement <4 x double> %A, i32 3
172 %8 = extractelement <4 x double> %B, i32 3
173 %add2 = fadd double %7, %8
174 %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
175 %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
176 %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
177 %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
178 ret <4 x double> %vecinsert4
; test8: full <2 x double> pattern. Lane 0 is an fsub and lane 1 is an fadd.
; The recorded assertions expect one addsubpd / vaddsubpd.
181 define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
184 ; SSE-NEXT: addsubpd %xmm1, %xmm0
189 ; AVX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
191 %1 = extractelement <2 x double> %A, i32 0
192 %2 = extractelement <2 x double> %B, i32 0
193 %sub = fsub double %1, %2
194 %3 = extractelement <2 x double> %A, i32 1
195 %4 = extractelement <2 x double> %B, i32 1
196 %add = fadd double %3, %4
197 %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
198 %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
199 ret <2 x double> %vecinsert2
; test9: full <8 x float> pattern. Even lanes (0,2,4,6) are fsub and odd lanes
; (1,3,5,7) are fadd. The recorded assertions expect two xmm addsubps for the
; sse3 run and a single ymm vaddsubps for the avx runs.
202 define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
205 ; SSE-NEXT: addsubps %xmm2, %xmm0
206 ; SSE-NEXT: addsubps %xmm3, %xmm1
211 ; AVX-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
213 %1 = extractelement <8 x float> %A, i32 0
214 %2 = extractelement <8 x float> %B, i32 0
215 %sub = fsub float %1, %2
216 %3 = extractelement <8 x float> %A, i32 2
217 %4 = extractelement <8 x float> %B, i32 2
218 %sub2 = fsub float %3, %4
219 %5 = extractelement <8 x float> %A, i32 1
220 %6 = extractelement <8 x float> %B, i32 1
221 %add = fadd float %5, %6
222 %7 = extractelement <8 x float> %A, i32 3
223 %8 = extractelement <8 x float> %B, i32 3
224 %add2 = fadd float %7, %8
225 %9 = extractelement <8 x float> %A, i32 4
226 %10 = extractelement <8 x float> %B, i32 4
227 %sub3 = fsub float %9, %10
228 %11 = extractelement <8 x float> %A, i32 6
229 %12 = extractelement <8 x float> %B, i32 6
230 %sub4 = fsub float %11, %12
231 %13 = extractelement <8 x float> %A, i32 5
232 %14 = extractelement <8 x float> %B, i32 5
233 %add3 = fadd float %13, %14
234 %15 = extractelement <8 x float> %A, i32 7
235 %16 = extractelement <8 x float> %B, i32 7
236 %add4 = fadd float %15, %16
237 %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
238 %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
239 %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
240 %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
241 %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
242 %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
243 %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
244 %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
245 ret <8 x float> %vecinsert8
248 ; Verify that we don't generate addsub instructions for the following
; test10: negative case. Only lane 0 is written, with an fsub; there is no fadd.
; The recorded assertions expect a scalar subss / vsubss, not an addsub.
251 define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
254 ; SSE-NEXT: subss %xmm1, %xmm0
259 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
261 %1 = extractelement <4 x float> %A, i32 0
262 %2 = extractelement <4 x float> %B, i32 0
263 %sub = fsub float %1, %2
264 %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
265 ret <4 x float> %vecinsert1
; test11: negative case. Only lane 2 is written, with an fsub; there is no fadd.
; The recorded assertions expect shuffles around a scalar subss / vsubss
; (with a vbroadcastss in the avx512f run), not an addsub.
268 define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
271 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
272 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
273 ; SSE-NEXT: subss %xmm1, %xmm0
274 ; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
277 ; AVX1-LABEL: test11:
279 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
280 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
281 ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
282 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
285 ; AVX512-LABEL: test11:
287 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
288 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
289 ; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
290 ; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
292 %1 = extractelement <4 x float> %A, i32 2
293 %2 = extractelement <4 x float> %B, i32 2
294 %sub = fsub float %1, %2
295 %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
296 ret <4 x float> %vecinsert1
; test12: negative case. Only lane 1 is written, with an fadd; there is no fsub.
; The recorded assertions expect shuffles around a scalar addss / vaddss,
; not an addsub.
299 define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
302 ; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
303 ; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
304 ; SSE-NEXT: addss %xmm0, %xmm1
305 ; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
308 ; AVX1-LABEL: test12:
310 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
311 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
312 ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
313 ; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
316 ; AVX512-LABEL: test12:
318 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
319 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
320 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
321 ; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
323 %1 = extractelement <4 x float> %A, i32 1
324 %2 = extractelement <4 x float> %B, i32 1
325 %add = fadd float %1, %2
326 %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
327 ret <4 x float> %vecinsert1
; test13: negative case. Only lane 3 is written, with an fadd; there is no fsub.
; The recorded assertions expect shuffles around a scalar addss / vaddss,
; not an addsub.
330 define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
333 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
334 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
335 ; SSE-NEXT: addss %xmm0, %xmm1
336 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
337 ; SSE-NEXT: movaps %xmm1, %xmm0
340 ; AVX1-LABEL: test13:
342 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
343 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
344 ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
345 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
348 ; AVX512-LABEL: test13:
350 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
351 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
352 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
353 ; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
355 %1 = extractelement <4 x float> %A, i32 3
356 %2 = extractelement <4 x float> %B, i32 3
357 %add = fadd float %1, %2
358 %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
359 ret <4 x float> %vecinsert1
; test14: negative case. Lanes 0 and 2 are both fsub and no lane is an fadd,
; so there is no add half of the pattern. The recorded assertions expect two
; scalar subss / vsubss plus a merge, not an addsub.
362 define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
365 ; SSE-NEXT: movaps %xmm0, %xmm2
366 ; SSE-NEXT: subss %xmm1, %xmm2
367 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
368 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
369 ; SSE-NEXT: subss %xmm1, %xmm0
370 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
371 ; SSE-NEXT: movaps %xmm2, %xmm0
376 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm2
377 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
378 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
379 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
380 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
382 %1 = extractelement <4 x float> %A, i32 0
383 %2 = extractelement <4 x float> %B, i32 0
384 %sub = fsub float %1, %2
385 %3 = extractelement <4 x float> %A, i32 2
386 %4 = extractelement <4 x float> %B, i32 2
387 %sub2 = fsub float %3, %4
388 %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
389 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
390 ret <4 x float> %vecinsert2
; test15: negative case. Lanes 1 and 3 are both fadd and no lane is an fsub,
; so there is no sub half of the pattern. The recorded assertions expect two
; scalar addss / vaddss plus a merge, not an addsub.
393 define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
396 ; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
397 ; SSE-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
398 ; SSE-NEXT: addss %xmm3, %xmm2
399 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
400 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
401 ; SSE-NEXT: addss %xmm0, %xmm1
402 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
403 ; SSE-NEXT: movaps %xmm2, %xmm0
406 ; AVX1-LABEL: test15:
408 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
409 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
410 ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2
411 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
412 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
413 ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
414 ; AVX1-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
415 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
418 ; AVX512-LABEL: test15:
420 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
421 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
422 ; AVX512-NEXT: vaddss %xmm3, %xmm2, %xmm2
423 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
424 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
425 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
426 ; AVX512-NEXT: vbroadcastss %xmm2, %xmm1
427 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
429 %1 = extractelement <4 x float> %A, i32 1
430 %2 = extractelement <4 x float> %B, i32 1
431 %add = fadd float %1, %2
432 %3 = extractelement <4 x float> %A, i32 3
433 %4 = extractelement <4 x float> %B, i32 3
434 %add2 = fadd float %3, %4
435 %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
436 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
437 ret <4 x float> %vecinsert2
; test16: negative case. The lane layout matches the full pattern (fsub on
; lanes 0 and 2, fadd on lanes 1 and 3), but lanes 0 and 1 use the constant
; 42.0 as the second operand instead of the element of %B. %2 and %6 are
; extracted and left unused. The recorded assertions expect scalar
; subss/addss sequences, not an addsub.
440 define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
443 ; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
444 ; SSE-NEXT: movaps %xmm0, %xmm2
445 ; SSE-NEXT: subss %xmm3, %xmm2
446 ; SSE-NEXT: movaps %xmm0, %xmm4
447 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
448 ; SSE-NEXT: movaps %xmm1, %xmm5
449 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
450 ; SSE-NEXT: subss %xmm5, %xmm4
451 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
452 ; SSE-NEXT: addss %xmm3, %xmm5
453 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
454 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
455 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
456 ; SSE-NEXT: addss %xmm0, %xmm1
457 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
458 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
459 ; SSE-NEXT: movaps %xmm2, %xmm0
464 ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
465 ; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm3
466 ; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
467 ; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
468 ; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4
469 ; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
470 ; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2
471 ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
472 ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
473 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
474 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
475 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
476 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
478 %1 = extractelement <4 x float> %A, i32 0
479 %2 = extractelement <4 x float> %B, i32 0
480 %sub = fsub float %1, 42.0
481 %3 = extractelement <4 x float> %A, i32 2
482 %4 = extractelement <4 x float> %B, i32 2
483 %sub2 = fsub float %3, %4
484 %5 = extractelement <4 x float> %A, i32 1
485 %6 = extractelement <4 x float> %B, i32 1
486 %add = fadd float %5, 42.0
487 %7 = extractelement <4 x float> %A, i32 3
488 %8 = extractelement <4 x float> %B, i32 3
489 %add2 = fadd float %7, %8
490 %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
491 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
492 %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
493 %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
494 ret <4 x float> %vecinsert4
; test_v2f32: <2 x float> pattern. Lane 0 is an fsub (v0 - v1) and lane 1 is
; an fadd with its operands in (v1, v0) order. The recorded assertions expect
; one xmm addsubps / vaddsubps.
497 define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
498 ; SSE-LABEL: test_v2f32:
500 ; SSE-NEXT: addsubps %xmm1, %xmm0
503 ; AVX-LABEL: test_v2f32:
505 ; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
507 %v2 = extractelement <2 x float> %v0, i32 0
508 %v3 = extractelement <2 x float> %v1, i32 0
509 %v4 = extractelement <2 x float> %v0, i32 1
510 %v5 = extractelement <2 x float> %v1, i32 1
511 %sub = fsub float %v2, %v3
512 %add = fadd float %v5, %v4
513 %res0 = insertelement <2 x float> undef, float %sub, i32 0
514 %res1 = insertelement <2 x float> %res0, float %add, i32 1
515 ret <2 x float> %res1
; test17: <16 x float> variant. Lanes 0-7 follow the usual pattern (fsub on
; even lanes, fadd on odd lanes).
;
; NOTE(review): %17 through %32 extract lanes 8-15 but are never used. The
; values inserted into lanes 8-15 (%sub5..%sub8, %add5..%add8) are computed
; from %1..%16, i.e. from lanes 0-7 again, so the upper eight result lanes
; repeat the lower eight. The recorded assertions match this IR: the sse3 run
; copies xmm0/xmm1 into xmm2/xmm3, the avx run copies ymm0 into ymm1, and the
; avx512f run ends by inserting ymm0 into the upper half of zmm0. This looks
; like a copy-paste slip rather than intent; confirm before changing, and
; regenerate the assertions with utils/update_llc_test_checks.py if the
; operands are switched to %17..%32.
518 define <16 x float> @test17(<16 x float> %A, <16 x float> %B) {
521 ; SSE-NEXT: addsubps %xmm4, %xmm0
522 ; SSE-NEXT: addsubps %xmm5, %xmm1
523 ; SSE-NEXT: movaps %xmm0, %xmm2
524 ; SSE-NEXT: movaps %xmm1, %xmm3
527 ; AVX1-LABEL: test17:
529 ; AVX1-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
530 ; AVX1-NEXT: vmovaps %ymm0, %ymm1
533 ; AVX512-LABEL: test17:
535 ; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm2
536 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
537 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
538 ; AVX512-NEXT: vsubss %xmm4, %xmm3, %xmm3
539 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
540 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
541 ; AVX512-NEXT: vaddss %xmm5, %xmm4, %xmm4
542 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
543 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
544 ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
545 ; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[3,3,3,3]
546 ; AVX512-NEXT: vaddss %xmm4, %xmm3, %xmm3
547 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
548 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
549 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
550 ; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm3
551 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
552 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
553 ; AVX512-NEXT: vsubss %xmm5, %xmm4, %xmm4
554 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
555 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
556 ; AVX512-NEXT: vaddss %xmm6, %xmm5, %xmm5
557 ; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
558 ; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
559 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
560 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
561 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
562 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
563 ; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
564 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
566 %1 = extractelement <16 x float> %A, i32 0
567 %2 = extractelement <16 x float> %B, i32 0
568 %sub = fsub float %1, %2
569 %3 = extractelement <16 x float> %A, i32 2
570 %4 = extractelement <16 x float> %B, i32 2
571 %sub2 = fsub float %3, %4
572 %5 = extractelement <16 x float> %A, i32 1
573 %6 = extractelement <16 x float> %B, i32 1
574 %add = fadd float %5, %6
575 %7 = extractelement <16 x float> %A, i32 3
576 %8 = extractelement <16 x float> %B, i32 3
577 %add2 = fadd float %7, %8
578 %9 = extractelement <16 x float> %A, i32 4
579 %10 = extractelement <16 x float> %B, i32 4
580 %sub3 = fsub float %9, %10
581 %11 = extractelement <16 x float> %A, i32 6
582 %12 = extractelement <16 x float> %B, i32 6
583 %sub4 = fsub float %11, %12
584 %13 = extractelement <16 x float> %A, i32 5
585 %14 = extractelement <16 x float> %B, i32 5
586 %add3 = fadd float %13, %14
587 %15 = extractelement <16 x float> %A, i32 7
588 %16 = extractelement <16 x float> %B, i32 7
589 %add4 = fadd float %15, %16
590 %17 = extractelement <16 x float> %A, i32 8
591 %18 = extractelement <16 x float> %B, i32 8
592 %sub5 = fsub float %1, %2
593 %19 = extractelement <16 x float> %A, i32 10
594 %20 = extractelement <16 x float> %B, i32 10
595 %sub6 = fsub float %3, %4
596 %21 = extractelement <16 x float> %A, i32 9
597 %22 = extractelement <16 x float> %B, i32 9
598 %add5 = fadd float %5, %6
599 %23 = extractelement <16 x float> %A, i32 11
600 %24 = extractelement <16 x float> %B, i32 11
601 %add6 = fadd float %7, %8
602 %25 = extractelement <16 x float> %A, i32 12
603 %26 = extractelement <16 x float> %B, i32 12
604 %sub7 = fsub float %9, %10
605 %27 = extractelement <16 x float> %A, i32 14
606 %28 = extractelement <16 x float> %B, i32 14
607 %sub8 = fsub float %11, %12
608 %29 = extractelement <16 x float> %A, i32 13
609 %30 = extractelement <16 x float> %B, i32 13
610 %add7 = fadd float %13, %14
611 %31 = extractelement <16 x float> %A, i32 15
612 %32 = extractelement <16 x float> %B, i32 15
613 %add8 = fadd float %15, %16
614 %vecinsert1 = insertelement <16 x float> undef, float %add, i32 1
615 %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3
616 %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0
617 %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2
618 %vecinsert5 = insertelement <16 x float> %vecinsert4, float %add3, i32 5
619 %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7
620 %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4
621 %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6
622 %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9
623 %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11
624 %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8
625 %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10
626 %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13
627 %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15
628 %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12
629 %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14
630 ret <16 x float> %vecinsert16
; test18: full <8 x double> pattern. Even lanes (0,2,4,6) are fsub and odd
; lanes (1,3,5,7) are fadd. The recorded assertions expect four xmm addsubpd
; for the sse3 run and two ymm vaddsubpd for the avx run. The avx512f run is
; recorded as a scalar vsubsd/vaddsd sequence rebuilt with inserts, with no
; addsub instruction.
633 define <8 x double> @test18(<8 x double> %A, <8 x double> %B) {
636 ; SSE-NEXT: addsubpd %xmm4, %xmm0
637 ; SSE-NEXT: addsubpd %xmm5, %xmm1
638 ; SSE-NEXT: addsubpd %xmm6, %xmm2
639 ; SSE-NEXT: addsubpd %xmm7, %xmm3
642 ; AVX1-LABEL: test18:
644 ; AVX1-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
645 ; AVX1-NEXT: vaddsubpd %ymm3, %ymm1, %ymm1
648 ; AVX512-LABEL: test18:
650 ; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm2
651 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
652 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4
653 ; AVX512-NEXT: vsubsd %xmm4, %xmm3, %xmm5
654 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
655 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm1[1,0]
656 ; AVX512-NEXT: vaddsd %xmm7, %xmm6, %xmm6
657 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm6[0]
658 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
659 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
660 ; AVX512-NEXT: vaddsd %xmm4, %xmm3, %xmm3
661 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm5[0],xmm3[0]
662 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm4
663 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5
664 ; AVX512-NEXT: vsubsd %xmm5, %xmm4, %xmm6
665 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
666 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
667 ; AVX512-NEXT: vsubsd %xmm1, %xmm0, %xmm7
668 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
669 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
670 ; AVX512-NEXT: vaddsd %xmm5, %xmm4, %xmm4
671 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm6[0],xmm4[0]
672 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
673 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
674 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
675 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm7[0],xmm0[0]
676 ; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
677 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
678 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
680 %1 = extractelement <8 x double> %A, i32 0
681 %2 = extractelement <8 x double> %B, i32 0
682 %sub = fsub double %1, %2
683 %3 = extractelement <8 x double> %A, i32 2
684 %4 = extractelement <8 x double> %B, i32 2
685 %sub2 = fsub double %3, %4
686 %5 = extractelement <8 x double> %A, i32 1
687 %6 = extractelement <8 x double> %B, i32 1
688 %add = fadd double %5, %6
689 %7 = extractelement <8 x double> %A, i32 3
690 %8 = extractelement <8 x double> %B, i32 3
691 %add2 = fadd double %7, %8
692 %9 = extractelement <8 x double> %A, i32 4
693 %10 = extractelement <8 x double> %B, i32 4
694 %sub3 = fsub double %9, %10
695 %11 = extractelement <8 x double> %A, i32 6
696 %12 = extractelement <8 x double> %B, i32 6
697 %sub4 = fsub double %11, %12
698 %13 = extractelement <8 x double> %A, i32 5
699 %14 = extractelement <8 x double> %B, i32 5
700 %add3 = fadd double %13, %14
701 %15 = extractelement <8 x double> %A, i32 7
702 %16 = extractelement <8 x double> %B, i32 7
703 %add4 = fadd double %15, %16
704 %vecinsert1 = insertelement <8 x double> undef, double %add, i32 1
705 %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3
706 %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0
707 %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2
708 %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5
709 %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7
710 %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4
711 %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6
712 ret <8 x double> %vecinsert8