; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2

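; fold (add x, 0) -> x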
define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) {
; CHECK-LABEL: combine_vec_add_to_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = add <4 x i32> %a, zeroinitializer
  ret <4 x i32> %1
}

; fold ((c1-A)+c2) -> (c1+c2)-A
define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_constant_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2,4,6]
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_constant_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6]
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %a
  %2 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %1
  ret <4 x i32> %2
}

; fold ((0-A) + B) -> B-A
define <4 x i32> @combine_vec_add_neg0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %a
  %2 = add <4 x i32> %1, %b
  ret <4 x i32> %2
}

; fold (A + (0-B)) -> A-B
define <4 x i32> @combine_vec_add_neg1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %b
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}

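; fold (A+(B-A)) -> B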
define <4 x i32> @combine_vec_add_sub0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub0:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}

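; fold ((B-A)+A) -> B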
define <4 x i32> @combine_vec_add_sub1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %a
  ret <4 x i32> %2
}

; fold ((A-B)+(C-A)) -> (C-B)
define <4 x i32> @combine_vec_add_sub_sub0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_sub0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> %c, %a
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold ((A-B)+(B-C)) -> (A-C)
define <4 x i32> @combine_vec_add_sub_sub1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_sub1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> %b, %c
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (A+(B-(A+C))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, %c
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+(B-(C+A))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %c, %a
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)+C)) to (B+C)
define <4 x i32> @combine_vec_add_sub_add2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)-C)) to (B-C)
define <4 x i32> @combine_vec_add_sub_add3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = sub <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
define <4 x i32> @combine_vec_add_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d) {
; SSE-LABEL: combine_vec_add_sub_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %d
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; Check for oneuse limit on fold
define void @PR52039(ptr %pa, ptr %pb) {
; SSE-LABEL: PR52039:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [10,10,10,10]
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psubd %xmm1, %xmm3
; SSE-NEXT:    psubd %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    movdqu %xmm3, 16(%rsi)
; SSE-NEXT:    movdqu %xmm2, (%rsi)
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR52039:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm0 = [10,10,10,10]
; AVX1-NEXT:    vpsubd 16(%rdi), %xmm0, %xmm1
; AVX1-NEXT:    vpsubd (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX1-NEXT:    vmovdqu %xmm3, 16(%rdi)
; AVX1-NEXT:    vmovdqu %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR52039:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [10,10,10,10,10,10,10,10]
; AVX2-NEXT:    vpsubd (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vmovdqu %ymm0, (%rsi)
; AVX2-NEXT:    vmovdqu %ymm1, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %load = load <8 x i32>, ptr %pa, align 4
  %sub = sub nsw <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>, %load
  %mul = mul nsw <8 x i32> %sub, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <8 x i32> %sub, ptr %pb, align 4
  store <8 x i32> %mul, ptr %pa, align 4
  ret void
}

; fold (a+b) -> (a|b) iff a and b share no bits.
define <4 x i32> @combine_vec_add_uniquebits(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_uniquebits:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_add_uniquebits:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_add_uniquebits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [61680,61680,61680,61680]
; AVX2-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855]
; AVX2-NEXT:    vandps %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = and <4 x i32> %a, <i32 61680, i32 61680, i32 61680, i32 61680>
  %2 = and <4 x i32> %b, <i32 3855, i32 3855, i32 3855, i32 3855>
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (add shl(0 - y, n), x) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %2, %x
  ret <4 x i32> %3
}

; (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
; and similar xforms where the inner op is either ~0 or 0.
define <4 x i32> @combine_vec_add_and_compare(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; SSE-LABEL: combine_vec_add_and_compare:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_and_compare:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = icmp eq <4 x i32> %a1, %a2
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = and <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  %4 = add <4 x i32> %a0, %3
  ret <4 x i32> %4
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sext(<4 x i1> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sext:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sext:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i1> %a0 to <4 x i32>
  %2 = add <4 x i32> %1, %a1
  ret <4 x i32> %2
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sextinreg(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sextinreg:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sextinreg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = ashr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %3 = add <4 x i32> %2, %a1
  ret <4 x i32> %3
}

; (add (add (xor a, -1), b), 1) -> (sub b, a)
define i32 @combine_add_add_not(i32 %a, i32 %b) {
; CHECK-LABEL: combine_add_add_not:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %esi, %eax
; CHECK-NEXT:    subl %edi, %eax
; CHECK-NEXT:    retq
  %nota = xor i32 %a, -1
  %add = add i32 %nota, %b
  %r = add i32 %add, 1
  ret i32 %r
}

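; vector variant of the fold above: (add (add (xor a, -1), b), 1) -> (sub b, a)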
define <4 x i32> @combine_vec_add_add_not(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_add_not:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_add_not:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %nota = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
  %add = add <4 x i32> %nota, %b
  %r = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %r
}

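; fold (add (add x, 32), (and (lshr z, 7), 1)) -> bt + adc, reusing the extracted bit as the carry-in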
define i32 @combine_add_adc_constant(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: combine_add_adc_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    btl $7, %edx
; CHECK-NEXT:    adcl $32, %eax
; CHECK-NEXT:    retq
  %and = lshr i32 %z, 7
  %bit = and i32 %and, 1
  %add = add i32 %x, 32
  %r = add i32 %add, %bit
  ret i32 %r
}

declare {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)

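; the i1 overflow flag must still come from the sadd itself; only the i32 sum is combined with the +1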
define i1 @sadd_add(i32 %a, i32 %b, ptr %p) {
; CHECK-LABEL: sadd_add:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    notl %edi
; CHECK-NEXT:    addl %esi, %edi
; CHECK-NEXT:    seto %al
; CHECK-NEXT:    leal 1(%rdi), %ecx
; CHECK-NEXT:    movl %ecx, (%rdx)
; CHECK-NEXT:    retq
  %nota = xor i32 %a, -1
  %a0 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %nota, i32 %b)
  %e0 = extractvalue {i32, i1} %a0, 0
  %e1 = extractvalue {i32, i1} %a0, 1
  %res = add i32 %e0, 1
  store i32 %res, ptr %p
  ret i1 %e1
}

declare {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)

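; same as above for unsigned overflow: setb reads the carry of the original addb, and the +1 is applied afterwards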
define i1 @uadd_add(i8 %a, i8 %b, ptr %p) {
; CHECK-LABEL: uadd_add:
; CHECK:       # %bb.0:
; CHECK-NEXT:    notb %dil
; CHECK-NEXT:    addb %sil, %dil
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    incb %dil
; CHECK-NEXT:    movb %dil, (%rdx)
; CHECK-NEXT:    retq
  %nota = xor i8 %a, -1
  %a0 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %nota, i8 %b)
  %e0 = extractvalue {i8, i1} %a0, 0
  %e1 = extractvalue {i8, i1} %a0, 1
  %res = add i8 %e0, 1
  store i8 %res, ptr %p
  ret i1 %e1
}

; This would crash because we tried to transform an add-with-overflow
; based on the wrong result value.

define i1 @PR51238(i1 %b, i8 %x, i8 %y, i8 %z) {
; CHECK-LABEL: PR51238:
; CHECK:       # %bb.0:
; CHECK-NEXT:    notb %cl
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    addb %dl, %cl
; CHECK-NEXT:    adcb $1, %al
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %ny = xor i8 %y, -1
  %nz = xor i8 %z, -1
  %minxz = select i1 %b, i8 %x, i8 %nz
  %cmpyz = icmp ult i8 %ny, %nz
  %r = add i1 %cmpyz, true
  ret i1 %r
}

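; fold (add x, (xor x, -1)) -> -1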
define <2 x i64> @add_vec_x_notx(<2 x i64> %v0) nounwind {
; SSE-LABEL: add_vec_x_notx:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_vec_x_notx:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x = xor <2 x i64> %v0, <i64 -1, i64 -1>
  %y = add <2 x i64> %v0, %x
  ret <2 x i64> %y
}

define <2 x i64> @add_vec_notx_x(<2 x i64> %v0) nounwind {
; SSE-LABEL: add_vec_notx_x:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: add_vec_notx_x:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x = xor <2 x i64> %v0, <i64 -1, i64 -1>
  %y = add <2 x i64> %x, %v0
  ret <2 x i64> %y
}

define i64 @add_x_notx(i64 %v0) nounwind {
; CHECK-LABEL: add_x_notx:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq $-1, %rax
; CHECK-NEXT:    retq
  %x = xor i64 %v0, -1
  %y = add i64 %v0, %x
  ret i64 %y
}

define i64 @add_notx_x(i64 %v0) nounwind {
; CHECK-LABEL: add_notx_x:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq $-1, %rax
; CHECK-NEXT:    retq
  %x = xor i64 %v0, -1
  %y = add i64 %x, %v0
  ret i64 %y
}