; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
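
; fold (add x, 0) -> x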
define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_to_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_to_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, zeroinitializer
  ret <4 x i32> %1
}

; fold ((c1-A)+c2) -> (c1+c2)-A
define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_constant_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2,4,6]
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_constant_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6]
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %a
  %2 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %1
  ret <4 x i32> %2
}

; fold ((0-A) + B) -> B-A
define <4 x i32> @combine_vec_add_neg0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %a
  %2 = add <4 x i32> %1, %b
  ret <4 x i32> %2
}

; fold (A + (0-B)) -> A-B
define <4 x i32> @combine_vec_add_neg1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %b
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}
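
; fold (A+(B-A)) -> B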
define <4 x i32> @combine_vec_add_sub0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub0:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}
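
; fold ((B-A)+A) -> B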
define <4 x i32> @combine_vec_add_sub1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %a
  ret <4 x i32> %2
}

; fold ((A-B)+(C-A)) -> (C-B)
define <4 x i32> @combine_vec_add_sub_sub0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_sub0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> %c, %a
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold ((A-B)+(B-C)) -> (A-C)
define <4 x i32> @combine_vec_add_sub_sub1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_sub1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> %b, %c
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (A+(B-(A+C))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, %c
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+(B-(C+A))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %c, %a
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)+C)) to (B+C)
define <4 x i32> @combine_vec_add_sub_add2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)-C)) to (B-C)
define <4 x i32> @combine_vec_add_sub_add3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = sub <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
define <4 x i32> @combine_vec_add_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d) {
; SSE-LABEL: combine_vec_add_sub_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %d
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (a+b) -> (a|b) iff a and b share no bits.
define <4 x i32> @combine_vec_add_uniquebits(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_uniquebits:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_uniquebits:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [61680,61680,61680,61680]
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855]
; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %a, <i32 61680, i32 61680, i32 61680, i32 61680>
  %2 = and <4 x i32> %b, <i32 3855, i32 3855, i32 3855, i32 3855>
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (add shl(0 - y, n), x) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %2, %x
  ret <4 x i32> %3
}

; (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
; and similar xforms where the inner op is either ~0 or 0.
define <4 x i32> @combine_vec_add_and_compare(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; SSE-LABEL: combine_vec_add_and_compare:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_and_compare:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = icmp eq <4 x i32> %a1, %a2
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = and <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  %4 = add <4 x i32> %a0, %3
  ret <4 x i32> %4
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sext(<4 x i1> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sext:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sext:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i1> %a0 to <4 x i32>
  %2 = add <4 x i32> %1, %a1
  ret <4 x i32> %2
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sextinreg(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sextinreg:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sextinreg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = ashr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %3 = add <4 x i32> %2, %a1
  ret <4 x i32> %3
}

; (add (add (xor a, -1), b), 1) -> (sub b, a)
define i32 @combine_add_add_not(i32 %a, i32 %b) {
; SSE-LABEL: combine_add_add_not:
; SSE:       # %bb.0:
; SSE-NEXT:    movl %esi, %eax
; SSE-NEXT:    subl %edi, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_add_add_not:
; AVX:       # %bb.0:
; AVX-NEXT:    movl %esi, %eax
; AVX-NEXT:    subl %edi, %eax
; AVX-NEXT:    retq
  %nota = xor i32 %a, -1
  %add = add i32 %nota, %b
  %r = add i32 %add, 1
  ret i32 %r
}
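
; Vector version of the (add (add (xor a, -1), b), 1) -> (sub b, a) fold above.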
define <4 x i32> @combine_vec_add_add_not(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_add_not:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_add_not:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %nota = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
  %add = add <4 x i32> %nota, %b
  %r = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %r
}