; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
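;
; These tests cover lowering of urem by constant <4 x i16> (and <4 x i64>)
; divisors. For a non-uniform divisor each lane is reduced with a fixed-point
; reciprocal; as a sketch of the identity the magic constants encode (not part
; of the generated checks), the 95 lane uses
;   x urem 95 == x - 95 * ((x * 44151) >> 22)   for any unsigned 16-bit x,
; since 44151 == ceil(2^22 / 95).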

define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_1:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $1, %xmm0, %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shrl $2, %ecx
; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
; SSE-NEXT: shrl $19, %ecx
; SSE-NEXT: imull $124, %ecx, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: movzwl %cx, %edx
; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
; SSE-NEXT: shrl $22, %edx
; SSE-NEXT: imull $95, %edx, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: movd %ecx, %xmm1
; SSE-NEXT: pinsrw $1, %eax, %xmm1
; SSE-NEXT: pextrw $2, %xmm0, %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shrl %ecx
; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
; SSE-NEXT: shrl $17, %ecx
; SSE-NEXT: imull $98, %ecx, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
; SSE-NEXT: shrl $16, %ecx
; SSE-NEXT: movl %eax, %edx
; SSE-NEXT: subl %ecx, %edx
; SSE-NEXT: movzwl %dx, %edx
; SSE-NEXT: shrl %edx
; SSE-NEXT: addl %ecx, %edx
; SSE-NEXT: shrl $9, %edx
; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_urem_vec_1:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: shrl $2, %ecx
; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
; AVX-NEXT: shrl $19, %ecx
; AVX-NEXT: imull $124, %ecx, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vmovd %xmm0, %ecx
; AVX-NEXT: movzwl %cx, %edx
; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
; AVX-NEXT: shrl $22, %edx
; AVX-NEXT: imull $95, %edx, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vmovd %ecx, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $2, %xmm0, %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: shrl %ecx
; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
; AVX-NEXT: shrl $17, %ecx
; AVX-NEXT: imull $98, %ecx, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: movl %eax, %edx
; AVX-NEXT: subl %ecx, %edx
; AVX-NEXT: movzwl %dx, %edx
; AVX-NEXT: shrl %edx
; AVX-NEXT: addl %ecx, %edx
; AVX-NEXT: shrl $9, %edx
; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
  ret <4 x i16> %1
}

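; With a uniform (splat) divisor the remainder stays fully vectorized:
; a pmulhuw/psrlw/pmullw/psubw sequence instead of per-lane scalar code.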
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
; SSE-NEXT: psrlw $6, %xmm1
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_urem_vec_2:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  ret <4 x i16> %1
}

; Don't fold if we can combine urem with udiv.
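; The multiply-high and shift that compute x udiv 95 are reused, and the
; remainder is rebuilt as x - 95 * (x udiv 95) before the two are added.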
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; SSE-LABEL: combine_urem_udiv:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
; SSE-NEXT: psrlw $6, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmullw %xmm1, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_urem_udiv:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %3 = add <4 x i16> %1, %2
  ret <4 x i16> %3
}

; Don't fold for divisors that are a power of two.
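; A power-of-two lane is just a mask, e.g. x urem 64 == x & 63 (the andl $63
; below); only the non-power-of-two 95 lane needs the multiply sequence.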
define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_power_of_two:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
; SSE-NEXT: shrl $22, %ecx
; SSE-NEXT: imull $95, %ecx, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pextrw $1, %xmm0, %ecx
; SSE-NEXT: andl $31, %ecx
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: andl $63, %edx
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: pinsrw $1, %ecx, %xmm1
; SSE-NEXT: pextrw $2, %xmm0, %ecx
; SSE-NEXT: andl $7, %ecx
; SSE-NEXT: pinsrw $2, %ecx, %xmm1
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: dont_fold_urem_power_of_two:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
; AVX-NEXT: shrl $22, %ecx
; AVX-NEXT: imull $95, %ecx, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpextrw $1, %xmm0, %ecx
; AVX-NEXT: andl $31, %ecx
; AVX-NEXT: vmovd %xmm0, %edx
; AVX-NEXT: andl $63, %edx
; AVX-NEXT: vmovd %edx, %xmm1
; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
; AVX-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-NEXT: andl $7, %ecx
; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
  ret <4 x i16> %1
}

; Don't fold if the divisor is one.
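; x urem 1 is always 0, so that lane is left as the zero produced by pxor/vpxor
; while the remaining lanes still go through the scalar magic-number code.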
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_one:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $2, %xmm0, %eax
; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
; SSE-NEXT: shrl $16, %ecx
; SSE-NEXT: movl %eax, %edx
; SSE-NEXT: subl %ecx, %edx
; SSE-NEXT: movzwl %dx, %edx
; SSE-NEXT: shrl %edx
; SSE-NEXT: addl %ecx, %edx
; SSE-NEXT: shrl $4, %edx
; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
; SSE-NEXT: shll $3, %ecx
; SSE-NEXT: subl %ecx, %edx
; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: pextrw $1, %xmm0, %eax
; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
; SSE-NEXT: shrl $25, %ecx
; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pinsrw $1, %eax, %xmm1
; SSE-NEXT: pinsrw $2, %edx, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
; SSE-NEXT: shrl $26, %ecx
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: dont_fold_urem_one:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $2, %xmm0, %eax
; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: movl %eax, %edx
; AVX-NEXT: subl %ecx, %edx
; AVX-NEXT: movzwl %dx, %edx
; AVX-NEXT: shrl %edx
; AVX-NEXT: addl %ecx, %edx
; AVX-NEXT: shrl $4, %edx
; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
; AVX-NEXT: shll $3, %ecx
; AVX-NEXT: subl %ecx, %edx
; AVX-NEXT: addl %eax, %edx
; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
; AVX-NEXT: shrl $25, %ecx
; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
; AVX-NEXT: shrl $26, %ecx
; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold if the divisor is 2^16.
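; 65536 does not fit in i16 and wraps to 0 when parsed, so one lane divides by
; zero and the whole result presumably folds to undef, leaving a bare return.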
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_urem_i16_smax:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = urem <4 x i16> %x, <i16 1, i16 65536, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold i64 urem.
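; There is no vector multiply-high for 64-bit elements on these subtargets, so
; each lane is extracted and reduced with scalar mulq against a 64-bit magic
; constant.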
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; SSE-LABEL: dont_fold_urem_i64:
; SSE: # %bb.0:
; SSE-NEXT: movq %xmm1, %rcx
; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: mulq %rdx
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: subq %rdx, %rax
; SSE-NEXT: shrq %rax
; SSE-NEXT: addq %rdx, %rax
; SSE-NEXT: shrq $4, %rax
; SSE-NEXT: leaq (%rax,%rax,2), %rdx
; SSE-NEXT: shlq $3, %rdx
; SSE-NEXT: subq %rdx, %rax
; SSE-NEXT: addq %rcx, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pextrq $1, %xmm1, %rcx
; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: mulq %rdx
; SSE-NEXT: shrq $12, %rdx
; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: pextrq $1, %xmm0, %rcx
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: shrq %rax
; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; SSE-NEXT: mulq %rdx
; SSE-NEXT: shrq $7, %rdx
; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: dont_fold_urem_i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rdx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $4, %rax
; AVX1-NEXT: leaq (%rax,%rax,2), %rdx
; AVX1-NEXT: shlq $3, %rdx
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: addq %rcx, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rdx
; AVX1-NEXT: shrq $12, %rdx
; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX1-NEXT: mulq %rdx
; AVX1-NEXT: shrq $7, %rdx
; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: dont_fold_urem_i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rdx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $4, %rax
; AVX2-NEXT: leaq (%rax,%rax,2), %rdx
; AVX2-NEXT: shlq $3, %rdx
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: addq %rcx, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rdx
; AVX2-NEXT: shrq $12, %rdx
; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX2-NEXT: mulq %rdx
; AVX2-NEXT: shrq $7, %rdx
; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
  ret <4 x i64> %1
}