; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

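; The functions below check how llc lowers srem by constant <4 x i16> (and,
; at the end, <4 x i64>) vectors: rather than emitting a real division, the
; quotient is formed with a multiply-high "magic constant" plus shift and
; sign fixups, and the remainder is then recovered as x - (x / d) * d.  When
; the divisors differ per lane, as in fold_srem_vec_1 just below, the vector
; is scalarized: each lane is extracted with pextrw/movd, run through its own
; magic-constant sequence, and reinserted with pinsrw.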
define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_srem_vec_1:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: movswl %ax, %ecx
; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
; SSE-NEXT: shrl $16, %ecx
; SSE-NEXT: subl %eax, %ecx
; SSE-NEXT: movzwl %cx, %ecx
; SSE-NEXT: movswl %cx, %edx
; SSE-NEXT: shrl $15, %ecx
; SSE-NEXT: sarl $9, %edx
; SSE-NEXT: addl %ecx, %edx
; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: movswl %cx, %edx
; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
; SSE-NEXT: shrl $16, %edx
; SSE-NEXT: addl %ecx, %edx
; SSE-NEXT: movzwl %dx, %edx
; SSE-NEXT: movswl %dx, %esi
; SSE-NEXT: shrl $15, %edx
; SSE-NEXT: sarl $6, %esi
; SSE-NEXT: addl %edx, %esi
; SSE-NEXT: imull $95, %esi, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: movd %ecx, %xmm1
; SSE-NEXT: pextrw $1, %xmm0, %ecx
; SSE-NEXT: movswl %cx, %edx
; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
; SSE-NEXT: movl %edx, %esi
; SSE-NEXT: shrl $31, %esi
; SSE-NEXT: sarl $21, %edx
; SSE-NEXT: addl %esi, %edx
; SSE-NEXT: imull $-124, %edx, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: pinsrw $1, %ecx, %xmm1
; SSE-NEXT: pextrw $2, %xmm0, %ecx
; SSE-NEXT: movswl %cx, %edx
; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73
; SSE-NEXT: movl %edx, %esi
; SSE-NEXT: shrl $31, %esi
; SSE-NEXT: sarl $18, %edx
; SSE-NEXT: addl %esi, %edx
; SSE-NEXT: imull $98, %edx, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: pinsrw $2, %ecx, %xmm1
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_srem_vec_1:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: subl %eax, %ecx
; AVX-NEXT: movzwl %cx, %ecx
; AVX-NEXT: movswl %cx, %edx
; AVX-NEXT: shrl $15, %ecx
; AVX-NEXT: sarl $9, %edx
; AVX-NEXT: addl %ecx, %edx
; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vmovd %xmm0, %ecx
; AVX-NEXT: movswl %cx, %edx
; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
; AVX-NEXT: shrl $16, %edx
; AVX-NEXT: addl %ecx, %edx
; AVX-NEXT: movzwl %dx, %edx
; AVX-NEXT: movswl %dx, %esi
; AVX-NEXT: shrl $15, %edx
; AVX-NEXT: sarl $6, %esi
; AVX-NEXT: addl %edx, %esi
; AVX-NEXT: imull $95, %esi, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vmovd %ecx, %xmm1
; AVX-NEXT: vpextrw $1, %xmm0, %ecx
; AVX-NEXT: movswl %cx, %edx
; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
; AVX-NEXT: movl %edx, %esi
; AVX-NEXT: shrl $31, %esi
; AVX-NEXT: sarl $21, %edx
; AVX-NEXT: addl %esi, %edx
; AVX-NEXT: imull $-124, %edx, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
; AVX-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-NEXT: movswl %cx, %edx
; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73
; AVX-NEXT: movl %edx, %esi
; AVX-NEXT: shrl $31, %esi
; AVX-NEXT: sarl $18, %edx
; AVX-NEXT: addl %esi, %edx
; AVX-NEXT: imull $98, %edx, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
  ret <4 x i16> %1
}

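; With the single divisor 95 in every lane, the lowering below stays in
; vector form: pmulhw takes the signed high half of x * 44151 (44151 is
; -21385 when read as a signed i16, the magic multiplier for 95), paddw adds
; the dividend back because that multiplier is negative, psrlw $15 and
; psraw $6 produce the rounded quotient, and pmullw by 95 followed by psubw
; turns the quotient into the remainder.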
define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_srem_vec_2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT: pmulhw %xmm0, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $6, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_srem_vec_2:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  ret <4 x i16> %1
}

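; A minimal scalar sketch of the same x srem 95 expansion, written out in IR
; with the constants visible in the checks above.  The function name is made
; up for illustration; it is not referenced by any RUN line and carries no
; FileCheck assertions.
define i16 @srem95_scalar_sketch(i16 %x) {
  %xs = sext i16 %x to i32
  %prod = mul i32 %xs, -21385          ; full 32-bit product with the magic constant
  %hi32 = ashr i32 %prod, 16           ; keep the high 16 bits of the product
  %hi = trunc i32 %hi32 to i16
  %q0 = add i16 %hi, %x                ; multiplier is negative, so add the dividend back
  %sgn = lshr i16 %q0, 15              ; 1 when q0 is negative, 0 otherwise
  %qsh = ashr i16 %q0, 6               ; shift for the divisor 95
  %q = add i16 %qsh, %sgn              ; round the quotient toward zero
  %qd = mul i16 %q, 95
  %r = sub i16 %x, %qd                 ; remainder = x - (x / 95) * 95
  ret i16 %r
}
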
; Don't fold if we can combine srem with sdiv.
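; Since the sdiv and the srem below share the divisor, the checks compute the
; quotient q = x / 95 only once with the pmulhw sequence, form the remainder
; as x - q * 95 via pmullw/psubw, and then the trailing paddw/vpaddw adds q
; back in for the add of the two results; no second division sequence is
; emitted.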
define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; SSE-LABEL: combine_srem_sdiv:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT: pmulhw %xmm0, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $6, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmullw %xmm1, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_srem_sdiv:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %3 = add <4 x i16> %1, %2
  ret <4 x i16> %3
}

; Don't fold for divisors that are a power of two.
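; For a power-of-two divisor d, the signed remainder of each lane is computed
; as x - ((x < 0 ? x + d - 1 : x) & -d): the conditional add (leal/testw/
; cmovnsl) biases negative inputs so the mask rounds the quotient toward
; zero.  For example, with d = 8 and x = -13: -13 + 7 = -6, -6 & -8 = -8, and
; -13 - (-8) = -5 = srem(-13, 8).  The last lane (divisor 95) still uses the
; magic-constant sequence.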
define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; SSE-LABEL: dont_fold_srem_power_of_two:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pextrw $1, %xmm0, %eax
; SSE-NEXT: leal 31(%rax), %ecx
; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: cmovnsl %eax, %ecx
; SSE-NEXT: andl $-32, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: movd %xmm0, %ecx
; SSE-NEXT: leal 63(%rcx), %edx
; SSE-NEXT: testw %cx, %cx
; SSE-NEXT: cmovnsl %ecx, %edx
; SSE-NEXT: andl $-64, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: pinsrw $1, %eax, %xmm0
; SSE-NEXT: pextrw $2, %xmm1, %eax
; SSE-NEXT: leal 7(%rax), %ecx
; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: cmovnsl %eax, %ecx
; SSE-NEXT: andl $-8, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $2, %eax, %xmm0
; SSE-NEXT: pextrw $3, %xmm1, %eax
; SSE-NEXT: movswl %ax, %ecx
; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77
; SSE-NEXT: shrl $16, %ecx
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: movzwl %cx, %ecx
; SSE-NEXT: movswl %cx, %edx
; SSE-NEXT: shrl $15, %ecx
; SSE-NEXT: sarl $6, %edx
; SSE-NEXT: addl %ecx, %edx
; SSE-NEXT: imull $95, %edx, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: dont_fold_srem_power_of_two:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: leal 31(%rax), %ecx
; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: cmovnsl %eax, %ecx
; AVX-NEXT: andl $-32, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vmovd %xmm0, %ecx
; AVX-NEXT: leal 63(%rcx), %edx
; AVX-NEXT: testw %cx, %cx
; AVX-NEXT: cmovnsl %ecx, %edx
; AVX-NEXT: andl $-64, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vmovd %ecx, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $2, %xmm0, %eax
; AVX-NEXT: leal 7(%rax), %ecx
; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: cmovnsl %eax, %ecx
; AVX-NEXT: andl $-8, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77
; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: addl %eax, %ecx
; AVX-NEXT: movzwl %cx, %ecx
; AVX-NEXT: movswl %cx, %edx
; AVX-NEXT: shrl $15, %ecx
; AVX-NEXT: sarl $6, %edx
; AVX-NEXT: addl %ecx, %edx
; AVX-NEXT: imull $95, %edx, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
  ret <4 x i16> %1
}

; Don't fold if the divisor is one.
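; srem by 1 is always 0, so lane 0 is never computed: the result is built on
; a zeroed register (pxor/vpxor) and only lanes 1-3 are inserted with pinsrw.
; Lanes 1-3 (divisors 654, 23 and 5423) each go through their own
; magic-constant sequence.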
define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; SSE-LABEL: dont_fold_srem_one:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $2, %xmm0, %ecx
; SSE-NEXT: movswl %cx, %eax
; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217
; SSE-NEXT: shrl $16, %eax
; SSE-NEXT: addl %ecx, %eax
; SSE-NEXT: movzwl %ax, %edx
; SSE-NEXT: movswl %dx, %eax
; SSE-NEXT: shrl $15, %edx
; SSE-NEXT: sarl $4, %eax
; SSE-NEXT: addl %edx, %eax
; SSE-NEXT: leal (%rax,%rax,2), %edx
; SSE-NEXT: shll $3, %edx
; SSE-NEXT: subl %edx, %eax
; SSE-NEXT: addl %ecx, %eax
; SSE-NEXT: pextrw $1, %xmm0, %ecx
; SSE-NEXT: movswl %cx, %edx
; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B
; SSE-NEXT: movl %edx, %esi
; SSE-NEXT: shrl $31, %esi
; SSE-NEXT: sarl $23, %edx
; SSE-NEXT: addl %esi, %edx
; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pinsrw $1, %ecx, %xmm1
; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: movswl %ax, %ecx
; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
; SSE-NEXT: movl %ecx, %edx
; SSE-NEXT: shrl $31, %edx
; SSE-NEXT: sarl $26, %ecx
; SSE-NEXT: addl %edx, %ecx
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: dont_fold_srem_one:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $2, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: addl %eax, %ecx
; AVX-NEXT: movzwl %cx, %ecx
; AVX-NEXT: movswl %cx, %edx
; AVX-NEXT: shrl $15, %ecx
; AVX-NEXT: sarl $4, %edx
; AVX-NEXT: addl %ecx, %edx
; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
; AVX-NEXT: shll $3, %ecx
; AVX-NEXT: subl %ecx, %edx
; AVX-NEXT: addl %eax, %edx
; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B
; AVX-NEXT: movl %ecx, %esi
; AVX-NEXT: shrl $31, %esi
; AVX-NEXT: sarl $23, %ecx
; AVX-NEXT: addl %esi, %ecx
; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
; AVX-NEXT: movl %ecx, %edx
; AVX-NEXT: shrl $31, %edx
; AVX-NEXT: sarl $26, %ecx
; AVX-NEXT: addl %edx, %ecx
; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = srem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold if the divisor is 2^15.
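; The constant 32768 wraps to -32768 (INT16_MIN) in an i16 lane, so lane 1 is
; handled with the same bias-and-mask pattern as the power-of-two cases
; above, using bias 32767 and mask -32768; adding the masked value at the end
; is equivalent, modulo 2^16, to subtracting the multiple of the divisor.
; Lane 0 (divisor 1) again stays at the pxor/vpxor zero, and lanes 2 and 3
; reuse the magic constants for 23 and 5423.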
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_i16_smax:
; SSE: # %bb.0:
; SSE-NEXT: pextrw $2, %xmm0, %eax
; SSE-NEXT: movswl %ax, %ecx
; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
; SSE-NEXT: shrl $16, %ecx
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: movzwl %cx, %ecx
; SSE-NEXT: movswl %cx, %edx
; SSE-NEXT: shrl $15, %ecx
; SSE-NEXT: sarl $4, %edx
; SSE-NEXT: addl %ecx, %edx
; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
; SSE-NEXT: shll $3, %ecx
; SSE-NEXT: subl %ecx, %edx
; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: pextrw $1, %xmm0, %eax
; SSE-NEXT: leal 32767(%rax), %ecx
; SSE-NEXT: testw %ax, %ax
; SSE-NEXT: cmovnsl %eax, %ecx
; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000
; SSE-NEXT: addl %eax, %ecx
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pinsrw $1, %ecx, %xmm1
; SSE-NEXT: pinsrw $2, %edx, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
; SSE-NEXT: movswl %ax, %ecx
; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
; SSE-NEXT: movl %ecx, %edx
; SSE-NEXT: shrl $31, %edx
; SSE-NEXT: sarl $26, %ecx
; SSE-NEXT: addl %edx, %ecx
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: dont_fold_urem_i16_smax:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $2, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
; AVX-NEXT: shrl $16, %ecx
; AVX-NEXT: addl %eax, %ecx
; AVX-NEXT: movzwl %cx, %ecx
; AVX-NEXT: movswl %cx, %edx
; AVX-NEXT: shrl $15, %ecx
; AVX-NEXT: sarl $4, %edx
; AVX-NEXT: addl %ecx, %edx
; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
; AVX-NEXT: shll $3, %ecx
; AVX-NEXT: subl %ecx, %edx
; AVX-NEXT: addl %eax, %edx
; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: leal 32767(%rax), %ecx
; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: cmovnsl %eax, %ecx
; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000
; AVX-NEXT: addl %eax, %ecx
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: movswl %ax, %ecx
; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
; AVX-NEXT: movl %ecx, %edx
; AVX-NEXT: shrl $31, %edx
; AVX-NEXT: sarl $26, %ecx
; AVX-NEXT: addl %edx, %ecx
; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
  ret <4 x i16> %1
}

; Don't fold i64 srem.
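; There is no 64-bit vector multiply-high instruction, so each i64 element is
; moved to a GPR and divided with a widening imulq against a 64-bit magic
; constant (the high half of the 128-bit product lands in %rdx), followed by
; the usual shift and sign fixups.  Lane 0 (divisor 1) is again free:
; pslldq/vpslldq shifts the lane-1 remainder into the upper quadword and
; leaves zeros below it.  AVX1 and AVX2 differ only in the 128-bit
; extract/insert instructions (vextractf128/vinsertf128 vs.
; vextracti128/vinserti128).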
define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; SSE-LABEL: dont_fold_srem_i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: movq %xmm1, %rcx
; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: imulq %rdx
; SSE-NEXT: addq %rcx, %rdx
; SSE-NEXT: movq %rdx, %rax
; SSE-NEXT: shrq $63, %rax
; SSE-NEXT: sarq $4, %rdx
; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: leaq (%rdx,%rdx,2), %rax
; SSE-NEXT: shlq $3, %rax
; SSE-NEXT: subq %rax, %rdx
; SSE-NEXT: addq %rcx, %rdx
; SSE-NEXT: movq %rdx, %xmm1
; SSE-NEXT: pextrq $1, %xmm2, %rcx
; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: imulq %rdx
; SSE-NEXT: movq %rdx, %rax
; SSE-NEXT: shrq $63, %rax
; SSE-NEXT: sarq $11, %rdx
; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm2
; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: pextrq $1, %xmm0, %rcx
; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: imulq %rdx
; SSE-NEXT: movq %rdx, %rax
; SSE-NEXT: shrq $63, %rax
; SSE-NEXT: sarq $8, %rdx
; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX1-LABEL: dont_fold_srem_i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rdx
; AVX1-NEXT: addq %rcx, %rdx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq $4, %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax
; AVX1-NEXT: shlq $3, %rax
; AVX1-NEXT: subq %rax, %rdx
; AVX1-NEXT: addq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rdx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq $11, %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rdx
; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: shrq $63, %rax
; AVX1-NEXT: sarq $8, %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: dont_fold_srem_i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rdx
; AVX2-NEXT: addq %rcx, %rdx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq $4, %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (%rdx,%rdx,2), %rax
; AVX2-NEXT: shlq $3, %rax
; AVX2-NEXT: subq %rax, %rdx
; AVX2-NEXT: addq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rdx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq $11, %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rdx
; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: shrq $63, %rax
; AVX2-NEXT: sarq $8, %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
  ret <4 x i64> %1
}