1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64
5 ; If the target does not have a single div/rem operation,
6 ; the -div-rem-pairs pass will decompose the remainder calculation as:
7 ; X % Y --> X - ((X / Y) * Y)
8 ; But if the target does have a single div/rem operation,
9 ; the opposite transform is likely beneficial.
11 define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
12 ; X86-LABEL: scalar_i8:
14 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
15 ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
16 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
17 ; X86-NEXT: movsbl %cl, %eax
19 ; X86-NEXT: movb %al, (%edx)
21 ; X86-NEXT: subb %al, %cl
22 ; X86-NEXT: movl %ecx, %eax
25 ; X64-LABEL: scalar_i8:
27 ; X64-NEXT: movsbl %dil, %ecx
28 ; X64-NEXT: movl %ecx, %eax
29 ; X64-NEXT: idivb %sil
30 ; X64-NEXT: movb %al, (%rdx)
32 ; X64-NEXT: subb %al, %cl
33 ; X64-NEXT: movl %ecx, %eax
36 store i8 %div, ptr %divdst, align 4
42 define i16 @scalar_i16(i16 %x, i16 %y, ptr %divdst) nounwind {
43 ; X86-LABEL: scalar_i16:
45 ; X86-NEXT: pushl %edi
46 ; X86-NEXT: pushl %esi
47 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
48 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
49 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
50 ; X86-NEXT: movl %ecx, %eax
53 ; X86-NEXT: # kill: def $ax killed $ax def $eax
54 ; X86-NEXT: movw %ax, (%edi)
55 ; X86-NEXT: imull %eax, %esi
56 ; X86-NEXT: subl %esi, %ecx
57 ; X86-NEXT: movl %ecx, %eax
62 ; X64-LABEL: scalar_i16:
64 ; X64-NEXT: movq %rdx, %rcx
65 ; X64-NEXT: movl %edi, %eax
68 ; X64-NEXT: # kill: def $ax killed $ax def $eax
69 ; X64-NEXT: movw %ax, (%rcx)
70 ; X64-NEXT: imull %eax, %esi
71 ; X64-NEXT: subl %esi, %edi
72 ; X64-NEXT: movl %edi, %eax
74 %div = sdiv i16 %x, %y
75 store i16 %div, ptr %divdst, align 4
76 %t1 = mul i16 %div, %y
81 define i32 @scalar_i32(i32 %x, i32 %y, ptr %divdst) nounwind {
82 ; X86-LABEL: scalar_i32:
84 ; X86-NEXT: pushl %edi
85 ; X86-NEXT: pushl %esi
86 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
87 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
88 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
89 ; X86-NEXT: movl %ecx, %eax
91 ; X86-NEXT: idivl %edi
92 ; X86-NEXT: movl %eax, (%esi)
93 ; X86-NEXT: imull %edi, %eax
94 ; X86-NEXT: subl %eax, %ecx
95 ; X86-NEXT: movl %ecx, %eax
100 ; X64-LABEL: scalar_i32:
102 ; X64-NEXT: movq %rdx, %rcx
103 ; X64-NEXT: movl %edi, %eax
105 ; X64-NEXT: idivl %esi
106 ; X64-NEXT: movl %eax, (%rcx)
107 ; X64-NEXT: imull %esi, %eax
108 ; X64-NEXT: subl %eax, %edi
109 ; X64-NEXT: movl %edi, %eax
111 %div = sdiv i32 %x, %y
112 store i32 %div, ptr %divdst, align 4
113 %t1 = mul i32 %div, %y
114 %t2 = sub i32 %x, %t1
118 define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
119 ; X86-LABEL: scalar_i64:
121 ; X86-NEXT: pushl %ebp
122 ; X86-NEXT: pushl %ebx
123 ; X86-NEXT: pushl %edi
124 ; X86-NEXT: pushl %esi
125 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
126 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
127 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
128 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
129 ; X86-NEXT: pushl %ebp
130 ; X86-NEXT: pushl %ebx
131 ; X86-NEXT: pushl %edi
132 ; X86-NEXT: pushl %esi
133 ; X86-NEXT: calll __divdi3
134 ; X86-NEXT: addl $16, %esp
135 ; X86-NEXT: movl %edx, %ecx
136 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
137 ; X86-NEXT: movl %ecx, 4(%edx)
138 ; X86-NEXT: movl %eax, (%edx)
139 ; X86-NEXT: imull %eax, %ebp
140 ; X86-NEXT: mull %ebx
141 ; X86-NEXT: addl %ebp, %edx
142 ; X86-NEXT: imull %ebx, %ecx
143 ; X86-NEXT: addl %edx, %ecx
144 ; X86-NEXT: subl %eax, %esi
145 ; X86-NEXT: sbbl %ecx, %edi
146 ; X86-NEXT: movl %esi, %eax
147 ; X86-NEXT: movl %edi, %edx
148 ; X86-NEXT: popl %esi
149 ; X86-NEXT: popl %edi
150 ; X86-NEXT: popl %ebx
151 ; X86-NEXT: popl %ebp
154 ; X64-LABEL: scalar_i64:
156 ; X64-NEXT: movq %rdx, %rcx
157 ; X64-NEXT: movq %rdi, %rax
159 ; X64-NEXT: idivq %rsi
160 ; X64-NEXT: movq %rax, (%rcx)
161 ; X64-NEXT: imulq %rsi, %rax
162 ; X64-NEXT: subq %rax, %rdi
163 ; X64-NEXT: movq %rdi, %rax
165 %div = sdiv i64 %x, %y
166 store i64 %div, ptr %divdst, align 4
167 %t1 = mul i64 %div, %y
168 %t2 = sub i64 %x, %t1
172 ; X86 doesn't have __divti3, so the i128 sdiv is expanded into a loop.
173 define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
174 ; X86-LABEL: scalar_i128:
175 ; X86: # %bb.0: # %_udiv-special-cases
176 ; X86-NEXT: pushl %ebp
177 ; X86-NEXT: pushl %ebx
178 ; X86-NEXT: pushl %edi
179 ; X86-NEXT: pushl %esi
180 ; X86-NEXT: subl $152, %esp
181 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
182 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
183 ; X86-NEXT: movl %ecx, %eax
184 ; X86-NEXT: sarl $31, %eax
185 ; X86-NEXT: movl %edx, %edi
186 ; X86-NEXT: sarl $31, %edi
187 ; X86-NEXT: movl %eax, %esi
188 ; X86-NEXT: xorl %ecx, %esi
189 ; X86-NEXT: movl %esi, %ebp
190 ; X86-NEXT: movl %eax, %ecx
191 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
192 ; X86-NEXT: movl %ecx, %ebx
193 ; X86-NEXT: movl %eax, %ecx
194 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
195 ; X86-NEXT: movl %eax, %esi
196 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
197 ; X86-NEXT: subl %eax, %esi
198 ; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
199 ; X86-NEXT: sbbl %eax, %ecx
200 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
201 ; X86-NEXT: sbbl %eax, %ebx
202 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
203 ; X86-NEXT: sbbl %eax, %ebp
204 ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
205 ; X86-NEXT: movl %edi, %esi
206 ; X86-NEXT: xorl %edx, %esi
207 ; X86-NEXT: movl %edi, %edx
208 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
209 ; X86-NEXT: movl %edi, %ebx
210 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx
211 ; X86-NEXT: movl %edi, %ebp
212 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp
213 ; X86-NEXT: subl %edi, %ebp
214 ; X86-NEXT: sbbl %edi, %ebx
215 ; X86-NEXT: sbbl %edi, %edx
216 ; X86-NEXT: sbbl %edi, %esi
217 ; X86-NEXT: xorl %eax, %edi
218 ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
219 ; X86-NEXT: movl %ebx, %eax
220 ; X86-NEXT: orl %esi, %eax
221 ; X86-NEXT: movl %ebp, %ecx
222 ; X86-NEXT: orl %edx, %ecx
223 ; X86-NEXT: movl %edx, %edi
224 ; X86-NEXT: orl %eax, %ecx
226 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
227 ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
228 ; X86-NEXT: movl (%esp), %edx # 4-byte Reload
229 ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
230 ; X86-NEXT: orl %eax, %edx
232 ; X86-NEXT: orb %cl, %al
233 ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
234 ; X86-NEXT: bsrl %esi, %edx
235 ; X86-NEXT: xorl $31, %edx
236 ; X86-NEXT: bsrl %edi, %ecx
237 ; X86-NEXT: xorl $31, %ecx
238 ; X86-NEXT: addl $32, %ecx
239 ; X86-NEXT: testl %esi, %esi
240 ; X86-NEXT: cmovnel %edx, %ecx
241 ; X86-NEXT: bsrl %ebx, %edx
242 ; X86-NEXT: xorl $31, %edx
243 ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
244 ; X86-NEXT: bsrl %ebp, %ebp
245 ; X86-NEXT: xorl $31, %ebp
246 ; X86-NEXT: addl $32, %ebp
247 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
248 ; X86-NEXT: testl %ebx, %ebx
249 ; X86-NEXT: cmovnel %edx, %ebp
250 ; X86-NEXT: addl $64, %ebp
251 ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
252 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
253 ; X86-NEXT: orl %esi, %edi
254 ; X86-NEXT: cmovnel %ecx, %ebp
255 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
256 ; X86-NEXT: bsrl %edi, %edx
257 ; X86-NEXT: xorl $31, %edx
258 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
259 ; X86-NEXT: bsrl %eax, %ecx
260 ; X86-NEXT: xorl $31, %ecx
261 ; X86-NEXT: addl $32, %ecx
262 ; X86-NEXT: testl %edi, %edi
263 ; X86-NEXT: cmovnel %edx, %ecx
264 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
265 ; X86-NEXT: bsrl %ebx, %esi
266 ; X86-NEXT: xorl $31, %esi
267 ; X86-NEXT: bsrl (%esp), %edx # 4-byte Folded Reload
268 ; X86-NEXT: xorl $31, %edx
269 ; X86-NEXT: addl $32, %edx
270 ; X86-NEXT: testl %ebx, %ebx
271 ; X86-NEXT: cmovnel %esi, %edx
272 ; X86-NEXT: addl $64, %edx
273 ; X86-NEXT: movl %eax, %esi
274 ; X86-NEXT: orl %edi, %esi
275 ; X86-NEXT: cmovnel %ecx, %edx
276 ; X86-NEXT: xorl %esi, %esi
277 ; X86-NEXT: subl %edx, %ebp
278 ; X86-NEXT: movl $0, %ebx
279 ; X86-NEXT: sbbl %ebx, %ebx
280 ; X86-NEXT: movl $0, %edx
281 ; X86-NEXT: sbbl %edx, %edx
282 ; X86-NEXT: movl $0, %eax
283 ; X86-NEXT: sbbl %eax, %eax
284 ; X86-NEXT: movl $127, %ecx
285 ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
286 ; X86-NEXT: cmpl %ebp, %ecx
287 ; X86-NEXT: movl $0, %ecx
288 ; X86-NEXT: sbbl %ebx, %ecx
289 ; X86-NEXT: movl $0, %ecx
290 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
291 ; X86-NEXT: sbbl %edx, %ecx
292 ; X86-NEXT: movl $0, %ecx
293 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
294 ; X86-NEXT: sbbl %eax, %ecx
296 ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
297 ; X86-NEXT: cmovnel %esi, %edi
298 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
299 ; X86-NEXT: cmovnel %esi, %edx
300 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
301 ; X86-NEXT: cmovnel %esi, %eax
302 ; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload
303 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
304 ; X86-NEXT: jne .LBB4_8
305 ; X86-NEXT: # %bb.1: # %_udiv-special-cases
306 ; X86-NEXT: movl %ebx, %ecx
307 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
308 ; X86-NEXT: xorl $127, %ebx
309 ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
310 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
311 ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
312 ; X86-NEXT: orl %ebx, %ecx
313 ; X86-NEXT: je .LBB4_8
314 ; X86-NEXT: # %bb.2: # %udiv-bb1
315 ; X86-NEXT: movl (%esp), %eax # 4-byte Reload
316 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
317 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
318 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
319 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
320 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
321 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
322 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
323 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
324 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
325 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
326 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
327 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
328 ; X86-NEXT: # kill: def $al killed $al killed $eax
329 ; X86-NEXT: xorb $127, %al
330 ; X86-NEXT: movb %al, %ch
331 ; X86-NEXT: andb $7, %ch
332 ; X86-NEXT: shrb $3, %al
333 ; X86-NEXT: andb $15, %al
335 ; X86-NEXT: movsbl %al, %ebx
336 ; X86-NEXT: movl 144(%esp,%ebx), %edx
337 ; X86-NEXT: movl 148(%esp,%ebx), %edi
338 ; X86-NEXT: movb %ch, %cl
339 ; X86-NEXT: shldl %cl, %edx, %edi
340 ; X86-NEXT: shll %cl, %edx
342 ; X86-NEXT: movl 140(%esp,%ebx), %eax
343 ; X86-NEXT: movl %eax, %esi
344 ; X86-NEXT: shrl %esi
345 ; X86-NEXT: shrl %cl, %esi
346 ; X86-NEXT: orl %edx, %esi
347 ; X86-NEXT: movl %esi, %edx
348 ; X86-NEXT: movl 136(%esp,%ebx), %esi
349 ; X86-NEXT: movb %ch, %cl
350 ; X86-NEXT: shldl %cl, %esi, %eax
351 ; X86-NEXT: shll %cl, %esi
352 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
353 ; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
354 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
355 ; X86-NEXT: adcl $0, %ecx
356 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
357 ; X86-NEXT: adcl $0, %ebx
358 ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
359 ; X86-NEXT: jae .LBB4_3
361 ; X86-NEXT: xorl %ebx, %ebx
362 ; X86-NEXT: xorl %esi, %esi
363 ; X86-NEXT: jmp .LBB4_7
364 ; X86-NEXT: .LBB4_3: # %udiv-preheader
365 ; X86-NEXT: movl (%esp), %esi # 4-byte Reload
366 ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
367 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
368 ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
369 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
370 ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
371 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
372 ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
373 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
374 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
375 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
376 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
377 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
378 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
379 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
380 ; X86-NEXT: movb %bl, %ch
381 ; X86-NEXT: andb $7, %ch
382 ; X86-NEXT: movb %bl, %cl
383 ; X86-NEXT: shrb $3, %cl
384 ; X86-NEXT: andb $15, %cl
385 ; X86-NEXT: movzbl %cl, %ebp
386 ; X86-NEXT: movl 100(%esp,%ebp), %esi
387 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
388 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
389 ; X86-NEXT: movl 96(%esp,%ebp), %ebx
390 ; X86-NEXT: movl %ebp, %eax
391 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
392 ; X86-NEXT: movl %ebx, %edx
393 ; X86-NEXT: movb %ch, %cl
394 ; X86-NEXT: shrdl %cl, %esi, %edx
395 ; X86-NEXT: movl 88(%esp,%ebp), %ebp
396 ; X86-NEXT: movl 92(%esp,%eax), %esi
397 ; X86-NEXT: movl %esi, %eax
398 ; X86-NEXT: shrl %cl, %eax
400 ; X86-NEXT: addl %ebx, %ebx
401 ; X86-NEXT: shll %cl, %ebx
402 ; X86-NEXT: orl %eax, %ebx
403 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
404 ; X86-NEXT: movb %ch, %cl
405 ; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
406 ; X86-NEXT: shrdl %cl, %esi, %ebp
407 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
408 ; X86-NEXT: addl $-1, %eax
409 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
410 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
411 ; X86-NEXT: adcl $-1, %eax
412 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
413 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
414 ; X86-NEXT: adcl $-1, %eax
415 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
416 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
417 ; X86-NEXT: movl %ecx, %eax
418 ; X86-NEXT: adcl $-1, %eax
419 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
420 ; X86-NEXT: xorl %esi, %esi
421 ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
422 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
423 ; X86-NEXT: .p2align 4, 0x90
424 ; X86-NEXT: .LBB4_4: # %udiv-do-while
425 ; X86-NEXT: # =>This Inner Loop Header: Depth=1
426 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
427 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
428 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
429 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
430 ; X86-NEXT: shldl $1, %edx, %ebx
431 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
432 ; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
433 ; X86-NEXT: shldl $1, %ebp, %edx
434 ; X86-NEXT: shldl $1, %edi, %ebp
435 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
436 ; X86-NEXT: shldl $1, %eax, %edi
437 ; X86-NEXT: orl %esi, %edi
438 ; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
439 ; X86-NEXT: movl %ecx, %edi
440 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
441 ; X86-NEXT: shldl $1, %ecx, %eax
442 ; X86-NEXT: orl %esi, %eax
443 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
444 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
445 ; X86-NEXT: shldl $1, %eax, %ecx
446 ; X86-NEXT: orl %esi, %ecx
447 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
448 ; X86-NEXT: addl %eax, %eax
449 ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
450 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
451 ; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
452 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
453 ; X86-NEXT: sbbl %edx, %ecx
454 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
455 ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
456 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
457 ; X86-NEXT: sbbl %ebx, %ecx
458 ; X86-NEXT: sarl $31, %ecx
459 ; X86-NEXT: movl %ecx, %eax
460 ; X86-NEXT: andl $1, %eax
461 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
462 ; X86-NEXT: movl %ecx, %esi
463 ; X86-NEXT: andl %edi, %esi
464 ; X86-NEXT: movl %ecx, %edi
465 ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
466 ; X86-NEXT: movl %ecx, %eax
467 ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
468 ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
469 ; X86-NEXT: subl %ecx, %ebp
470 ; X86-NEXT: sbbl %eax, %edx
471 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
472 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
473 ; X86-NEXT: sbbl %edi, %edx
474 ; X86-NEXT: movl (%esp), %edi # 4-byte Reload
475 ; X86-NEXT: sbbl %esi, %ebx
476 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
477 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
478 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
479 ; X86-NEXT: addl $-1, %ecx
480 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
481 ; X86-NEXT: adcl $-1, %eax
482 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
483 ; X86-NEXT: adcl $-1, %esi
484 ; X86-NEXT: adcl $-1, %ebx
485 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
486 ; X86-NEXT: orl %ebx, %eax
487 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
488 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
489 ; X86-NEXT: orl %esi, %ecx
490 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
491 ; X86-NEXT: orl %eax, %ecx
492 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
493 ; X86-NEXT: jne .LBB4_4
495 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
496 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
497 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
498 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
499 ; X86-NEXT: .LBB4_7: # %udiv-loop-exit
500 ; X86-NEXT: shldl $1, %edx, %edi
501 ; X86-NEXT: orl %esi, %edi
502 ; X86-NEXT: shldl $1, %eax, %edx
503 ; X86-NEXT: orl %esi, %edx
504 ; X86-NEXT: movl %esi, %ecx
505 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
506 ; X86-NEXT: shldl $1, %esi, %eax
507 ; X86-NEXT: orl %ecx, %eax
508 ; X86-NEXT: addl %esi, %esi
509 ; X86-NEXT: orl %ebx, %esi
510 ; X86-NEXT: .LBB4_8: # %udiv-end
511 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
512 ; X86-NEXT: xorl %ecx, %edi
513 ; X86-NEXT: xorl %ecx, %edx
514 ; X86-NEXT: xorl %ecx, %eax
515 ; X86-NEXT: xorl %ecx, %esi
516 ; X86-NEXT: subl %ecx, %esi
517 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
518 ; X86-NEXT: sbbl %ecx, %eax
519 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
520 ; X86-NEXT: sbbl %ecx, %edx
521 ; X86-NEXT: sbbl %ecx, %edi
522 ; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
523 ; X86-NEXT: movl %esi, (%ebp)
524 ; X86-NEXT: movl %eax, 4(%ebp)
525 ; X86-NEXT: movl %edx, 8(%ebp)
526 ; X86-NEXT: movl %edi, 12(%ebp)
527 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
528 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
529 ; X86-NEXT: movl %edx, %ebx
530 ; X86-NEXT: mull %edi
531 ; X86-NEXT: movl %edx, %ecx
532 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
533 ; X86-NEXT: movl %esi, %eax
534 ; X86-NEXT: mull %edi
535 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
536 ; X86-NEXT: movl %edx, %edi
537 ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
538 ; X86-NEXT: adcl $0, %ecx
539 ; X86-NEXT: movl %esi, %eax
540 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
541 ; X86-NEXT: mull %esi
542 ; X86-NEXT: addl %edi, %eax
543 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
544 ; X86-NEXT: adcl %ecx, %edx
545 ; X86-NEXT: movl %edx, %edi
547 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
548 ; X86-NEXT: movl %esi, %eax
549 ; X86-NEXT: mull {{[0-9]+}}(%esp)
550 ; X86-NEXT: addl %edi, %eax
551 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
552 ; X86-NEXT: movzbl %cl, %eax
553 ; X86-NEXT: adcl %eax, %edx
554 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
555 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
556 ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
557 ; X86-NEXT: imull %eax, %ecx
558 ; X86-NEXT: mull %ebx
559 ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
560 ; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
561 ; X86-NEXT: addl %edx, %ebx
562 ; X86-NEXT: addl %ecx, %ebx
563 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
564 ; X86-NEXT: movl %eax, %ecx
565 ; X86-NEXT: imull %esi, %ecx
566 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
567 ; X86-NEXT: imull %edx, %ebp
568 ; X86-NEXT: mull %edx
569 ; X86-NEXT: addl %edx, %ebp
570 ; X86-NEXT: addl %ecx, %ebp
571 ; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
572 ; X86-NEXT: adcl %ebx, %ebp
573 ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
574 ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
575 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
576 ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
577 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
578 ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
579 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
580 ; X86-NEXT: sbbl %eax, %esi
581 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
582 ; X86-NEXT: sbbl %ebp, %edi
583 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
584 ; X86-NEXT: movl %edx, (%eax)
585 ; X86-NEXT: movl %ecx, 4(%eax)
586 ; X86-NEXT: movl %esi, 8(%eax)
587 ; X86-NEXT: movl %edi, 12(%eax)
588 ; X86-NEXT: addl $152, %esp
589 ; X86-NEXT: popl %esi
590 ; X86-NEXT: popl %edi
591 ; X86-NEXT: popl %ebx
592 ; X86-NEXT: popl %ebp
595 ; X64-LABEL: scalar_i128:
597 ; X64-NEXT: pushq %r15
598 ; X64-NEXT: pushq %r14
599 ; X64-NEXT: pushq %r13
600 ; X64-NEXT: pushq %r12
601 ; X64-NEXT: pushq %rbx
602 ; X64-NEXT: movq %r8, %r15
603 ; X64-NEXT: movq %rcx, %r12
604 ; X64-NEXT: movq %rdx, %r13
605 ; X64-NEXT: movq %rsi, %rbx
606 ; X64-NEXT: movq %rdi, %r14
607 ; X64-NEXT: callq __divti3@PLT
608 ; X64-NEXT: movq %rdx, %rcx
609 ; X64-NEXT: movq %rdx, 8(%r15)
610 ; X64-NEXT: movq %rax, (%r15)
611 ; X64-NEXT: imulq %rax, %r12
612 ; X64-NEXT: mulq %r13
613 ; X64-NEXT: addq %r12, %rdx
614 ; X64-NEXT: imulq %r13, %rcx
615 ; X64-NEXT: addq %rdx, %rcx
616 ; X64-NEXT: subq %rax, %r14
617 ; X64-NEXT: sbbq %rcx, %rbx
618 ; X64-NEXT: movq %r14, %rax
619 ; X64-NEXT: movq %rbx, %rdx
620 ; X64-NEXT: popq %rbx
621 ; X64-NEXT: popq %r12
622 ; X64-NEXT: popq %r13
623 ; X64-NEXT: popq %r14
624 ; X64-NEXT: popq %r15
626 %div = sdiv i128 %x, %y
627 store i128 %div, ptr %divdst, align 4
628 %t1 = mul i128 %div, %y
629 %t2 = sub i128 %x, %t1
633 define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwind {
634 ; X86-LABEL: vector_i128_i8:
636 ; X86-NEXT: pushl %ebp
637 ; X86-NEXT: movl %esp, %ebp
638 ; X86-NEXT: pushl %ebx
639 ; X86-NEXT: pushl %edi
640 ; X86-NEXT: pushl %esi
641 ; X86-NEXT: andl $-16, %esp
642 ; X86-NEXT: subl $48, %esp
643 ; X86-NEXT: movdqa %xmm0, (%esp)
644 ; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
645 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
646 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
647 ; X86-NEXT: movzbl %al, %eax
648 ; X86-NEXT: movd %eax, %xmm2
649 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
650 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
651 ; X86-NEXT: movzbl %al, %eax
652 ; X86-NEXT: movd %eax, %xmm3
653 ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
654 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
655 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
656 ; X86-NEXT: movzbl %al, %eax
657 ; X86-NEXT: movd %eax, %xmm4
658 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
659 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
660 ; X86-NEXT: movzbl %al, %eax
661 ; X86-NEXT: movd %eax, %xmm2
662 ; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
663 ; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
664 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
665 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
666 ; X86-NEXT: movzbl %al, %eax
667 ; X86-NEXT: movd %eax, %xmm3
668 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
669 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
670 ; X86-NEXT: movzbl %al, %eax
671 ; X86-NEXT: movd %eax, %xmm4
672 ; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
673 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
674 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
675 ; X86-NEXT: movzbl %al, %eax
676 ; X86-NEXT: movd %eax, %xmm5
677 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
678 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
679 ; X86-NEXT: movzbl %al, %eax
680 ; X86-NEXT: movd %eax, %xmm3
681 ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
682 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
683 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
684 ; X86-NEXT: movzbl %al, %eax
685 ; X86-NEXT: movd %eax, %xmm5
686 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
687 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
688 ; X86-NEXT: movzbl %al, %eax
689 ; X86-NEXT: movd %eax, %xmm6
690 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
691 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
692 ; X86-NEXT: movzbl %al, %edx
693 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
694 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
695 ; X86-NEXT: movzbl %al, %esi
696 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
697 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
698 ; X86-NEXT: movzbl %al, %edi
699 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
700 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
701 ; X86-NEXT: movzbl %al, %ebx
702 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
703 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
704 ; X86-NEXT: movl %eax, %ecx
705 ; X86-NEXT: movsbl (%esp), %eax
706 ; X86-NEXT: idivb {{[0-9]+}}(%esp)
707 ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
708 ; X86-NEXT: movd %edx, %xmm7
709 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
710 ; X86-NEXT: movd %esi, %xmm4
711 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
712 ; X86-NEXT: movd %edi, %xmm2
713 ; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
714 ; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
715 ; X86-NEXT: movd %ebx, %xmm5
716 ; X86-NEXT: movzbl %cl, %ecx
717 ; X86-NEXT: movd %ecx, %xmm6
718 ; X86-NEXT: movl 8(%ebp), %ecx
719 ; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
720 ; X86-NEXT: movzbl %al, %eax
721 ; X86-NEXT: movd %eax, %xmm2
722 ; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
723 ; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
724 ; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
725 ; X86-NEXT: movdqa %xmm2, %xmm4
726 ; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
727 ; X86-NEXT: movdqa %xmm4, (%ecx)
728 ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
729 ; X86-NEXT: movdqa %xmm1, %xmm4
730 ; X86-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
731 ; X86-NEXT: pmullw %xmm3, %xmm4
732 ; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
733 ; X86-NEXT: pand %xmm3, %xmm4
734 ; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
735 ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
736 ; X86-NEXT: pmullw %xmm2, %xmm1
737 ; X86-NEXT: pand %xmm3, %xmm1
738 ; X86-NEXT: packuswb %xmm4, %xmm1
739 ; X86-NEXT: psubb %xmm1, %xmm0
740 ; X86-NEXT: leal -12(%ebp), %esp
741 ; X86-NEXT: popl %esi
742 ; X86-NEXT: popl %edi
743 ; X86-NEXT: popl %ebx
744 ; X86-NEXT: popl %ebp
747 ; X64-LABEL: vector_i128_i8:
749 ; X64-NEXT: pushq %rbp
750 ; X64-NEXT: pushq %r15
751 ; X64-NEXT: pushq %r14
752 ; X64-NEXT: pushq %r13
753 ; X64-NEXT: pushq %r12
754 ; X64-NEXT: pushq %rbx
755 ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
756 ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
757 ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
758 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
759 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
760 ; X64-NEXT: movzbl %al, %eax
761 ; X64-NEXT: movd %eax, %xmm2
762 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
763 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
764 ; X64-NEXT: movzbl %al, %edi
765 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
766 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
767 ; X64-NEXT: movzbl %al, %esi
768 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
769 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
770 ; X64-NEXT: movzbl %al, %r8d
771 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
772 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
773 ; X64-NEXT: movzbl %al, %r9d
774 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
775 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
776 ; X64-NEXT: movzbl %al, %r10d
777 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
778 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
779 ; X64-NEXT: movzbl %al, %r11d
780 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
781 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
782 ; X64-NEXT: movzbl %al, %ebx
783 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
784 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
785 ; X64-NEXT: movzbl %al, %ebp
786 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
787 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
788 ; X64-NEXT: movzbl %al, %r14d
789 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
790 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
791 ; X64-NEXT: movzbl %al, %r15d
792 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
793 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
794 ; X64-NEXT: movzbl %al, %r12d
795 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
796 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
797 ; X64-NEXT: movzbl %al, %r13d
798 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
799 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
800 ; X64-NEXT: movzbl %al, %edx
801 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
802 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
803 ; X64-NEXT: movl %eax, %ecx
804 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
805 ; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
806 ; X64-NEXT: movd %edi, %xmm3
807 ; X64-NEXT: movd %esi, %xmm4
808 ; X64-NEXT: movd %r8d, %xmm5
809 ; X64-NEXT: movd %r9d, %xmm6
810 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
811 ; X64-NEXT: movd %r10d, %xmm7
812 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
813 ; X64-NEXT: movd %r11d, %xmm4
814 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
815 ; X64-NEXT: movd %ebx, %xmm2
816 ; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
817 ; X64-NEXT: movd %ebp, %xmm3
818 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
819 ; X64-NEXT: movd %r14d, %xmm4
820 ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
821 ; X64-NEXT: movd %r15d, %xmm6
822 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
823 ; X64-NEXT: movd %r12d, %xmm5
824 ; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
825 ; X64-NEXT: movd %r13d, %xmm3
826 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
827 ; X64-NEXT: movd %edx, %xmm6
828 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
829 ; X64-NEXT: movzbl %cl, %ecx
830 ; X64-NEXT: movd %ecx, %xmm4
831 ; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
832 ; X64-NEXT: movzbl %al, %eax
833 ; X64-NEXT: movd %eax, %xmm3
834 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
835 ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
836 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
837 ; X64-NEXT: movdqa %xmm3, %xmm4
838 ; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
839 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
840 ; X64-NEXT: movdqa %xmm4, (%rax)
841 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
842 ; X64-NEXT: movdqa %xmm1, %xmm4
843 ; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
844 ; X64-NEXT: pmullw %xmm2, %xmm4
845 ; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
846 ; X64-NEXT: pand %xmm2, %xmm4
847 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
848 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
849 ; X64-NEXT: pmullw %xmm3, %xmm1
850 ; X64-NEXT: pand %xmm2, %xmm1
851 ; X64-NEXT: packuswb %xmm4, %xmm1
852 ; X64-NEXT: psubb %xmm1, %xmm0
853 ; X64-NEXT: popq %rbx
854 ; X64-NEXT: popq %r12
855 ; X64-NEXT: popq %r13
856 ; X64-NEXT: popq %r14
857 ; X64-NEXT: popq %r15
858 ; X64-NEXT: popq %rbp
860 %div = sdiv <16 x i8> %x, %y
861 store <16 x i8> %div, ptr %divdst, align 16
862 %t1 = mul <16 x i8> %div, %y
863 %t2 = sub <16 x i8> %x, %t1
; <8 x i16> case: SSE2 has no vector integer division, so each of the 8
; lanes is extracted (pextrw/movd), divided with scalar idivw, and the
; quotients are repacked; the remainder is then recomputed from the stored
; quotient vector as x - div*y using pmullw + psubw.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
867 define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind {
868 ; X86-LABEL: vector_i128_i16:
870 ; X86-NEXT: pushl %esi
871 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
872 ; X86-NEXT: pextrw $7, %xmm0, %eax
873 ; X86-NEXT: pextrw $7, %xmm1, %esi
874 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
876 ; X86-NEXT: idivw %si
877 ; X86-NEXT: # kill: def $ax killed $ax def $eax
878 ; X86-NEXT: movd %eax, %xmm2
879 ; X86-NEXT: pextrw $6, %xmm0, %eax
880 ; X86-NEXT: pextrw $6, %xmm1, %esi
881 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
883 ; X86-NEXT: idivw %si
884 ; X86-NEXT: # kill: def $ax killed $ax def $eax
885 ; X86-NEXT: movd %eax, %xmm3
886 ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
887 ; X86-NEXT: pextrw $5, %xmm0, %eax
888 ; X86-NEXT: pextrw $5, %xmm1, %esi
889 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
891 ; X86-NEXT: idivw %si
892 ; X86-NEXT: # kill: def $ax killed $ax def $eax
893 ; X86-NEXT: movd %eax, %xmm4
894 ; X86-NEXT: pextrw $4, %xmm0, %eax
895 ; X86-NEXT: pextrw $4, %xmm1, %esi
896 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
898 ; X86-NEXT: idivw %si
899 ; X86-NEXT: # kill: def $ax killed $ax def $eax
900 ; X86-NEXT: movd %eax, %xmm2
901 ; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
902 ; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
903 ; X86-NEXT: pextrw $3, %xmm0, %eax
904 ; X86-NEXT: pextrw $3, %xmm1, %esi
905 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
907 ; X86-NEXT: idivw %si
908 ; X86-NEXT: # kill: def $ax killed $ax def $eax
909 ; X86-NEXT: movd %eax, %xmm4
910 ; X86-NEXT: pextrw $2, %xmm0, %eax
911 ; X86-NEXT: pextrw $2, %xmm1, %esi
912 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
914 ; X86-NEXT: idivw %si
915 ; X86-NEXT: # kill: def $ax killed $ax def $eax
916 ; X86-NEXT: movd %eax, %xmm3
917 ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
918 ; X86-NEXT: pextrw $1, %xmm0, %eax
919 ; X86-NEXT: pextrw $1, %xmm1, %esi
920 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
922 ; X86-NEXT: idivw %si
923 ; X86-NEXT: # kill: def $ax killed $ax def $eax
924 ; X86-NEXT: movd %eax, %xmm4
925 ; X86-NEXT: movd %xmm0, %eax
926 ; X86-NEXT: movd %xmm1, %esi
927 ; X86-NEXT: # kill: def $ax killed $ax killed $eax
929 ; X86-NEXT: idivw %si
930 ; X86-NEXT: # kill: def $ax killed $ax def $eax
931 ; X86-NEXT: movd %eax, %xmm5
932 ; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
933 ; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
934 ; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
935 ; X86-NEXT: movdqa %xmm5, (%ecx)
936 ; X86-NEXT: pmullw %xmm1, %xmm5
937 ; X86-NEXT: psubw %xmm5, %xmm0
938 ; X86-NEXT: popl %esi
941 ; X64-LABEL: vector_i128_i16:
943 ; X64-NEXT: pextrw $7, %xmm0, %eax
944 ; X64-NEXT: pextrw $7, %xmm1, %ecx
945 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
947 ; X64-NEXT: idivw %cx
948 ; X64-NEXT: # kill: def $ax killed $ax def $eax
949 ; X64-NEXT: movd %eax, %xmm2
950 ; X64-NEXT: pextrw $6, %xmm0, %eax
951 ; X64-NEXT: pextrw $6, %xmm1, %ecx
952 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
954 ; X64-NEXT: idivw %cx
955 ; X64-NEXT: # kill: def $ax killed $ax def $eax
956 ; X64-NEXT: movd %eax, %xmm3
957 ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
958 ; X64-NEXT: pextrw $5, %xmm0, %eax
959 ; X64-NEXT: pextrw $5, %xmm1, %ecx
960 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
962 ; X64-NEXT: idivw %cx
963 ; X64-NEXT: # kill: def $ax killed $ax def $eax
964 ; X64-NEXT: movd %eax, %xmm4
965 ; X64-NEXT: pextrw $4, %xmm0, %eax
966 ; X64-NEXT: pextrw $4, %xmm1, %ecx
967 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
969 ; X64-NEXT: idivw %cx
970 ; X64-NEXT: # kill: def $ax killed $ax def $eax
971 ; X64-NEXT: movd %eax, %xmm2
972 ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
973 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
974 ; X64-NEXT: pextrw $3, %xmm0, %eax
975 ; X64-NEXT: pextrw $3, %xmm1, %ecx
976 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
978 ; X64-NEXT: idivw %cx
979 ; X64-NEXT: # kill: def $ax killed $ax def $eax
980 ; X64-NEXT: movd %eax, %xmm3
981 ; X64-NEXT: pextrw $2, %xmm0, %eax
982 ; X64-NEXT: pextrw $2, %xmm1, %ecx
983 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
985 ; X64-NEXT: idivw %cx
986 ; X64-NEXT: # kill: def $ax killed $ax def $eax
987 ; X64-NEXT: movd %eax, %xmm4
988 ; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
989 ; X64-NEXT: pextrw $1, %xmm0, %eax
990 ; X64-NEXT: pextrw $1, %xmm1, %ecx
991 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
993 ; X64-NEXT: idivw %cx
994 ; X64-NEXT: # kill: def $ax killed $ax def $eax
995 ; X64-NEXT: movd %eax, %xmm3
996 ; X64-NEXT: movd %xmm0, %eax
997 ; X64-NEXT: movd %xmm1, %ecx
998 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
1000 ; X64-NEXT: idivw %cx
1001 ; X64-NEXT: # kill: def $ax killed $ax def $eax
1002 ; X64-NEXT: movd %eax, %xmm5
1003 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1004 ; X64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
1005 ; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
1006 ; X64-NEXT: movdqa %xmm5, (%rdi)
1007 ; X64-NEXT: pmullw %xmm1, %xmm5
1008 ; X64-NEXT: psubw %xmm5, %xmm0
1010 %div = sdiv <8 x i16> %x, %y
1011 store <8 x i16> %div, ptr %divdst, align 16
1012 %t1 = mul <8 x i16> %div, %y
1013 %t2 = sub <8 x i16> %x, %t1
; <4 x i32> case: lanes are scalarized via pshufd/movd + idivl and repacked;
; the remainder x - div*y is then formed with a pmuludq-based 32-bit vector
; multiply (even/odd lanes) followed by psubd.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
1017 define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounwind {
1018 ; X86-LABEL: vector_i128_i32:
1020 ; X86-NEXT: pushl %esi
1021 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1022 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
1023 ; X86-NEXT: movd %xmm2, %eax
1024 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
1025 ; X86-NEXT: movd %xmm2, %esi
1027 ; X86-NEXT: idivl %esi
1028 ; X86-NEXT: movd %eax, %xmm3
1029 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1030 ; X86-NEXT: movd %xmm2, %eax
1031 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
1032 ; X86-NEXT: movd %xmm2, %esi
1034 ; X86-NEXT: idivl %esi
1035 ; X86-NEXT: movd %eax, %xmm2
1036 ; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1037 ; X86-NEXT: movd %xmm0, %eax
1038 ; X86-NEXT: movd %xmm1, %esi
1040 ; X86-NEXT: idivl %esi
1041 ; X86-NEXT: movd %eax, %xmm3
1042 ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1043 ; X86-NEXT: movd %xmm4, %eax
1044 ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
1045 ; X86-NEXT: movd %xmm4, %esi
1047 ; X86-NEXT: idivl %esi
1048 ; X86-NEXT: movd %eax, %xmm4
1049 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1050 ; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1051 ; X86-NEXT: movdqa %xmm3, (%ecx)
1052 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1053 ; X86-NEXT: pmuludq %xmm1, %xmm3
1054 ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1055 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1056 ; X86-NEXT: pmuludq %xmm2, %xmm1
1057 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1058 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1059 ; X86-NEXT: psubd %xmm3, %xmm0
1060 ; X86-NEXT: popl %esi
1063 ; X64-LABEL: vector_i128_i32:
1065 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
1066 ; X64-NEXT: movd %xmm2, %eax
1067 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
1068 ; X64-NEXT: movd %xmm2, %ecx
1070 ; X64-NEXT: idivl %ecx
1071 ; X64-NEXT: movd %eax, %xmm2
1072 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
1073 ; X64-NEXT: movd %xmm3, %eax
1074 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1075 ; X64-NEXT: movd %xmm3, %ecx
1077 ; X64-NEXT: idivl %ecx
1078 ; X64-NEXT: movd %eax, %xmm3
1079 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1080 ; X64-NEXT: movd %xmm0, %eax
1081 ; X64-NEXT: movd %xmm1, %ecx
1083 ; X64-NEXT: idivl %ecx
1084 ; X64-NEXT: movd %eax, %xmm2
1085 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1086 ; X64-NEXT: movd %xmm4, %eax
1087 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
1088 ; X64-NEXT: movd %xmm4, %ecx
1090 ; X64-NEXT: idivl %ecx
1091 ; X64-NEXT: movd %eax, %xmm4
1092 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1093 ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1094 ; X64-NEXT: movdqa %xmm2, (%rdi)
1095 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1096 ; X64-NEXT: pmuludq %xmm1, %xmm2
1097 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1098 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1099 ; X64-NEXT: pmuludq %xmm3, %xmm1
1100 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1101 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1102 ; X64-NEXT: psubd %xmm2, %xmm0
1104 %div = sdiv <4 x i32> %x, %y
1105 store <4 x i32> %div, ptr %divdst, align 16
1106 %t1 = mul <4 x i32> %div, %y
1107 %t2 = sub <4 x i32> %x, %t1
; <2 x i64> case: 32-bit X86 has no 64-bit hardware divide, so each lane is
; spilled and divided through the __divdi3 libcall; X64 uses one scalar idivq
; per lane. The div*y product is rebuilt from 32-bit partial products
; (pmuludq/psrlq/psllq/paddq) before the final psubq.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
1111 define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounwind {
1112 ; X86-LABEL: vector_i128_i64:
1114 ; X86-NEXT: pushl %esi
1115 ; X86-NEXT: subl $64, %esp
1116 ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
1117 ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
1118 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1119 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
1120 ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
1121 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
1122 ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
1123 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
1124 ; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
1125 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1126 ; X86-NEXT: movd %xmm1, (%esp)
1127 ; X86-NEXT: calll __divdi3
1128 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
1129 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1130 ; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
1131 ; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
1132 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
1133 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1134 ; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
1135 ; X86-NEXT: movd %xmm1, (%esp)
1136 ; X86-NEXT: movd %edx, %xmm0
1137 ; X86-NEXT: movd %eax, %xmm1
1138 ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1139 ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
1140 ; X86-NEXT: calll __divdi3
1141 ; X86-NEXT: movd %edx, %xmm1
1142 ; X86-NEXT: movd %eax, %xmm3
1143 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1144 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
1145 ; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1146 ; X86-NEXT: movdqa %xmm3, (%esi)
1147 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
1148 ; X86-NEXT: movdqa %xmm0, %xmm1
1149 ; X86-NEXT: psrlq $32, %xmm1
1150 ; X86-NEXT: pmuludq %xmm3, %xmm1
1151 ; X86-NEXT: movdqa %xmm3, %xmm2
1152 ; X86-NEXT: psrlq $32, %xmm2
1153 ; X86-NEXT: pmuludq %xmm0, %xmm2
1154 ; X86-NEXT: paddq %xmm1, %xmm2
1155 ; X86-NEXT: psllq $32, %xmm2
1156 ; X86-NEXT: pmuludq %xmm0, %xmm3
1157 ; X86-NEXT: paddq %xmm2, %xmm3
1158 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
1159 ; X86-NEXT: psubq %xmm3, %xmm0
1160 ; X86-NEXT: addl $64, %esp
1161 ; X86-NEXT: popl %esi
1164 ; X64-LABEL: vector_i128_i64:
1166 ; X64-NEXT: movq %xmm0, %rax
1167 ; X64-NEXT: movq %xmm1, %rcx
1169 ; X64-NEXT: idivq %rcx
1170 ; X64-NEXT: movq %rax, %xmm2
1171 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
1172 ; X64-NEXT: movq %xmm3, %rax
1173 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1174 ; X64-NEXT: movq %xmm3, %rcx
1176 ; X64-NEXT: idivq %rcx
1177 ; X64-NEXT: movq %rax, %xmm3
1178 ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1179 ; X64-NEXT: movdqa %xmm2, (%rdi)
1180 ; X64-NEXT: movdqa %xmm1, %xmm3
1181 ; X64-NEXT: psrlq $32, %xmm3
1182 ; X64-NEXT: pmuludq %xmm2, %xmm3
1183 ; X64-NEXT: movdqa %xmm2, %xmm4
1184 ; X64-NEXT: psrlq $32, %xmm4
1185 ; X64-NEXT: pmuludq %xmm1, %xmm4
1186 ; X64-NEXT: paddq %xmm3, %xmm4
1187 ; X64-NEXT: psllq $32, %xmm4
1188 ; X64-NEXT: pmuludq %xmm1, %xmm2
1189 ; X64-NEXT: paddq %xmm4, %xmm2
1190 ; X64-NEXT: psubq %xmm2, %xmm0
1192 %div = sdiv <2 x i64> %x, %y
1193 store <2 x i64> %div, ptr %divdst, align 16
1194 %t1 = mul <2 x i64> %div, %y
1195 %t2 = sub <2 x i64> %x, %t1
; Same div+rem pair shape as the scalar i32 test, but the multiply is written
; as y*div (operands commuted) and %y is loaded from memory; a single idivl
; is still emitted, with the remainder recomputed via imull/subl.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
1201 define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
1202 ; X86-LABEL: scalar_i32_commutative:
1204 ; X86-NEXT: pushl %edi
1205 ; X86-NEXT: pushl %esi
1206 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1207 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1208 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1209 ; X86-NEXT: movl (%eax), %edi
1210 ; X86-NEXT: movl %ecx, %eax
1212 ; X86-NEXT: idivl %edi
1213 ; X86-NEXT: movl %eax, (%esi)
1214 ; X86-NEXT: imull %eax, %edi
1215 ; X86-NEXT: subl %edi, %ecx
1216 ; X86-NEXT: movl %ecx, %eax
1217 ; X86-NEXT: popl %esi
1218 ; X86-NEXT: popl %edi
1221 ; X64-LABEL: scalar_i32_commutative:
1223 ; X64-NEXT: movq %rdx, %rcx
1224 ; X64-NEXT: movl (%rsi), %esi
1225 ; X64-NEXT: movl %edi, %eax
1227 ; X64-NEXT: idivl %esi
1228 ; X64-NEXT: movl %eax, (%rcx)
1229 ; X64-NEXT: imull %eax, %esi
1230 ; X64-NEXT: subl %esi, %edi
1231 ; X64-NEXT: movl %edi, %eax
1233 %y = load i32, ptr %ysrc, align 4
1234 %div = sdiv i32 %x, %y
1235 store i32 %div, ptr %divdst, align 4
1236 %t1 = mul i32 %y, %div ; commutative
1237 %t2 = sub i32 %x, %t1
1241 ; We do not care about extra uses.
; Both the quotient and the div*y product have an extra use (each is stored);
; a single idivl followed by imull/subl is still the expected sequence.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
1242 define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
1243 ; X86-LABEL: extrause:
1245 ; X86-NEXT: pushl %ebx
1246 ; X86-NEXT: pushl %edi
1247 ; X86-NEXT: pushl %esi
1248 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1249 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
1250 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1251 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
1252 ; X86-NEXT: movl %ecx, %eax
1254 ; X86-NEXT: idivl %ebx
1255 ; X86-NEXT: movl %eax, (%edi)
1256 ; X86-NEXT: imull %ebx, %eax
1257 ; X86-NEXT: movl %eax, (%esi)
1258 ; X86-NEXT: subl %eax, %ecx
1259 ; X86-NEXT: movl %ecx, %eax
1260 ; X86-NEXT: popl %esi
1261 ; X86-NEXT: popl %edi
1262 ; X86-NEXT: popl %ebx
1265 ; X64-LABEL: extrause:
1267 ; X64-NEXT: movq %rdx, %r8
1268 ; X64-NEXT: movl %edi, %eax
1270 ; X64-NEXT: idivl %esi
1271 ; X64-NEXT: movl %eax, (%r8)
1272 ; X64-NEXT: imull %esi, %eax
1273 ; X64-NEXT: movl %eax, (%rcx)
1274 ; X64-NEXT: subl %eax, %edi
1275 ; X64-NEXT: movl %edi, %eax
1277 %div = sdiv i32 %x, %y
1278 store i32 %div, ptr %divdst, align 4
1279 %t1 = mul i32 %div, %y
1280 store i32 %t1, ptr %t1dst, align 4
1281 %t2 = sub i32 %x, %t1
1285 ; 'rem' should appear next to 'div'.
; The mul/sub remainder reconstruction lives in a separate basic block,
; conditionally executed on %store_srem, while the divide stays in the entry
; block.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
1286 define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_srem, ptr %sremdst) nounwind {
1287 ; X86-LABEL: multiple_bb:
1289 ; X86-NEXT: pushl %ebx
1290 ; X86-NEXT: pushl %edi
1291 ; X86-NEXT: pushl %esi
1292 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1293 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1294 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
1295 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
1296 ; X86-NEXT: movl %ecx, %eax
1298 ; X86-NEXT: idivl %esi
1299 ; X86-NEXT: movl %eax, (%edi)
1300 ; X86-NEXT: testb %bl, %bl
1301 ; X86-NEXT: je .LBB11_2
1302 ; X86-NEXT: # %bb.1: # %do_srem
1303 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1304 ; X86-NEXT: movl %eax, %edi
1305 ; X86-NEXT: imull %esi, %edi
1306 ; X86-NEXT: subl %edi, %ecx
1307 ; X86-NEXT: movl %ecx, (%edx)
1308 ; X86-NEXT: .LBB11_2: # %end
1309 ; X86-NEXT: popl %esi
1310 ; X86-NEXT: popl %edi
1311 ; X86-NEXT: popl %ebx
1314 ; X64-LABEL: multiple_bb:
1316 ; X64-NEXT: movq %rdx, %r9
1317 ; X64-NEXT: movl %edi, %eax
1319 ; X64-NEXT: idivl %esi
1320 ; X64-NEXT: movl %eax, (%r9)
1321 ; X64-NEXT: testl %ecx, %ecx
1322 ; X64-NEXT: je .LBB11_2
1323 ; X64-NEXT: # %bb.1: # %do_srem
1324 ; X64-NEXT: movl %eax, %ecx
1325 ; X64-NEXT: imull %esi, %ecx
1326 ; X64-NEXT: subl %ecx, %edi
1327 ; X64-NEXT: movl %edi, (%r8)
1328 ; X64-NEXT: .LBB11_2: # %end
1330 %div = sdiv i32 %x, %y
1331 store i32 %div, ptr %divdst, align 4
1332 br i1 %store_srem, label %do_srem, label %end
1334 %t1 = mul i32 %div, %y
1335 %t2 = sub i32 %x, %t1
1336 store i32 %t2, ptr %sremdst, align 4
; Negative test: the sdiv uses %x0 but the sub uses %x1, so mul+sub is not
; the remainder of this divide; idivl followed by imull/subl is still
; emitted.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
1342 define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
1343 ; X86-LABEL: negative_different_x:
1345 ; X86-NEXT: pushl %edi
1346 ; X86-NEXT: pushl %esi
1347 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1348 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1349 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1350 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
1352 ; X86-NEXT: idivl %edi
1353 ; X86-NEXT: movl %eax, (%esi)
1354 ; X86-NEXT: imull %edi, %eax
1355 ; X86-NEXT: subl %eax, %ecx
1356 ; X86-NEXT: movl %ecx, %eax
1357 ; X86-NEXT: popl %esi
1358 ; X86-NEXT: popl %edi
1361 ; X64-LABEL: negative_different_x:
1363 ; X64-NEXT: movl %edx, %r8d
1364 ; X64-NEXT: movl %edi, %eax
1366 ; X64-NEXT: idivl %r8d
1367 ; X64-NEXT: movl %eax, (%rcx)
1368 ; X64-NEXT: imull %r8d, %eax
1369 ; X64-NEXT: subl %eax, %esi
1370 ; X64-NEXT: movl %esi, %eax
1372 %div = sdiv i32 %x0, %y ; not %x1
1373 store i32 %div, ptr %divdst, align 4
1374 %t1 = mul i32 %div, %y
1375 %t2 = sub i32 %x1, %t1 ; not %x0
; Negative test: the divide is by %z but the multiply is by %y, so the
; mul/sub does not reconstruct the remainder of the sdiv.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not hand-edit.
1379 define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, ptr %divdst) nounwind {
1380 ; X86-LABEL: negative_different_y:
1382 ; X86-NEXT: pushl %esi
1383 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1384 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1385 ; X86-NEXT: movl %ecx, %eax
1387 ; X86-NEXT: idivl {{[0-9]+}}(%esp)
1388 ; X86-NEXT: movl %eax, (%esi)
1389 ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax
1390 ; X86-NEXT: subl %eax, %ecx
1391 ; X86-NEXT: movl %ecx, %eax
1392 ; X86-NEXT: popl %esi
1395 ; X64-LABEL: negative_different_y:
1397 ; X64-NEXT: movl %edx, %edi
1398 ; X64-NEXT: movl %esi, %eax
1400 ; X64-NEXT: idivl %ecx
1401 ; X64-NEXT: movl %eax, (%r8)
1402 ; X64-NEXT: imull %eax, %edi
1403 ; X64-NEXT: subl %edi, %esi
1404 ; X64-NEXT: movl %esi, %eax
1406 %div = sdiv i32 %x1, %z ; not %x0
1407 store i32 %div, ptr %divdst, align 4
1408 %t1 = mul i32 %div, %y
1409 %t2 = sub i32 %x1, %t1
1413 define i32 @negative_inverted_division(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
1414 ; X86-LABEL: negative_inverted_division:
1416 ; X86-NEXT: pushl %esi
1417 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
1418 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1419 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1421 ; X86-NEXT: idivl %ecx
1422 ; X86-NEXT: movl %eax, (%esi)
1423 ; X86-NEXT: imull %ecx, %eax
1424 ; X86-NEXT: subl %eax, %ecx
1425 ; X86-NEXT: movl %ecx, %eax
1426 ; X86-NEXT: popl %esi
1429 ; X64-LABEL: negative_inverted_division:
1431 ; X64-NEXT: movl %edi, %eax
1433 ; X64-NEXT: idivl %esi
1434 ; X64-NEXT: movl %eax, (%rcx)
1435 ; X64-NEXT: imull %esi, %eax
1436 ; X64-NEXT: subl %eax, %esi
1437 ; X64-NEXT: movl %esi, %eax
1439 %div = sdiv i32 %x0, %x1 ; inverted division
1440 store i32 %div, ptr %divdst, align 4
1441 %t1 = mul i32 %div, %x1
1442 %t2 = sub i32 %x1, %t1