1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP
10 ; fold (sdiv x, 1) -> x
11 define i32 @combine_sdiv_by_one(i32 %x) {
12 ; CHECK-LABEL: combine_sdiv_by_one:
14 ; CHECK-NEXT: movl %edi, %eax
20 define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
21 ; CHECK-LABEL: combine_vec_sdiv_by_one:
24 %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
28 ; fold (sdiv x, -1) -> 0 - x
29 define i32 @combine_sdiv_by_negone(i32 %x) {
30 ; CHECK-LABEL: combine_sdiv_by_negone:
32 ; CHECK-NEXT: movl %edi, %eax
33 ; CHECK-NEXT: negl %eax
39 define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
40 ; SSE-LABEL: combine_vec_sdiv_by_negone:
42 ; SSE-NEXT: pxor %xmm1, %xmm1
43 ; SSE-NEXT: psubd %xmm0, %xmm1
44 ; SSE-NEXT: movdqa %xmm1, %xmm0
47 ; AVX-LABEL: combine_vec_sdiv_by_negone:
49 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
50 ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
52 %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
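; A minimal IR-level sketch of the fold above (illustrative only, not part of the checked
; output; the function name is hypothetical): sdiv by -1 is rewritten as a subtraction from
; zero, which is exactly what the pxor/psubd (SSE) and vpxor/vpsubd (AVX) sequences implement.
define <4 x i32> @sketch_sdiv_by_negone(<4 x i32> %x) {
  ; 0 - x yields the same lanes as x / -1
  %r = sub <4 x i32> zeroinitializer, %x
  ret <4 x i32> %r
}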
56 ; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
57 define i32 @combine_sdiv_by_minsigned(i32 %x) {
58 ; CHECK-LABEL: combine_sdiv_by_minsigned:
60 ; CHECK-NEXT: xorl %eax, %eax
61 ; CHECK-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
62 ; CHECK-NEXT: sete %al
64 %1 = sdiv i32 %x, -2147483648
68 define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
69 ; SSE-LABEL: combine_vec_sdiv_by_minsigned:
71 ; SSE-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
72 ; SSE-NEXT: psrld $31, %xmm0
75 ; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
77 ; AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
78 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
81 ; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
83 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
84 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
85 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
88 ; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
90 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
91 ; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
92 ; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
95 ; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
97 ; AVX512BW-NEXT: vpcmpeqd {{.*}}(%rip){1to4}, %xmm0, %k1
98 ; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
101 ; XOP-LABEL: combine_vec_sdiv_by_minsigned:
103 ; XOP-NEXT: vpcomeqd {{.*}}(%rip), %xmm0, %xmm0
104 ; XOP-NEXT: vpsrld $31, %xmm0, %xmm0
106 %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
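; A scalar IR sketch of the INT_MIN fold (illustrative only; hypothetical function name):
; x / INT_MIN is 1 exactly when x == INT_MIN and 0 otherwise, which is the cmpl/sete pair in
; the scalar test and the pcmpeqd + psrld $31 sequence in the vector tests above.
define i32 @sketch_sdiv_by_minsigned(i32 %x) {
  %c = icmp eq i32 %x, -2147483648
  %r = zext i1 %c to i32
  ret i32 %r
}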
110 ; fold (sdiv 0, x) -> 0
111 define i32 @combine_sdiv_zero(i32 %x) {
112 ; CHECK-LABEL: combine_sdiv_zero:
114 ; CHECK-NEXT: xorl %eax, %eax
120 define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
121 ; SSE-LABEL: combine_vec_sdiv_zero:
123 ; SSE-NEXT: xorps %xmm0, %xmm0
126 ; AVX-LABEL: combine_vec_sdiv_zero:
128 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
130 %1 = sdiv <4 x i32> zeroinitializer, %x
134 ; fold (sdiv x, x) -> 1
135 define i32 @combine_sdiv_dupe(i32 %x) {
136 ; CHECK-LABEL: combine_sdiv_dupe:
138 ; CHECK-NEXT: movl $1, %eax
144 define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
145 ; SSE-LABEL: combine_vec_sdiv_dupe:
147 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
150 ; AVX1-LABEL: combine_vec_sdiv_dupe:
152 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
155 ; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
156 ; AVX2ORLATER: # %bb.0:
157 ; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
158 ; AVX2ORLATER-NEXT: retq
160 ; XOP-LABEL: combine_vec_sdiv_dupe:
162 ; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
164 %1 = sdiv <4 x i32> %x, %x
168 ; fold (sdiv x, y) -> (udiv x, y) iff x and y are known non-negative
169 define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
170 ; SSE-LABEL: combine_vec_sdiv_by_pos0:
172 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
173 ; SSE-NEXT: psrld $2, %xmm0
176 ; AVX-LABEL: combine_vec_sdiv_by_pos0:
178 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
179 ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
181 %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
182 %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
186 define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
187 ; SSE2-LABEL: combine_vec_sdiv_by_pos1:
189 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
190 ; SSE2-NEXT: movdqa %xmm0, %xmm1
191 ; SSE2-NEXT: psrld $4, %xmm1
192 ; SSE2-NEXT: movdqa %xmm0, %xmm2
193 ; SSE2-NEXT: psrld $3, %xmm2
194 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
195 ; SSE2-NEXT: movdqa %xmm0, %xmm1
196 ; SSE2-NEXT: psrld $2, %xmm1
197 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
198 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
201 ; SSE41-LABEL: combine_vec_sdiv_by_pos1:
203 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
204 ; SSE41-NEXT: movdqa %xmm0, %xmm2
205 ; SSE41-NEXT: movdqa %xmm0, %xmm1
206 ; SSE41-NEXT: psrld $3, %xmm1
207 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
208 ; SSE41-NEXT: psrld $4, %xmm0
209 ; SSE41-NEXT: psrld $2, %xmm2
210 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
211 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
212 ; SSE41-NEXT: movdqa %xmm1, %xmm0
215 ; AVX1-LABEL: combine_vec_sdiv_by_pos1:
217 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
218 ; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
219 ; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
220 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
221 ; AVX1-NEXT: vpsrld $3, %xmm0, %xmm2
222 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
223 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
226 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
227 ; AVX2ORLATER: # %bb.0:
228 ; AVX2ORLATER-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
229 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
230 ; AVX2ORLATER-NEXT: retq
232 ; XOP-LABEL: combine_vec_sdiv_by_pos1:
234 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
235 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
237 %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
238 %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
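; An IR-level sketch of the non-negative-operand fold (illustrative only; hypothetical name):
; once the mask proves the sign bits are clear, the signed divide degenerates to an unsigned
; one, so a power-of-two divisor becomes a plain logical shift (the pand + psrld sequences above).
define <4 x i32> @sketch_sdiv_by_pos(<4 x i32> %x) {
  %m = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %r = lshr <4 x i32> %m, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %r
}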
242 ; fold (sdiv x, (1 << c)) -> (x + ((x >>s bw-1) >>u (bw-c))) >>s c, i.e. an arithmetic shift with a rounding bias so negative values round toward zero
243 define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
244 ; SSE-LABEL: combine_vec_sdiv_by_pow2a:
246 ; SSE-NEXT: movdqa %xmm0, %xmm1
247 ; SSE-NEXT: psrad $31, %xmm1
248 ; SSE-NEXT: psrld $30, %xmm1
249 ; SSE-NEXT: paddd %xmm0, %xmm1
250 ; SSE-NEXT: psrad $2, %xmm1
251 ; SSE-NEXT: movdqa %xmm1, %xmm0
254 ; AVX-LABEL: combine_vec_sdiv_by_pow2a:
256 ; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
257 ; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
258 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
259 ; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
261 %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
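; An IR-level sketch of the signed power-of-two expansion checked above (illustrative only;
; hypothetical name). The bias is (x >>s 31) >>u (32 - c): all-ones for negative lanes, shifted
; down to 2^c - 1, so the final arithmetic shift rounds toward zero. This is the
; psrad $31 / psrld $30 / paddd / psrad $2 sequence; the _neg variant below simply subtracts
; the result from zero afterwards.
define <4 x i32> @sketch_sdiv_by_4(<4 x i32> %x) {
  %sign = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %bias = lshr <4 x i32> %sign, <i32 30, i32 30, i32 30, i32 30>
  %sum  = add <4 x i32> %x, %bias
  %res  = ashr <4 x i32> %sum, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %res
}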
265 define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
266 ; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
268 ; SSE-NEXT: movdqa %xmm0, %xmm1
269 ; SSE-NEXT: psrad $31, %xmm1
270 ; SSE-NEXT: psrld $30, %xmm1
271 ; SSE-NEXT: paddd %xmm0, %xmm1
272 ; SSE-NEXT: psrad $2, %xmm1
273 ; SSE-NEXT: pxor %xmm0, %xmm0
274 ; SSE-NEXT: psubd %xmm1, %xmm0
277 ; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
279 ; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
280 ; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
281 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
282 ; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
283 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
284 ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
286 %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
290 define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
291 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
293 ; SSE2-NEXT: pxor %xmm1, %xmm1
294 ; SSE2-NEXT: pxor %xmm2, %xmm2
295 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
296 ; SSE2-NEXT: movdqa %xmm2, %xmm3
297 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
298 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
299 ; SSE2-NEXT: pmullw %xmm4, %xmm3
300 ; SSE2-NEXT: psrlw $8, %xmm3
301 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
302 ; SSE2-NEXT: pmullw %xmm4, %xmm2
303 ; SSE2-NEXT: psrlw $8, %xmm2
304 ; SSE2-NEXT: packuswb %xmm3, %xmm2
305 ; SSE2-NEXT: paddb %xmm0, %xmm2
306 ; SSE2-NEXT: movdqa %xmm2, %xmm1
307 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
308 ; SSE2-NEXT: psraw $8, %xmm1
309 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
310 ; SSE2-NEXT: pmullw %xmm3, %xmm1
311 ; SSE2-NEXT: psrlw $8, %xmm1
312 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
313 ; SSE2-NEXT: psraw $8, %xmm2
314 ; SSE2-NEXT: pmullw %xmm3, %xmm2
315 ; SSE2-NEXT: psrlw $8, %xmm2
316 ; SSE2-NEXT: packuswb %xmm1, %xmm2
317 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
318 ; SSE2-NEXT: pand %xmm1, %xmm2
319 ; SSE2-NEXT: pandn %xmm0, %xmm1
320 ; SSE2-NEXT: por %xmm2, %xmm1
321 ; SSE2-NEXT: movdqa %xmm1, %xmm0
324 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
326 ; SSE41-NEXT: movdqa %xmm0, %xmm1
327 ; SSE41-NEXT: pxor %xmm0, %xmm0
328 ; SSE41-NEXT: pxor %xmm3, %xmm3
329 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
330 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
331 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
332 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
333 ; SSE41-NEXT: pmullw %xmm0, %xmm3
334 ; SSE41-NEXT: psrlw $8, %xmm3
335 ; SSE41-NEXT: pmullw %xmm0, %xmm2
336 ; SSE41-NEXT: psrlw $8, %xmm2
337 ; SSE41-NEXT: packuswb %xmm3, %xmm2
338 ; SSE41-NEXT: paddb %xmm1, %xmm2
339 ; SSE41-NEXT: movdqa %xmm2, %xmm0
340 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
341 ; SSE41-NEXT: psraw $8, %xmm0
342 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
343 ; SSE41-NEXT: pmullw %xmm3, %xmm0
344 ; SSE41-NEXT: psrlw $8, %xmm0
345 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
346 ; SSE41-NEXT: psraw $8, %xmm2
347 ; SSE41-NEXT: pmullw %xmm3, %xmm2
348 ; SSE41-NEXT: psrlw $8, %xmm2
349 ; SSE41-NEXT: packuswb %xmm0, %xmm2
350 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
351 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
352 ; SSE41-NEXT: movdqa %xmm1, %xmm0
355 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
357 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
358 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
359 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
360 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2]
361 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
362 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
363 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
364 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
365 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
366 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
367 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
368 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
369 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
370 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
371 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
372 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
373 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
374 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
375 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
376 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
377 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
378 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
379 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
382 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
384 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
385 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
386 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
387 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
388 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
389 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
390 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
391 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
392 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
393 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
394 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
395 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
396 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
397 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
398 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
399 ; AVX2-NEXT: vzeroupper
402 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
404 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
405 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
406 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
407 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
408 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
409 ; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
410 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
411 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
412 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
413 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
414 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
415 ; AVX512F-NEXT: vzeroupper
418 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
420 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
421 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
422 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
423 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
424 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
425 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1
426 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
427 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm1, %ymm1
428 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
429 ; AVX512BW-NEXT: movw $257, %ax # imm = 0x101
430 ; AVX512BW-NEXT: kmovd %eax, %k1
431 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
432 ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
433 ; AVX512BW-NEXT: vzeroupper
434 ; AVX512BW-NEXT: retq
436 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
438 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
439 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
440 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
441 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
442 ; XOP-NEXT: vpshab {{.*}}(%rip), %xmm1, %xmm1
443 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
444 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
446 %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
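; Note on the v16i8 lowering above: x86 has no byte-granular shifts, so both the bias and the
; final arithmetic shift are done in 16-bit lanes. The sign mask (pcmpgtb against zero) is
; widened with punpck/pmovzx, multiplied by per-lane powers of two (pmullw acting as a left
; shift) and narrowed with psrlw $8 + packuswb; the same widen/multiply/narrow pattern performs
; the per-lane arithmetic shift, and the divide-by-1 lanes (0 and 8) are blended back from the
; original input. AVX512BW instead uses real variable word shifts (vpsrlvw/vpsravw) and a mask
; register for the final blend.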
450 define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
451 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
453 ; SSE2-NEXT: movdqa %xmm0, %xmm1
454 ; SSE2-NEXT: psraw $15, %xmm1
455 ; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1
456 ; SSE2-NEXT: paddw %xmm0, %xmm1
457 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
458 ; SSE2-NEXT: movdqa %xmm1, %xmm3
459 ; SSE2-NEXT: pand %xmm2, %xmm3
460 ; SSE2-NEXT: psraw $4, %xmm1
461 ; SSE2-NEXT: pandn %xmm1, %xmm2
462 ; SSE2-NEXT: por %xmm3, %xmm2
463 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
464 ; SSE2-NEXT: movdqa %xmm2, %xmm3
465 ; SSE2-NEXT: pand %xmm1, %xmm3
466 ; SSE2-NEXT: psraw $2, %xmm2
467 ; SSE2-NEXT: pandn %xmm2, %xmm1
468 ; SSE2-NEXT: por %xmm3, %xmm1
469 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
470 ; SSE2-NEXT: movdqa %xmm1, %xmm3
471 ; SSE2-NEXT: pand %xmm2, %xmm3
472 ; SSE2-NEXT: psraw $1, %xmm1
473 ; SSE2-NEXT: pandn %xmm1, %xmm2
474 ; SSE2-NEXT: por %xmm3, %xmm2
475 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
476 ; SSE2-NEXT: pand %xmm1, %xmm2
477 ; SSE2-NEXT: pandn %xmm0, %xmm1
478 ; SSE2-NEXT: por %xmm2, %xmm1
479 ; SSE2-NEXT: movdqa %xmm1, %xmm0
482 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
484 ; SSE41-NEXT: movdqa %xmm0, %xmm1
485 ; SSE41-NEXT: psraw $15, %xmm1
486 ; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1
487 ; SSE41-NEXT: paddw %xmm0, %xmm1
488 ; SSE41-NEXT: movdqa %xmm1, %xmm2
489 ; SSE41-NEXT: psraw $1, %xmm2
490 ; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm1
491 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
492 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
493 ; SSE41-NEXT: movdqa %xmm1, %xmm0
496 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
498 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
499 ; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
500 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
501 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
502 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1
503 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
504 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
507 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
509 ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
510 ; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
511 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
512 ; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2
513 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1
514 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
515 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
518 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
520 ; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1
521 ; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
522 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1
523 ; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
524 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
525 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
526 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
527 ; AVX512F-NEXT: vzeroupper
530 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
532 ; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1
533 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
534 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1
535 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm1, %xmm1
536 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
537 ; AVX512BW-NEXT: retq
539 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
541 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm1
542 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1
543 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1
544 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm1, %xmm1
545 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
547 %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
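; Note on the v8i16 lowering above: the bias is produced with pmulhuw, since multiplying the
; all-ones sign mask (psraw $15) by 2^c and keeping the high half is a per-lane logical shift
; right by 16-c, i.e. 2^c - 1 for negative lanes and 0 otherwise. The per-lane arithmetic shift
; is then a pmulhw by 2^(16-c); the shift-by-1 lanes cannot use that trick (2^15 is negative as
; a signed 16-bit multiplier), so they come from psraw $1 via pblendw, and the divide-by-1
; lane 0 is blended back from the original input.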
551 define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
552 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
554 ; SSE2-NEXT: movdqa %xmm0, %xmm3
555 ; SSE2-NEXT: psraw $15, %xmm0
556 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = <u,4,2,16,8,32,64,2>
557 ; SSE2-NEXT: pmulhuw %xmm8, %xmm0
558 ; SSE2-NEXT: paddw %xmm3, %xmm0
559 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
560 ; SSE2-NEXT: movdqa %xmm0, %xmm2
561 ; SSE2-NEXT: pand %xmm4, %xmm2
562 ; SSE2-NEXT: psraw $4, %xmm0
563 ; SSE2-NEXT: movdqa %xmm4, %xmm6
564 ; SSE2-NEXT: pandn %xmm0, %xmm6
565 ; SSE2-NEXT: por %xmm2, %xmm6
566 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
567 ; SSE2-NEXT: movdqa %xmm6, %xmm0
568 ; SSE2-NEXT: pand %xmm5, %xmm0
569 ; SSE2-NEXT: psraw $2, %xmm6
570 ; SSE2-NEXT: movdqa %xmm5, %xmm2
571 ; SSE2-NEXT: pandn %xmm6, %xmm2
572 ; SSE2-NEXT: por %xmm0, %xmm2
573 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0]
574 ; SSE2-NEXT: movdqa %xmm2, %xmm0
575 ; SSE2-NEXT: pand %xmm7, %xmm0
576 ; SSE2-NEXT: psraw $1, %xmm2
577 ; SSE2-NEXT: movdqa %xmm7, %xmm6
578 ; SSE2-NEXT: pandn %xmm2, %xmm6
579 ; SSE2-NEXT: por %xmm0, %xmm6
580 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
581 ; SSE2-NEXT: pand %xmm2, %xmm6
582 ; SSE2-NEXT: movdqa %xmm2, %xmm0
583 ; SSE2-NEXT: pandn %xmm3, %xmm0
584 ; SSE2-NEXT: por %xmm6, %xmm0
585 ; SSE2-NEXT: movdqa %xmm1, %xmm3
586 ; SSE2-NEXT: psraw $15, %xmm3
587 ; SSE2-NEXT: pmulhuw %xmm8, %xmm3
588 ; SSE2-NEXT: paddw %xmm1, %xmm3
589 ; SSE2-NEXT: movdqa %xmm3, %xmm6
590 ; SSE2-NEXT: pand %xmm4, %xmm6
591 ; SSE2-NEXT: psraw $4, %xmm3
592 ; SSE2-NEXT: pandn %xmm3, %xmm4
593 ; SSE2-NEXT: por %xmm6, %xmm4
594 ; SSE2-NEXT: movdqa %xmm4, %xmm3
595 ; SSE2-NEXT: pand %xmm5, %xmm3
596 ; SSE2-NEXT: psraw $2, %xmm4
597 ; SSE2-NEXT: pandn %xmm4, %xmm5
598 ; SSE2-NEXT: por %xmm3, %xmm5
599 ; SSE2-NEXT: movdqa %xmm5, %xmm3
600 ; SSE2-NEXT: pand %xmm7, %xmm3
601 ; SSE2-NEXT: psraw $1, %xmm5
602 ; SSE2-NEXT: pandn %xmm5, %xmm7
603 ; SSE2-NEXT: por %xmm3, %xmm7
604 ; SSE2-NEXT: pand %xmm2, %xmm7
605 ; SSE2-NEXT: pandn %xmm1, %xmm2
606 ; SSE2-NEXT: por %xmm7, %xmm2
607 ; SSE2-NEXT: movdqa %xmm2, %xmm1
610 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
612 ; SSE41-NEXT: movdqa %xmm0, %xmm2
613 ; SSE41-NEXT: psraw $15, %xmm2
614 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
615 ; SSE41-NEXT: pmulhuw %xmm4, %xmm2
616 ; SSE41-NEXT: paddw %xmm0, %xmm2
617 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768>
618 ; SSE41-NEXT: movdqa %xmm2, %xmm3
619 ; SSE41-NEXT: pmulhw %xmm5, %xmm3
620 ; SSE41-NEXT: psraw $1, %xmm2
621 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7]
622 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
623 ; SSE41-NEXT: movdqa %xmm1, %xmm3
624 ; SSE41-NEXT: psraw $15, %xmm3
625 ; SSE41-NEXT: pmulhuw %xmm4, %xmm3
626 ; SSE41-NEXT: paddw %xmm1, %xmm3
627 ; SSE41-NEXT: pmulhw %xmm3, %xmm5
628 ; SSE41-NEXT: psraw $1, %xmm3
629 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7]
630 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
631 ; SSE41-NEXT: movdqa %xmm2, %xmm0
632 ; SSE41-NEXT: movdqa %xmm3, %xmm1
635 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
637 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
638 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
639 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
640 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
641 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
642 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
643 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4
644 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
645 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
646 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4
647 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3
648 ; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3
649 ; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2
650 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
651 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
652 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
653 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
654 ; AVX1-NEXT: # ymm2 = mem[0,1,0,1]
655 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
656 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
657 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
660 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
662 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
663 ; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
664 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
665 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2
666 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm1, %ymm1
667 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
668 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
671 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
673 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
674 ; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
675 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
676 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
677 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
678 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
679 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
682 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
684 ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1
685 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
686 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1
687 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm1, %ymm1
688 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
689 ; AVX512BW-NEXT: retq
691 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
693 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
694 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2
695 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521>
696 ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2
697 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1
698 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535>
699 ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1
700 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4
701 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3
702 ; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3
703 ; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2
704 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
705 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
706 ; XOP-NEXT: # ymm2 = mem[0,1,0,1]
707 ; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
709 %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
713 define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
714 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
716 ; SSE2-NEXT: movdqa %xmm1, %xmm8
717 ; SSE2-NEXT: movdqa %xmm0, %xmm1
718 ; SSE2-NEXT: psraw $15, %xmm0
719 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
720 ; SSE2-NEXT: pmulhuw %xmm9, %xmm0
721 ; SSE2-NEXT: paddw %xmm1, %xmm0
722 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535]
723 ; SSE2-NEXT: movdqa %xmm0, %xmm4
724 ; SSE2-NEXT: pand %xmm11, %xmm4
725 ; SSE2-NEXT: psraw $4, %xmm0
726 ; SSE2-NEXT: movdqa %xmm11, %xmm5
727 ; SSE2-NEXT: pandn %xmm0, %xmm5
728 ; SSE2-NEXT: por %xmm4, %xmm5
729 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
730 ; SSE2-NEXT: movdqa %xmm5, %xmm0
731 ; SSE2-NEXT: pand %xmm7, %xmm0
732 ; SSE2-NEXT: psraw $2, %xmm5
733 ; SSE2-NEXT: movdqa %xmm7, %xmm4
734 ; SSE2-NEXT: pandn %xmm5, %xmm4
735 ; SSE2-NEXT: por %xmm0, %xmm4
736 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0]
737 ; SSE2-NEXT: movdqa %xmm4, %xmm0
738 ; SSE2-NEXT: pand %xmm10, %xmm0
739 ; SSE2-NEXT: psraw $1, %xmm4
740 ; SSE2-NEXT: movdqa %xmm10, %xmm5
741 ; SSE2-NEXT: pandn %xmm4, %xmm5
742 ; SSE2-NEXT: por %xmm0, %xmm5
743 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535]
744 ; SSE2-NEXT: pand %xmm12, %xmm5
745 ; SSE2-NEXT: movdqa %xmm12, %xmm0
746 ; SSE2-NEXT: pandn %xmm1, %xmm0
747 ; SSE2-NEXT: por %xmm5, %xmm0
748 ; SSE2-NEXT: movdqa %xmm8, %xmm1
749 ; SSE2-NEXT: psraw $15, %xmm1
750 ; SSE2-NEXT: pmulhuw %xmm9, %xmm1
751 ; SSE2-NEXT: paddw %xmm8, %xmm1
752 ; SSE2-NEXT: movdqa %xmm1, %xmm5
753 ; SSE2-NEXT: pand %xmm11, %xmm5
754 ; SSE2-NEXT: psraw $4, %xmm1
755 ; SSE2-NEXT: movdqa %xmm11, %xmm6
756 ; SSE2-NEXT: pandn %xmm1, %xmm6
757 ; SSE2-NEXT: por %xmm5, %xmm6
758 ; SSE2-NEXT: movdqa %xmm6, %xmm1
759 ; SSE2-NEXT: pand %xmm7, %xmm1
760 ; SSE2-NEXT: psraw $2, %xmm6
761 ; SSE2-NEXT: movdqa %xmm7, %xmm5
762 ; SSE2-NEXT: pandn %xmm6, %xmm5
763 ; SSE2-NEXT: por %xmm1, %xmm5
764 ; SSE2-NEXT: movdqa %xmm5, %xmm1
765 ; SSE2-NEXT: pand %xmm10, %xmm1
766 ; SSE2-NEXT: psraw $1, %xmm5
767 ; SSE2-NEXT: movdqa %xmm10, %xmm6
768 ; SSE2-NEXT: pandn %xmm5, %xmm6
769 ; SSE2-NEXT: por %xmm1, %xmm6
770 ; SSE2-NEXT: pand %xmm12, %xmm6
771 ; SSE2-NEXT: movdqa %xmm12, %xmm1
772 ; SSE2-NEXT: pandn %xmm8, %xmm1
773 ; SSE2-NEXT: por %xmm6, %xmm1
774 ; SSE2-NEXT: movdqa %xmm2, %xmm5
775 ; SSE2-NEXT: psraw $15, %xmm5
776 ; SSE2-NEXT: pmulhuw %xmm9, %xmm5
777 ; SSE2-NEXT: paddw %xmm2, %xmm5
778 ; SSE2-NEXT: movdqa %xmm5, %xmm6
779 ; SSE2-NEXT: pand %xmm11, %xmm6
780 ; SSE2-NEXT: psraw $4, %xmm5
781 ; SSE2-NEXT: movdqa %xmm11, %xmm4
782 ; SSE2-NEXT: pandn %xmm5, %xmm4
783 ; SSE2-NEXT: por %xmm6, %xmm4
784 ; SSE2-NEXT: movdqa %xmm4, %xmm5
785 ; SSE2-NEXT: pand %xmm7, %xmm5
786 ; SSE2-NEXT: psraw $2, %xmm4
787 ; SSE2-NEXT: movdqa %xmm7, %xmm6
788 ; SSE2-NEXT: pandn %xmm4, %xmm6
789 ; SSE2-NEXT: por %xmm5, %xmm6
790 ; SSE2-NEXT: movdqa %xmm6, %xmm4
791 ; SSE2-NEXT: pand %xmm10, %xmm4
792 ; SSE2-NEXT: psraw $1, %xmm6
793 ; SSE2-NEXT: movdqa %xmm10, %xmm5
794 ; SSE2-NEXT: pandn %xmm6, %xmm5
795 ; SSE2-NEXT: por %xmm4, %xmm5
796 ; SSE2-NEXT: pand %xmm12, %xmm5
797 ; SSE2-NEXT: movdqa %xmm12, %xmm8
798 ; SSE2-NEXT: pandn %xmm2, %xmm8
799 ; SSE2-NEXT: por %xmm5, %xmm8
800 ; SSE2-NEXT: movdqa %xmm3, %xmm2
801 ; SSE2-NEXT: psraw $15, %xmm2
802 ; SSE2-NEXT: pmulhuw %xmm9, %xmm2
803 ; SSE2-NEXT: paddw %xmm3, %xmm2
804 ; SSE2-NEXT: movdqa %xmm2, %xmm4
805 ; SSE2-NEXT: pand %xmm11, %xmm4
806 ; SSE2-NEXT: psraw $4, %xmm2
807 ; SSE2-NEXT: pandn %xmm2, %xmm11
808 ; SSE2-NEXT: por %xmm4, %xmm11
809 ; SSE2-NEXT: movdqa %xmm11, %xmm2
810 ; SSE2-NEXT: pand %xmm7, %xmm2
811 ; SSE2-NEXT: psraw $2, %xmm11
812 ; SSE2-NEXT: pandn %xmm11, %xmm7
813 ; SSE2-NEXT: por %xmm2, %xmm7
814 ; SSE2-NEXT: movdqa %xmm7, %xmm2
815 ; SSE2-NEXT: pand %xmm10, %xmm2
816 ; SSE2-NEXT: psraw $1, %xmm7
817 ; SSE2-NEXT: pandn %xmm7, %xmm10
818 ; SSE2-NEXT: por %xmm2, %xmm10
819 ; SSE2-NEXT: pand %xmm12, %xmm10
820 ; SSE2-NEXT: pandn %xmm3, %xmm12
821 ; SSE2-NEXT: por %xmm10, %xmm12
822 ; SSE2-NEXT: movdqa %xmm8, %xmm2
823 ; SSE2-NEXT: movdqa %xmm12, %xmm3
826 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
828 ; SSE41-NEXT: movdqa %xmm1, %xmm4
829 ; SSE41-NEXT: movdqa %xmm0, %xmm1
830 ; SSE41-NEXT: psraw $15, %xmm0
831 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
832 ; SSE41-NEXT: pmulhuw %xmm7, %xmm0
833 ; SSE41-NEXT: paddw %xmm1, %xmm0
834 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768>
835 ; SSE41-NEXT: movdqa %xmm0, %xmm5
836 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
837 ; SSE41-NEXT: psraw $1, %xmm0
838 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7]
839 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
840 ; SSE41-NEXT: movdqa %xmm4, %xmm1
841 ; SSE41-NEXT: psraw $15, %xmm1
842 ; SSE41-NEXT: pmulhuw %xmm7, %xmm1
843 ; SSE41-NEXT: paddw %xmm4, %xmm1
844 ; SSE41-NEXT: movdqa %xmm1, %xmm5
845 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
846 ; SSE41-NEXT: psraw $1, %xmm1
847 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
848 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
849 ; SSE41-NEXT: movdqa %xmm2, %xmm4
850 ; SSE41-NEXT: psraw $15, %xmm4
851 ; SSE41-NEXT: pmulhuw %xmm7, %xmm4
852 ; SSE41-NEXT: paddw %xmm2, %xmm4
853 ; SSE41-NEXT: movdqa %xmm4, %xmm5
854 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
855 ; SSE41-NEXT: psraw $1, %xmm4
856 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
857 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
858 ; SSE41-NEXT: movdqa %xmm3, %xmm5
859 ; SSE41-NEXT: psraw $15, %xmm5
860 ; SSE41-NEXT: pmulhuw %xmm7, %xmm5
861 ; SSE41-NEXT: paddw %xmm3, %xmm5
862 ; SSE41-NEXT: pmulhw %xmm5, %xmm6
863 ; SSE41-NEXT: psraw $1, %xmm5
864 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
865 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
866 ; SSE41-NEXT: movdqa %xmm4, %xmm2
867 ; SSE41-NEXT: movdqa %xmm5, %xmm3
870 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
872 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
873 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
874 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
875 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3
876 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
877 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
878 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5
879 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
880 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
881 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5
882 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5
883 ; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5
884 ; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6
885 ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5
886 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
887 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
888 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
889 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
890 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
891 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
892 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
893 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
894 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6
895 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6
896 ; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2
897 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm6
898 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
899 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7]
900 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6
901 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4
902 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4
903 ; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
904 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4
905 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
906 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
907 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
908 ; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1
909 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
912 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
914 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2
915 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
916 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
917 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
918 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2
919 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768]
920 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
921 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5
922 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
923 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15]
924 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
925 ; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2
926 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
927 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2
928 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3
929 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
930 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
931 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
934 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
936 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
937 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2
938 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
939 ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
940 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
941 ; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2
942 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
943 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
944 ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
945 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
946 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
947 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
948 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
949 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
950 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
951 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
952 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
953 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
954 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
955 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
958 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
960 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1
961 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
962 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
963 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm1, %zmm1
964 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
965 ; AVX512BW-NEXT: kmovd %eax, %k1
966 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
967 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
968 ; AVX512BW-NEXT: retq
970 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
972 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
973 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3
974 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,65522,65521,65524,65523,65525,65526,65521>
975 ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3
976 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2
977 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535>
978 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
979 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5
980 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5
981 ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5
982 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5
983 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
984 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
985 ; XOP-NEXT: # ymm5 = mem[0,1,0,1]
986 ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0
987 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
988 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6
989 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6
990 ; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2
991 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
992 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm6
993 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4
994 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4
995 ; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3
996 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
997 ; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1
999 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
1003 define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
1004 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1006 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1007 ; SSE2-NEXT: psrad $31, %xmm1
1008 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1009 ; SSE2-NEXT: psrld $28, %xmm2
1010 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1011 ; SSE2-NEXT: psrld $29, %xmm3
1012 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1013 ; SSE2-NEXT: psrld $30, %xmm1
1014 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1015 ; SSE2-NEXT: paddd %xmm0, %xmm1
1016 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1017 ; SSE2-NEXT: psrad $4, %xmm2
1018 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1019 ; SSE2-NEXT: psrad $3, %xmm3
1020 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1021 ; SSE2-NEXT: psrad $2, %xmm1
1022 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1023 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1024 ; SSE2-NEXT: movaps %xmm1, %xmm0
1027 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1029 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1030 ; SSE41-NEXT: psrad $31, %xmm1
1031 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1032 ; SSE41-NEXT: psrld $28, %xmm2
1033 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1034 ; SSE41-NEXT: psrld $30, %xmm3
1035 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1036 ; SSE41-NEXT: psrld $29, %xmm1
1037 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1038 ; SSE41-NEXT: paddd %xmm0, %xmm1
1039 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1040 ; SSE41-NEXT: psrad $4, %xmm2
1041 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1042 ; SSE41-NEXT: psrad $2, %xmm3
1043 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1044 ; SSE41-NEXT: psrad $3, %xmm1
1045 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1046 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1047 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1050 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1052 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
1053 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1054 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
1055 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1056 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
1057 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1058 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1059 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1060 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1061 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1062 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1063 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1064 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1067 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1068 ; AVX2ORLATER: # %bb.0:
1069 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
1070 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
1071 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1072 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
1073 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1074 ; AVX2ORLATER-NEXT: retq
1076 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1078 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
1079 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
1080 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1081 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
1082 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1084 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
1088 define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
1089 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1091 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1092 ; SSE2-NEXT: psrad $31, %xmm0
1093 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1094 ; SSE2-NEXT: psrld $28, %xmm3
1095 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1096 ; SSE2-NEXT: psrld $29, %xmm4
1097 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1098 ; SSE2-NEXT: psrld $30, %xmm0
1099 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1100 ; SSE2-NEXT: paddd %xmm2, %xmm0
1101 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1102 ; SSE2-NEXT: psrad $4, %xmm3
1103 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1104 ; SSE2-NEXT: psrad $3, %xmm4
1105 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1106 ; SSE2-NEXT: psrad $2, %xmm0
1107 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1108 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1109 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1110 ; SSE2-NEXT: psrad $31, %xmm2
1111 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1112 ; SSE2-NEXT: psrld $28, %xmm3
1113 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1114 ; SSE2-NEXT: psrld $29, %xmm4
1115 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1116 ; SSE2-NEXT: psrld $30, %xmm2
1117 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1118 ; SSE2-NEXT: paddd %xmm1, %xmm2
1119 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1120 ; SSE2-NEXT: psrad $4, %xmm3
1121 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1122 ; SSE2-NEXT: psrad $3, %xmm4
1123 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1124 ; SSE2-NEXT: psrad $2, %xmm2
1125 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1126 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1127 ; SSE2-NEXT: movaps %xmm2, %xmm1
1130 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1132 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1133 ; SSE41-NEXT: psrad $31, %xmm0
1134 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1135 ; SSE41-NEXT: psrld $28, %xmm3
1136 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1137 ; SSE41-NEXT: psrld $30, %xmm4
1138 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1139 ; SSE41-NEXT: psrld $29, %xmm0
1140 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1141 ; SSE41-NEXT: paddd %xmm2, %xmm0
1142 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1143 ; SSE41-NEXT: psrad $4, %xmm3
1144 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1145 ; SSE41-NEXT: psrad $2, %xmm4
1146 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1147 ; SSE41-NEXT: psrad $3, %xmm0
1148 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1149 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
1150 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1151 ; SSE41-NEXT: psrad $31, %xmm2
1152 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1153 ; SSE41-NEXT: psrld $28, %xmm3
1154 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1155 ; SSE41-NEXT: psrld $30, %xmm4
1156 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1157 ; SSE41-NEXT: psrld $29, %xmm2
1158 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1159 ; SSE41-NEXT: paddd %xmm1, %xmm2
1160 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1161 ; SSE41-NEXT: psrad $4, %xmm3
1162 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1163 ; SSE41-NEXT: psrad $2, %xmm4
1164 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1165 ; SSE41-NEXT: psrad $3, %xmm2
1166 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1167 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
1168 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1171 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1173 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1174 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
1175 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1176 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1177 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1178 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1179 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1180 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1181 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1182 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1183 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1184 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1185 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1186 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
1187 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1188 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1189 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1190 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1191 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1192 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
1193 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1194 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1195 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1196 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1197 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1198 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1199 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1202 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1203 ; AVX2ORLATER: # %bb.0:
1204 ; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1
1205 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
1206 ; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1
1207 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
1208 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1209 ; AVX2ORLATER-NEXT: retq
1211 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1213 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
1214 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2
1215 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268>
1216 ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2
1217 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1218 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292>
1219 ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1
1220 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4
1221 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3
1222 ; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1223 ; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2
1224 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1225 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1227 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1231 define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
1232 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1234 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1235 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1236 ; SSE2-NEXT: psrad $31, %xmm0
1237 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1238 ; SSE2-NEXT: psrld $28, %xmm5
1239 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1240 ; SSE2-NEXT: psrld $29, %xmm6
1241 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1242 ; SSE2-NEXT: psrld $30, %xmm0
1243 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1244 ; SSE2-NEXT: paddd %xmm1, %xmm0
1245 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1246 ; SSE2-NEXT: psrad $4, %xmm5
1247 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1248 ; SSE2-NEXT: psrad $3, %xmm6
1249 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1250 ; SSE2-NEXT: psrad $2, %xmm0
1251 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1252 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1253 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1254 ; SSE2-NEXT: psrad $31, %xmm1
1255 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1256 ; SSE2-NEXT: psrld $28, %xmm5
1257 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1258 ; SSE2-NEXT: psrld $29, %xmm6
1259 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1260 ; SSE2-NEXT: psrld $30, %xmm1
1261 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1262 ; SSE2-NEXT: paddd %xmm4, %xmm1
1263 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1264 ; SSE2-NEXT: psrad $4, %xmm5
1265 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1266 ; SSE2-NEXT: psrad $3, %xmm6
1267 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1268 ; SSE2-NEXT: psrad $2, %xmm1
1269 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1270 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
1271 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1272 ; SSE2-NEXT: psrad $31, %xmm4
1273 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1274 ; SSE2-NEXT: psrld $28, %xmm5
1275 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1276 ; SSE2-NEXT: psrld $29, %xmm6
1277 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1278 ; SSE2-NEXT: psrld $30, %xmm4
1279 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1280 ; SSE2-NEXT: paddd %xmm2, %xmm4
1281 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1282 ; SSE2-NEXT: psrad $4, %xmm5
1283 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1284 ; SSE2-NEXT: psrad $3, %xmm6
1285 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1286 ; SSE2-NEXT: psrad $2, %xmm4
1287 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1288 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1289 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1290 ; SSE2-NEXT: psrad $31, %xmm5
1291 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1292 ; SSE2-NEXT: psrld $28, %xmm2
1293 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1294 ; SSE2-NEXT: psrld $29, %xmm6
1295 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1296 ; SSE2-NEXT: psrld $30, %xmm5
1297 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1298 ; SSE2-NEXT: paddd %xmm3, %xmm5
1299 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1300 ; SSE2-NEXT: psrad $4, %xmm2
1301 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1302 ; SSE2-NEXT: psrad $3, %xmm6
1303 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1304 ; SSE2-NEXT: psrad $2, %xmm5
1305 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1306 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
1307 ; SSE2-NEXT: movaps %xmm4, %xmm2
1308 ; SSE2-NEXT: movaps %xmm5, %xmm3
1311 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1313 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1314 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1315 ; SSE41-NEXT: psrad $31, %xmm0
1316 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1317 ; SSE41-NEXT: psrld $28, %xmm5
1318 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1319 ; SSE41-NEXT: psrld $30, %xmm6
1320 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1321 ; SSE41-NEXT: psrld $29, %xmm0
1322 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1323 ; SSE41-NEXT: paddd %xmm1, %xmm0
1324 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1325 ; SSE41-NEXT: psrad $4, %xmm5
1326 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1327 ; SSE41-NEXT: psrad $2, %xmm6
1328 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1329 ; SSE41-NEXT: psrad $3, %xmm0
1330 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1331 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1332 ; SSE41-NEXT: movdqa %xmm4, %xmm1
1333 ; SSE41-NEXT: psrad $31, %xmm1
1334 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1335 ; SSE41-NEXT: psrld $28, %xmm5
1336 ; SSE41-NEXT: movdqa %xmm1, %xmm6
1337 ; SSE41-NEXT: psrld $30, %xmm6
1338 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1339 ; SSE41-NEXT: psrld $29, %xmm1
1340 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1341 ; SSE41-NEXT: paddd %xmm4, %xmm1
1342 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1343 ; SSE41-NEXT: psrad $4, %xmm5
1344 ; SSE41-NEXT: movdqa %xmm1, %xmm6
1345 ; SSE41-NEXT: psrad $2, %xmm6
1346 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1347 ; SSE41-NEXT: psrad $3, %xmm1
1348 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1349 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
1350 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1351 ; SSE41-NEXT: psrad $31, %xmm4
1352 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1353 ; SSE41-NEXT: psrld $28, %xmm5
1354 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1355 ; SSE41-NEXT: psrld $30, %xmm6
1356 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1357 ; SSE41-NEXT: psrld $29, %xmm4
1358 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1359 ; SSE41-NEXT: paddd %xmm2, %xmm4
1360 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1361 ; SSE41-NEXT: psrad $4, %xmm5
1362 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1363 ; SSE41-NEXT: psrad $2, %xmm6
1364 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1365 ; SSE41-NEXT: psrad $3, %xmm4
1366 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1367 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1368 ; SSE41-NEXT: movdqa %xmm3, %xmm5
1369 ; SSE41-NEXT: psrad $31, %xmm5
1370 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1371 ; SSE41-NEXT: psrld $28, %xmm2
1372 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1373 ; SSE41-NEXT: psrld $30, %xmm6
1374 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1375 ; SSE41-NEXT: psrld $29, %xmm5
1376 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1377 ; SSE41-NEXT: paddd %xmm3, %xmm5
1378 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1379 ; SSE41-NEXT: psrad $4, %xmm2
1380 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1381 ; SSE41-NEXT: psrad $2, %xmm6
1382 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1383 ; SSE41-NEXT: psrad $3, %xmm5
1384 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1385 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
1386 ; SSE41-NEXT: movdqa %xmm4, %xmm2
1387 ; SSE41-NEXT: movdqa %xmm5, %xmm3
1390 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1392 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1393 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1394 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1395 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1396 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1397 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1398 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1399 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1400 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1401 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1402 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1403 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1404 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1405 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
1406 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1407 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1408 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1409 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1410 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1411 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1412 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1413 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1414 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1415 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1416 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1417 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1418 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1419 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1420 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1421 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1422 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1423 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1424 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1425 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1426 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1427 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1428 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1429 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1430 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1431 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1432 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
1433 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1434 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1435 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1436 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1437 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1438 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
1439 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1440 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1441 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1442 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1443 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1444 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1445 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1448 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1450 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
1451 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28]
1452 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1453 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1454 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
1455 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
1456 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1457 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1458 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1459 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
1460 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1461 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2
1462 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1463 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1466 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1468 ; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1
1469 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1470 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1471 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
1472 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
1473 ; AVX512F-NEXT: kmovw %eax, %k1
1474 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1475 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1476 ; AVX512F-NEXT: retq
1478 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1479 ; AVX512BW: # %bb.0:
1480 ; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1
1481 ; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1482 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1483 ; AVX512BW-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
1484 ; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111
1485 ; AVX512BW-NEXT: kmovd %eax, %k1
1486 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1487 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1488 ; AVX512BW-NEXT: retq
1490 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1492 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1493 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3
1494 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268>
1495 ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3
1496 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1497 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292>
1498 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1499 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5
1500 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1501 ; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5
1502 ; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5
1503 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
1504 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1505 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1506 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm5
1507 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1508 ; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1509 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1510 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm5
1511 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4
1512 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4
1513 ; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3
1514 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1515 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1517 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
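; Power-of-2 (non-splat) divisors with 64-bit elements.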
1521 define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
1522 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1524 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1525 ; SSE2-NEXT: psrad $31, %xmm1
1526 ; SSE2-NEXT: psrlq $62, %xmm1
1527 ; SSE2-NEXT: paddq %xmm0, %xmm1
1528 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1529 ; SSE2-NEXT: psrad $2, %xmm2
1530 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
1531 ; SSE2-NEXT: psrlq $2, %xmm1
1532 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1533 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1534 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1537 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1539 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1540 ; SSE41-NEXT: psrad $31, %xmm1
1541 ; SSE41-NEXT: psrlq $62, %xmm1
1542 ; SSE41-NEXT: paddq %xmm0, %xmm1
1543 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1544 ; SSE41-NEXT: psrad $2, %xmm2
1545 ; SSE41-NEXT: psrlq $2, %xmm1
1546 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1547 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1548 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1551 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1553 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1554 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1555 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1
1556 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1557 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm2
1558 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
1559 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1560 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1563 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1565 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1566 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1567 ; AVX2-NEXT: vpsrlq $62, %xmm1, %xmm1
1568 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1569 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm2
1570 ; AVX2-NEXT: vpsrlq $2, %xmm1, %xmm1
1571 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
1572 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1575 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1577 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1578 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1579 ; AVX512F-NEXT: vpsrlq $62, %xmm1, %xmm1
1580 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1581 ; AVX512F-NEXT: vpsraq $2, %zmm1, %zmm1
1582 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1583 ; AVX512F-NEXT: vzeroupper
1584 ; AVX512F-NEXT: retq
1586 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1587 ; AVX512BW: # %bb.0:
1588 ; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1
1589 ; AVX512BW-NEXT: vpsrlq $62, %xmm1, %xmm1
1590 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1591 ; AVX512BW-NEXT: vpsraq $2, %xmm1, %xmm1
1592 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1593 ; AVX512BW-NEXT: retq
1595 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1597 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm1
1598 ; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1
1599 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1600 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm1, %xmm1
1601 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1603 %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
1607 define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
1608 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1610 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1611 ; SSE2-NEXT: psrad $31, %xmm2
1612 ; SSE2-NEXT: psrlq $62, %xmm2
1613 ; SSE2-NEXT: paddq %xmm0, %xmm2
1614 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1615 ; SSE2-NEXT: psrad $2, %xmm3
1616 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
1617 ; SSE2-NEXT: psrlq $2, %xmm2
1618 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1619 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1620 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1621 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1622 ; SSE2-NEXT: psrad $31, %xmm2
1623 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1624 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1625 ; SSE2-NEXT: psrlq $61, %xmm3
1626 ; SSE2-NEXT: psrlq $60, %xmm2
1627 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
1628 ; SSE2-NEXT: paddq %xmm1, %xmm2
1629 ; SSE2-NEXT: movdqa %xmm2, %xmm1
1630 ; SSE2-NEXT: psrlq $3, %xmm1
1631 ; SSE2-NEXT: psrlq $4, %xmm2
1632 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1633 ; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1634 ; SSE2-NEXT: xorpd %xmm1, %xmm2
1635 ; SSE2-NEXT: psubq %xmm1, %xmm2
1636 ; SSE2-NEXT: movdqa %xmm2, %xmm1
1639 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1641 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1642 ; SSE41-NEXT: psrad $31, %xmm0
1643 ; SSE41-NEXT: psrlq $62, %xmm0
1644 ; SSE41-NEXT: paddq %xmm2, %xmm0
1645 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1646 ; SSE41-NEXT: psrad $2, %xmm3
1647 ; SSE41-NEXT: psrlq $2, %xmm0
1648 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1649 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
1650 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1651 ; SSE41-NEXT: psrad $31, %xmm2
1652 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1653 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1654 ; SSE41-NEXT: psrlq $60, %xmm3
1655 ; SSE41-NEXT: psrlq $61, %xmm2
1656 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1657 ; SSE41-NEXT: paddq %xmm1, %xmm2
1658 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1659 ; SSE41-NEXT: psrlq $4, %xmm1
1660 ; SSE41-NEXT: psrlq $3, %xmm2
1661 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1662 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1663 ; SSE41-NEXT: pxor %xmm1, %xmm2
1664 ; SSE41-NEXT: psubq %xmm1, %xmm2
1665 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1668 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1670 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1671 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1672 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
1673 ; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4
1674 ; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3
1675 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1676 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
1677 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
1678 ; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1
1679 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1680 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
1681 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
1682 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
1683 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
1684 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1685 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1686 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm3
1687 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1688 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1689 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1690 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1693 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1695 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1696 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
1697 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1698 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1699 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1700 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1701 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
1702 ; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1
1703 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1706 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1708 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1709 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,2,3,4>
1710 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2
1711 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %ymm2, %ymm2
1712 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
1713 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
1714 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1715 ; AVX512F-NEXT: retq
1717 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1718 ; AVX512BW: # %bb.0:
1719 ; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1
1720 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1721 ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1722 ; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %ymm1, %ymm1
1723 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1724 ; AVX512BW-NEXT: retq
1726 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1728 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
1729 ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2
1730 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2
1731 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1732 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm2, %xmm2
1733 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
1734 ; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1
1735 ; XOP-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
1736 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
1737 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm1, %xmm1
1738 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1739 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1741 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
1745 define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
1746 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1748 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1749 ; SSE2-NEXT: psrad $31, %xmm4
1750 ; SSE2-NEXT: psrlq $62, %xmm4
1751 ; SSE2-NEXT: paddq %xmm0, %xmm4
1752 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1753 ; SSE2-NEXT: psrad $2, %xmm5
1754 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1755 ; SSE2-NEXT: psrlq $2, %xmm4
1756 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1757 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1758 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
1759 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1760 ; SSE2-NEXT: psrad $31, %xmm4
1761 ; SSE2-NEXT: psrlq $62, %xmm4
1762 ; SSE2-NEXT: paddq %xmm2, %xmm4
1763 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1764 ; SSE2-NEXT: psrad $2, %xmm5
1765 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1766 ; SSE2-NEXT: psrlq $2, %xmm4
1767 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1768 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1769 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
1770 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1771 ; SSE2-NEXT: psrad $31, %xmm4
1772 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1773 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1774 ; SSE2-NEXT: psrlq $61, %xmm5
1775 ; SSE2-NEXT: psrlq $60, %xmm4
1776 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
1777 ; SSE2-NEXT: paddq %xmm1, %xmm4
1778 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1779 ; SSE2-NEXT: psrlq $3, %xmm1
1780 ; SSE2-NEXT: psrlq $4, %xmm4
1781 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
1782 ; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1783 ; SSE2-NEXT: xorpd %xmm1, %xmm4
1784 ; SSE2-NEXT: psubq %xmm1, %xmm4
1785 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1786 ; SSE2-NEXT: psrad $31, %xmm5
1787 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1788 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1789 ; SSE2-NEXT: psrlq $61, %xmm6
1790 ; SSE2-NEXT: psrlq $60, %xmm5
1791 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
1792 ; SSE2-NEXT: paddq %xmm3, %xmm5
1793 ; SSE2-NEXT: movdqa %xmm5, %xmm3
1794 ; SSE2-NEXT: psrlq $3, %xmm3
1795 ; SSE2-NEXT: psrlq $4, %xmm5
1796 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1797 ; SSE2-NEXT: xorpd %xmm1, %xmm5
1798 ; SSE2-NEXT: psubq %xmm1, %xmm5
1799 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1800 ; SSE2-NEXT: movdqa %xmm5, %xmm3
1803 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1805 ; SSE41-NEXT: movdqa %xmm2, %xmm5
1806 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1807 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1808 ; SSE41-NEXT: psrad $31, %xmm0
1809 ; SSE41-NEXT: psrlq $62, %xmm0
1810 ; SSE41-NEXT: paddq %xmm1, %xmm0
1811 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1812 ; SSE41-NEXT: psrad $2, %xmm2
1813 ; SSE41-NEXT: psrlq $2, %xmm0
1814 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1815 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1816 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1817 ; SSE41-NEXT: psrad $31, %xmm2
1818 ; SSE41-NEXT: psrlq $62, %xmm2
1819 ; SSE41-NEXT: paddq %xmm5, %xmm2
1820 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1821 ; SSE41-NEXT: psrad $2, %xmm1
1822 ; SSE41-NEXT: psrlq $2, %xmm2
1823 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1824 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
1825 ; SSE41-NEXT: movdqa %xmm4, %xmm1
1826 ; SSE41-NEXT: psrad $31, %xmm1
1827 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1828 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1829 ; SSE41-NEXT: psrlq $60, %xmm5
1830 ; SSE41-NEXT: psrlq $61, %xmm1
1831 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
1832 ; SSE41-NEXT: paddq %xmm4, %xmm1
1833 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1834 ; SSE41-NEXT: psrlq $4, %xmm4
1835 ; SSE41-NEXT: psrlq $3, %xmm1
1836 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1837 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
1838 ; SSE41-NEXT: pxor %xmm5, %xmm1
1839 ; SSE41-NEXT: psubq %xmm5, %xmm1
1840 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1841 ; SSE41-NEXT: psrad $31, %xmm4
1842 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1843 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1844 ; SSE41-NEXT: psrlq $60, %xmm6
1845 ; SSE41-NEXT: psrlq $61, %xmm4
1846 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
1847 ; SSE41-NEXT: paddq %xmm3, %xmm4
1848 ; SSE41-NEXT: movdqa %xmm4, %xmm3
1849 ; SSE41-NEXT: psrlq $4, %xmm3
1850 ; SSE41-NEXT: psrlq $3, %xmm4
1851 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1852 ; SSE41-NEXT: pxor %xmm5, %xmm4
1853 ; SSE41-NEXT: psubq %xmm5, %xmm4
1854 ; SSE41-NEXT: movdqa %xmm4, %xmm3
1857 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1859 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1860 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1861 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
1862 ; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5
1863 ; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4
1864 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1865 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
1866 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4
1867 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1868 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1869 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1870 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1871 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1872 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
1873 ; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5
1874 ; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5
1875 ; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6
1876 ; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5
1877 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1878 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1879 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1880 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1881 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5
1882 ; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6
1883 ; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5
1884 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1885 ; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
1886 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5
1887 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1888 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1889 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1890 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1891 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
1892 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1893 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2
1894 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1895 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1896 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1897 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1898 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1901 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1903 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1904 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1905 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,62,61,60>
1906 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3
1907 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3
1908 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,2,3,4>
1909 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3
1910 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1911 ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3
1912 ; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3
1913 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1914 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
1915 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2
1916 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2
1917 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2
1918 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
1919 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1920 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1923 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1925 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1926 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1927 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1928 ; AVX512F-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1
1929 ; AVX512F-NEXT: movb $17, %al
1930 ; AVX512F-NEXT: kmovw %eax, %k1
1931 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1932 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1933 ; AVX512F-NEXT: retq
1935 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1936 ; AVX512BW: # %bb.0:
1937 ; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1
1938 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1939 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1940 ; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1
1941 ; AVX512BW-NEXT: movb $17, %al
1942 ; AVX512BW-NEXT: kmovd %eax, %k1
1943 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1944 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1945 ; AVX512BW-NEXT: retq
1947 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1949 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1950 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
1951 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4
1952 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556]
1953 ; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4
1954 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1955 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
1956 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1957 ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6
1958 ; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6
1959 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6
1960 ; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = <u,18446744073709551614>
1961 ; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6
1962 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
1963 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
1964 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1965 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6
1966 ; XOP-NEXT: vpshlq %xmm5, %xmm6, %xmm5
1967 ; XOP-NEXT: vpaddq %xmm5, %xmm2, %xmm2
1968 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1969 ; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3
1970 ; XOP-NEXT: vpsrlq $62, %xmm3, %xmm3
1971 ; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3
1972 ; XOP-NEXT: vpshaq %xmm7, %xmm3, %xmm3
1973 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1974 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1976 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
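; Mixed positive and negative power-of-2 divisors: shift, then negate the lanes with negative divisors.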
1980 define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
1981 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
1983 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1984 ; SSE2-NEXT: psrad $31, %xmm0
1985 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1986 ; SSE2-NEXT: psrld $28, %xmm2
1987 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1988 ; SSE2-NEXT: psrld $29, %xmm3
1989 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1990 ; SSE2-NEXT: psrld $30, %xmm0
1991 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
1992 ; SSE2-NEXT: paddd %xmm1, %xmm0
1993 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1994 ; SSE2-NEXT: psrad $4, %xmm2
1995 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1996 ; SSE2-NEXT: psrad $3, %xmm3
1997 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1998 ; SSE2-NEXT: psrad $2, %xmm0
1999 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
2000 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2001 ; SSE2-NEXT: pxor %xmm1, %xmm1
2002 ; SSE2-NEXT: psubd %xmm0, %xmm1
2003 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
2004 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
2005 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2008 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2010 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2011 ; SSE41-NEXT: psrad $31, %xmm1
2012 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2013 ; SSE41-NEXT: psrld $28, %xmm2
2014 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2015 ; SSE41-NEXT: psrld $30, %xmm3
2016 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2017 ; SSE41-NEXT: psrld $29, %xmm1
2018 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2019 ; SSE41-NEXT: paddd %xmm0, %xmm1
2020 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2021 ; SSE41-NEXT: psrad $4, %xmm2
2022 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2023 ; SSE41-NEXT: psrad $2, %xmm3
2024 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2025 ; SSE41-NEXT: psrad $3, %xmm1
2026 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2027 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2028 ; SSE41-NEXT: pxor %xmm0, %xmm0
2029 ; SSE41-NEXT: psubd %xmm1, %xmm0
2030 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
2031 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2034 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2036 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
2037 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2038 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
2039 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2040 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
2041 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2042 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2043 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
2044 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
2045 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2046 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
2047 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2048 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2049 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2050 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2051 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
2054 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2055 ; AVX2ORLATER: # %bb.0:
2056 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
2057 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2058 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2059 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
2060 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2061 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2062 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2063 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
2064 ; AVX2ORLATER-NEXT: retq
2066 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2068 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
2069 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2070 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2071 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
2072 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2073 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2074 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2075 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
2077 %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
2081 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
2082 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
2085 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
2089 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
2090 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
2093 %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
2097 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
2098 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
2101 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
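; Non-splat divisors containing only +1/-1: select between x and (0 - x) per lane.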
2106 define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
2107 ; SSE2-LABEL: non_splat_minus_one_divisor_0:
2109 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2110 ; SSE2-NEXT: pxor %xmm1, %xmm0
2111 ; SSE2-NEXT: psubb %xmm0, %xmm1
2112 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2115 ; SSE41-LABEL: non_splat_minus_one_divisor_0:
2117 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2118 ; SSE41-NEXT: pxor %xmm2, %xmm2
2119 ; SSE41-NEXT: psubb %xmm0, %xmm2
2120 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2121 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
2122 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2125 ; AVX1-LABEL: non_splat_minus_one_divisor_0:
2127 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2128 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2129 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2130 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2133 ; AVX2-LABEL: non_splat_minus_one_divisor_0:
2135 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2136 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2137 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2138 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2141 ; AVX512F-LABEL: non_splat_minus_one_divisor_0:
2143 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2144 ; AVX512F-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2145 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2146 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2147 ; AVX512F-NEXT: retq
2149 ; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
2150 ; AVX512BW: # %bb.0:
2151 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2152 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2153 ; AVX512BW-NEXT: kmovd %eax, %k1
2154 ; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1}
2155 ; AVX512BW-NEXT: retq
2157 ; XOP-LABEL: non_splat_minus_one_divisor_0:
2159 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2160 ; XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2161 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2162 ; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2164 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2168 define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
2169 ; SSE2-LABEL: non_splat_minus_one_divisor_1:
2171 ; SSE2-NEXT: pxor %xmm1, %xmm1
2172 ; SSE2-NEXT: pxor %xmm2, %xmm2
2173 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
2174 ; SSE2-NEXT: movdqa %xmm2, %xmm3
2175 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2176 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
2177 ; SSE2-NEXT: psrlw $8, %xmm3
2178 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2179 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
2180 ; SSE2-NEXT: psrlw $8, %xmm2
2181 ; SSE2-NEXT: packuswb %xmm3, %xmm2
2182 ; SSE2-NEXT: paddb %xmm0, %xmm2
2183 ; SSE2-NEXT: movdqa %xmm2, %xmm1
2184 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2185 ; SSE2-NEXT: psraw $8, %xmm1
2186 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
2187 ; SSE2-NEXT: psrlw $8, %xmm1
2188 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2189 ; SSE2-NEXT: psraw $8, %xmm2
2190 ; SSE2-NEXT: psllw $7, %xmm2
2191 ; SSE2-NEXT: psrlw $8, %xmm2
2192 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2193 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2194 ; SSE2-NEXT: pand %xmm1, %xmm2
2195 ; SSE2-NEXT: pandn %xmm0, %xmm1
2196 ; SSE2-NEXT: por %xmm2, %xmm1
2197 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2198 ; SSE2-NEXT: pxor %xmm0, %xmm1
2199 ; SSE2-NEXT: psubb %xmm0, %xmm1
2200 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2203 ; SSE41-LABEL: non_splat_minus_one_divisor_1:
2205 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2206 ; SSE41-NEXT: pxor %xmm2, %xmm2
2207 ; SSE41-NEXT: pxor %xmm0, %xmm0
2208 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm0
2209 ; SSE41-NEXT: pxor %xmm4, %xmm4
2210 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2211 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2212 ; SSE41-NEXT: psllw $1, %xmm3
2213 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4,5],xmm3[6],xmm4[7]
2214 ; SSE41-NEXT: psrlw $8, %xmm3
2215 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2216 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
2217 ; SSE41-NEXT: psrlw $8, %xmm0
2218 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2219 ; SSE41-NEXT: paddb %xmm1, %xmm3
2220 ; SSE41-NEXT: movdqa %xmm3, %xmm0
2221 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2222 ; SSE41-NEXT: psraw $8, %xmm0
2223 ; SSE41-NEXT: movdqa %xmm0, %xmm4
2224 ; SSE41-NEXT: psllw $1, %xmm4
2225 ; SSE41-NEXT: psllw $7, %xmm0
2226 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5],xmm0[6],xmm4[7]
2227 ; SSE41-NEXT: psrlw $8, %xmm0
2228 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2229 ; SSE41-NEXT: psraw $8, %xmm3
2230 ; SSE41-NEXT: psllw $7, %xmm3
2231 ; SSE41-NEXT: psrlw $8, %xmm3
2232 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2233 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2234 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
2235 ; SSE41-NEXT: psubb %xmm1, %xmm2
2236 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2237 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
2238 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2241 ; AVX1-LABEL: non_splat_minus_one_divisor_1:
2243 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2244 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2245 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2246 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2247 ; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
2248 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2249 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2250 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2251 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2252 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2253 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2254 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2255 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2256 ; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
2257 ; AVX1-NEXT: vpsllw $1, %xmm3, %xmm4
2258 ; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
2259 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5],xmm3[6],xmm4[7]
2260 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2261 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2262 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
2263 ; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
2264 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2265 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2266 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2267 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2268 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2269 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2270 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2273 ; AVX2-LABEL: non_splat_minus_one_divisor_1:
2275 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2276 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2277 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2278 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2279 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2280 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2281 ; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2282 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2283 ; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
2284 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2285 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2286 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2287 ; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2288 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2289 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2290 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2291 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2292 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2293 ; AVX2-NEXT: vzeroupper
2296 ; AVX512F-LABEL: non_splat_minus_one_divisor_1:
2298 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2299 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2300 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2301 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2302 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
2303 ; AVX512F-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2304 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
2305 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm2, %zmm2
2306 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
2307 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2308 ; AVX512F-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2309 ; AVX512F-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2310 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2311 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2312 ; AVX512F-NEXT: vzeroupper
2313 ; AVX512F-NEXT: retq
2315 ; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
2316 ; AVX512BW: # %bb.0:
2317 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2318 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2319 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2320 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2321 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2322 ; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2323 ; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2
2324 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm2, %ymm2
2325 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2326 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2327 ; AVX512BW-NEXT: kmovd %eax, %k1
2328 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
2329 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0
2330 ; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44
2331 ; AVX512BW-NEXT: kmovd %eax, %k1
2332 ; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1}
2333 ; AVX512BW-NEXT: vzeroupper
2334 ; AVX512BW-NEXT: retq
2336 ; XOP-LABEL: non_splat_minus_one_divisor_1:
2338 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2339 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2340 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm2, %xmm2
2341 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2342 ; XOP-NEXT: vpshab {{.*}}(%rip), %xmm2, %xmm2
2343 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2344 ; XOP-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2345 ; XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2346 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2347 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2349 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
2353 define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
2354 ; SSE2-LABEL: non_splat_minus_one_divisor_2:
2356 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2357 ; SSE2-NEXT: psrld $31, %xmm1
2358 ; SSE2-NEXT: paddd %xmm0, %xmm1
2359 ; SSE2-NEXT: psrad $1, %xmm1
2360 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2361 ; SSE2-NEXT: pxor %xmm0, %xmm0
2362 ; SSE2-NEXT: psubd %xmm1, %xmm0
2363 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2364 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2367 ; SSE41-LABEL: non_splat_minus_one_divisor_2:
2369 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2370 ; SSE41-NEXT: psrld $31, %xmm1
2371 ; SSE41-NEXT: paddd %xmm0, %xmm1
2372 ; SSE41-NEXT: psrad $1, %xmm1
2373 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2374 ; SSE41-NEXT: pxor %xmm0, %xmm0
2375 ; SSE41-NEXT: psubd %xmm1, %xmm0
2376 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
2377 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2380 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
2382 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
2383 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2384 ; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
2385 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2386 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2387 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2388 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2391 ; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
2392 ; AVX2ORLATER: # %bb.0:
2393 ; AVX2ORLATER-NEXT: vpsrld $31, %xmm0, %xmm1
2394 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2395 ; AVX2ORLATER-NEXT: vpsrad $1, %xmm1, %xmm1
2396 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2397 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2398 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2399 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
2400 ; AVX2ORLATER-NEXT: retq
2402 ; XOP-LABEL: non_splat_minus_one_divisor_2:
2404 ; XOP-NEXT: vpsrld $31, %xmm0, %xmm1
2405 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2406 ; XOP-NEXT: vpsrad $1, %xmm1, %xmm1
2407 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2408 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2409 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2410 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2412 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
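; Non-uniform, non-power-of-2 constant divisors: lowered via multiply-high (pmulhw) plus shifts.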
2416 define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) {
2417 ; SSE-LABEL: combine_vec_sdiv_nonuniform:
2419 ; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
2420 ; SSE-NEXT: movdqa %xmm0, %xmm1
2421 ; SSE-NEXT: psrlw $15, %xmm1
2422 ; SSE-NEXT: paddw %xmm0, %xmm1
2423 ; SSE-NEXT: movdqa %xmm1, %xmm0
2426 ; AVX-LABEL: combine_vec_sdiv_nonuniform:
2428 ; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2429 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
2430 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2432 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22>
2436 define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
2437 ; SSE2-LABEL: combine_vec_sdiv_nonuniform2:
2439 ; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
2440 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2441 ; SSE2-NEXT: psraw $2, %xmm1
2442 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2443 ; SSE2-NEXT: psraw $1, %xmm2
2444 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
2445 ; SSE2-NEXT: psrlw $15, %xmm0
2446 ; SSE2-NEXT: paddw %xmm2, %xmm0
2449 ; SSE41-LABEL: combine_vec_sdiv_nonuniform2:
2451 ; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
2452 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2453 ; SSE41-NEXT: psraw $1, %xmm1
2454 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2455 ; SSE41-NEXT: psraw $2, %xmm2
2456 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2457 ; SSE41-NEXT: psrlw $15, %xmm0
2458 ; SSE41-NEXT: paddw %xmm2, %xmm0
2461 ; AVX1-LABEL: combine_vec_sdiv_nonuniform2:
2463 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2464 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
2465 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
2466 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2467 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2468 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2471 ; AVX2-LABEL: combine_vec_sdiv_nonuniform2:
2473 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2474 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1
2475 ; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2
2476 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2477 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2478 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2481 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform2:
2483 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2484 ; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1
2485 ; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2
2486 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2487 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2488 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2489 ; AVX512F-NEXT: retq
2491 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2:
2492 ; AVX512BW: # %bb.0:
2493 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2494 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2495 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2496 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2497 ; AVX512BW-NEXT: retq
2499 ; XOP-LABEL: combine_vec_sdiv_nonuniform2:
2501 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2502 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2503 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2504 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2506 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25>
2510 define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
2511 ; SSE2-LABEL: combine_vec_sdiv_nonuniform3:
2513 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2514 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2515 ; SSE2-NEXT: paddw %xmm0, %xmm1
2516 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2517 ; SSE2-NEXT: psraw $4, %xmm0
2518 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2519 ; SSE2-NEXT: psraw $8, %xmm2
2520 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2521 ; SSE2-NEXT: psrlw $15, %xmm1
2522 ; SSE2-NEXT: paddw %xmm2, %xmm1
2523 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2526 ; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
2528 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2529 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2530 ; SSE41-NEXT: paddw %xmm0, %xmm1
2531 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2532 ; SSE41-NEXT: psraw $8, %xmm0
2533 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2534 ; SSE41-NEXT: psraw $4, %xmm2
2535 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2536 ; SSE41-NEXT: psrlw $15, %xmm1
2537 ; SSE41-NEXT: paddw %xmm2, %xmm1
2538 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2541 ; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
2543 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2544 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2545 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2546 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2547 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2548 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2549 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2552 ; AVX2-LABEL: combine_vec_sdiv_nonuniform3:
2554 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2555 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2556 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2557 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2558 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2559 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2560 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2563 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform3:
2565 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2566 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2567 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2568 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2569 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2570 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2571 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2572 ; AVX512F-NEXT: retq
2574 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3:
2575 ; AVX512BW: # %bb.0:
2576 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2577 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2578 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2579 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2580 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2581 ; AVX512BW-NEXT: retq
2583 ; XOP-LABEL: combine_vec_sdiv_nonuniform3:
2585 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2586 ; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2587 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2588 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2589 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2591 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511>
2595 define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
2596 ; SSE2-LABEL: combine_vec_sdiv_nonuniform4:
2598 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2599 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2600 ; SSE2-NEXT: psubw %xmm0, %xmm1
2601 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2602 ; SSE2-NEXT: psraw $4, %xmm0
2603 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2604 ; SSE2-NEXT: psraw $8, %xmm2
2605 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2606 ; SSE2-NEXT: psrlw $15, %xmm1
2607 ; SSE2-NEXT: paddw %xmm2, %xmm1
2608 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2611 ; SSE41-LABEL: combine_vec_sdiv_nonuniform4:
2613 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2614 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2615 ; SSE41-NEXT: psubw %xmm0, %xmm1
2616 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2617 ; SSE41-NEXT: psraw $8, %xmm0
2618 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2619 ; SSE41-NEXT: psraw $4, %xmm2
2620 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2621 ; SSE41-NEXT: psrlw $15, %xmm1
2622 ; SSE41-NEXT: paddw %xmm2, %xmm1
2623 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2626 ; AVX1-LABEL: combine_vec_sdiv_nonuniform4:
2628 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2629 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2630 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2631 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2632 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2633 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2634 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2637 ; AVX2-LABEL: combine_vec_sdiv_nonuniform4:
2639 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2640 ; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2641 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2642 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2643 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2644 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2645 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2648 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform4:
2650 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2651 ; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2652 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2653 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2654 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2655 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2656 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2657 ; AVX512F-NEXT: retq
2659 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4:
2660 ; AVX512BW: # %bb.0:
2661 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2662 ; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2663 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2664 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2665 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2666 ; AVX512BW-NEXT: retq
2668 ; XOP-LABEL: combine_vec_sdiv_nonuniform4:
2670 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2671 ; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2672 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2673 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2674 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2676 %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510>
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256>
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: paddw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8>
; SSE41-NEXT: pmulhw %xmm0, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: retq
%1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %1
}

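; PR38658: every element divides by 1 except the last, which divides by 7.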
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38658:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $6, %xmm2
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: pr38658:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pr38658:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: pr38658:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pr38658:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; XOP-LABEL: pr38658:
; XOP: # %bb.0:
; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; XOP-NEXT: vpmovsxbw %xmm1, %xmm1
; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
%1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
ret <16 x i8> %1
}

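; i1 sdiv is only defined for a divisor of true (-1), and x sdiv -1 == -x == x in one bit,
; so both the scalar and vector cases below fold to a plain copy of %x.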
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%r = sdiv i1 %x, %y
ret i1 %r
}

define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
%r = sdiv <4 x i1> %x, %y
ret <4 x i1> %r
}

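; fold (sdiv x, 2) -> (sra (add x, (srl x, 31)), 1); the same expansion with a final
; negate handles the -2 divisor below.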
define i32 @combine_sdiv_two(i32 %x) {
; CHECK-LABEL: combine_sdiv_two:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: retq
%1 = sdiv i32 %x, 2
ret i32 %1
}

define i32 @combine_sdiv_negtwo(i32 %x) {
; CHECK-LABEL: combine_sdiv_negtwo:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
%1 = sdiv i32 %x, -2
ret i32 %1
}

define i8 @combine_i8_sdiv_pow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $4, %al
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarb $4, %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = sdiv i8 %x, 16
ret i8 %1
}

define i8 @combine_i8_sdiv_negpow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $2, %al
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarb $6, %al
; CHECK-NEXT: negb %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = sdiv i8 %x, -64
ret i8 %1
}

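; For i16/i32/i64 power-of-two divisors the expansion is lowered with a test/cmovns
; that selects x or x+(2^k-1) before the arithmetic shift (plus a negate for negative divisors).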
define i16 @combine_i16_sdiv_pow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: cwtl
; CHECK-NEXT: shrl $4, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
%1 = sdiv i16 %x, 16
ret i16 %1
}

define i16 @combine_i16_sdiv_negpow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: cwtl
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
%1 = sdiv i16 %x, -256
ret i16 %1
}

define i32 @combine_i32_sdiv_pow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $4, %eax
; CHECK-NEXT: retq
%1 = sdiv i32 %x, 16
ret i32 %1
}

define i32 @combine_i32_sdiv_negpow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
%1 = sdiv i32 %x, -256
ret i32 %1
}

define i64 @combine_i64_sdiv_pow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: leaq 15(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $4, %rax
; CHECK-NEXT: retq
%1 = sdiv i64 %x, 16
ret i64 %1
}

define i64 @combine_i64_sdiv_negpow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: leaq 255(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $8, %rax
; CHECK-NEXT: negq %rax
; CHECK-NEXT: retq
%1 = sdiv i64 %x, -256
ret i64 %1
}