1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP
10 ; fold (sdiv x, 1) -> x
11 define i32 @combine_sdiv_by_one(i32 %x) {
12 ; CHECK-LABEL: combine_sdiv_by_one:
14 ; CHECK-NEXT: movl %edi, %eax
20 define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
21 ; CHECK-LABEL: combine_vec_sdiv_by_one:
24 %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
28 ; fold (sdiv x, -1) -> 0 - x
29 define i32 @combine_sdiv_by_negone(i32 %x) {
30 ; CHECK-LABEL: combine_sdiv_by_negone:
32 ; CHECK-NEXT: movl %edi, %eax
33 ; CHECK-NEXT: negl %eax
39 define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
40 ; SSE-LABEL: combine_vec_sdiv_by_negone:
42 ; SSE-NEXT: pxor %xmm1, %xmm1
43 ; SSE-NEXT: psubd %xmm0, %xmm1
44 ; SSE-NEXT: movdqa %xmm1, %xmm0
47 ; AVX-LABEL: combine_vec_sdiv_by_negone:
49 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
50 ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
52 %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
56 ; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
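; A minimal IR sketch of that select, assuming the fold fires (illustrative
; only, not part of the autogenerated assertions). The compare produces an
; all-ones/zero mask, so a zext (or a logical shift of the mask by 31, as in
; the vector checks) yields 0 or 1:
;   %c = icmp eq i32 %x, -2147483648
;   %r = zext i1 %c to i32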
57 define i32 @combine_sdiv_by_minsigned(i32 %x) {
58 ; CHECK-LABEL: combine_sdiv_by_minsigned:
60 ; CHECK-NEXT: xorl %eax, %eax
61 ; CHECK-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
62 ; CHECK-NEXT: sete %al
64 %1 = sdiv i32 %x, -2147483648
68 define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
69 ; SSE-LABEL: combine_vec_sdiv_by_minsigned:
71 ; SSE-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
72 ; SSE-NEXT: psrld $31, %xmm0
75 ; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
77 ; AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
78 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
81 ; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
83 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
84 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
85 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
88 ; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
90 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
91 ; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
92 ; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
95 ; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
97 ; AVX512BW-NEXT: vpcmpeqd {{.*}}(%rip){1to4}, %xmm0, %k1
98 ; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
101 ; XOP-LABEL: combine_vec_sdiv_by_minsigned:
103 ; XOP-NEXT: vpcomeqd {{.*}}(%rip), %xmm0, %xmm0
104 ; XOP-NEXT: vpsrld $31, %xmm0, %xmm0
106 %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
110 ; fold (sdiv 0, x) -> 0
111 define i32 @combine_sdiv_zero(i32 %x) {
112 ; CHECK-LABEL: combine_sdiv_zero:
114 ; CHECK-NEXT: xorl %eax, %eax
120 define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
121 ; SSE-LABEL: combine_vec_sdiv_zero:
123 ; SSE-NEXT: xorps %xmm0, %xmm0
126 ; AVX-LABEL: combine_vec_sdiv_zero:
128 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
130 %1 = sdiv <4 x i32> zeroinitializer, %x
134 ; fold (sdiv x, x) -> 1
135 define i32 @combine_sdiv_dupe(i32 %x) {
136 ; CHECK-LABEL: combine_sdiv_dupe:
138 ; CHECK-NEXT: movl $1, %eax
144 define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
145 ; SSE-LABEL: combine_vec_sdiv_dupe:
147 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
150 ; AVX1-LABEL: combine_vec_sdiv_dupe:
152 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
155 ; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
156 ; AVX2ORLATER: # %bb.0:
157 ; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
158 ; AVX2ORLATER-NEXT: retq
160 ; XOP-LABEL: combine_vec_sdiv_dupe:
162 ; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
164 %1 = sdiv <4 x i32> %x, %x
168 ; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
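; A minimal IR sketch of the input pattern below (illustrative): the mask
; clears the sign bit, so sdiv by 4 is equivalent to udiv by 4, which is just
; a logical shift right by 2, hence the pand + psrld in the checks.
;   %pos = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
;   %res = lshr <4 x i32> %pos, <i32 2, i32 2, i32 2, i32 2>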
169 define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
170 ; SSE-LABEL: combine_vec_sdiv_by_pos0:
172 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
173 ; SSE-NEXT: psrld $2, %xmm0
176 ; AVX-LABEL: combine_vec_sdiv_by_pos0:
178 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
179 ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
181 %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
182 %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
186 define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
187 ; SSE2-LABEL: combine_vec_sdiv_by_pos1:
189 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
190 ; SSE2-NEXT: movdqa %xmm0, %xmm1
191 ; SSE2-NEXT: psrld $4, %xmm1
192 ; SSE2-NEXT: movdqa %xmm0, %xmm2
193 ; SSE2-NEXT: psrld $3, %xmm2
194 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
195 ; SSE2-NEXT: movdqa %xmm0, %xmm1
196 ; SSE2-NEXT: psrld $2, %xmm1
197 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
198 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
201 ; SSE41-LABEL: combine_vec_sdiv_by_pos1:
203 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
204 ; SSE41-NEXT: movdqa %xmm0, %xmm2
205 ; SSE41-NEXT: movdqa %xmm0, %xmm1
206 ; SSE41-NEXT: psrld $3, %xmm1
207 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
208 ; SSE41-NEXT: psrld $4, %xmm0
209 ; SSE41-NEXT: psrld $2, %xmm2
210 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
211 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
212 ; SSE41-NEXT: movdqa %xmm1, %xmm0
215 ; AVX1-LABEL: combine_vec_sdiv_by_pos1:
217 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
218 ; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
219 ; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
220 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
221 ; AVX1-NEXT: vpsrld $3, %xmm0, %xmm2
222 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
223 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
226 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
227 ; AVX2ORLATER: # %bb.0:
228 ; AVX2ORLATER-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
229 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
230 ; AVX2ORLATER-NEXT: retq
232 ; XOP-LABEL: combine_vec_sdiv_by_pos1:
234 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
235 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
237 %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
238 %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
242 ; fold (sdiv x, (1 << c)) -> (x + bias) >>s c
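; A minimal scalar sketch of the rounding-adjusted lowering the checks below
; expect for x / 4 (illustrative only): a bias of 2^c - 1 is added when x is
; negative so the arithmetic shift rounds toward zero.
;   %sign = ashr i32 %x, 31     ; all-ones if x < 0, else 0
;   %bias = lshr i32 %sign, 30  ; 3 if x < 0, else 0
;   %tmp  = add i32 %x, %bias
;   %res  = ashr i32 %tmp, 2    ; x / 4, rounded toward zero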
243 define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
244 ; SSE-LABEL: combine_vec_sdiv_by_pow2a:
246 ; SSE-NEXT: movdqa %xmm0, %xmm1
247 ; SSE-NEXT: psrad $31, %xmm1
248 ; SSE-NEXT: psrld $30, %xmm1
249 ; SSE-NEXT: paddd %xmm0, %xmm1
250 ; SSE-NEXT: psrad $2, %xmm1
251 ; SSE-NEXT: movdqa %xmm1, %xmm0
254 ; AVX-LABEL: combine_vec_sdiv_by_pow2a:
256 ; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
257 ; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
258 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
259 ; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
261 %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
265 define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
266 ; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
268 ; SSE-NEXT: movdqa %xmm0, %xmm1
269 ; SSE-NEXT: psrad $31, %xmm1
270 ; SSE-NEXT: psrld $30, %xmm1
271 ; SSE-NEXT: paddd %xmm0, %xmm1
272 ; SSE-NEXT: psrad $2, %xmm1
273 ; SSE-NEXT: pxor %xmm0, %xmm0
274 ; SSE-NEXT: psubd %xmm1, %xmm0
277 ; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
279 ; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
280 ; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
281 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
282 ; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
283 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
284 ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
286 %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
290 define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
291 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
293 ; SSE2-NEXT: pxor %xmm1, %xmm1
294 ; SSE2-NEXT: pxor %xmm2, %xmm2
295 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
296 ; SSE2-NEXT: movdqa %xmm2, %xmm3
297 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
298 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
299 ; SSE2-NEXT: pmullw %xmm4, %xmm3
300 ; SSE2-NEXT: psrlw $8, %xmm3
301 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
302 ; SSE2-NEXT: pmullw %xmm4, %xmm2
303 ; SSE2-NEXT: psrlw $8, %xmm2
304 ; SSE2-NEXT: packuswb %xmm3, %xmm2
305 ; SSE2-NEXT: paddb %xmm0, %xmm2
306 ; SSE2-NEXT: movdqa %xmm2, %xmm1
307 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
308 ; SSE2-NEXT: psraw $8, %xmm1
309 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
310 ; SSE2-NEXT: pmullw %xmm3, %xmm1
311 ; SSE2-NEXT: psrlw $8, %xmm1
312 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
313 ; SSE2-NEXT: psraw $8, %xmm2
314 ; SSE2-NEXT: pmullw %xmm3, %xmm2
315 ; SSE2-NEXT: psrlw $8, %xmm2
316 ; SSE2-NEXT: packuswb %xmm1, %xmm2
317 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
318 ; SSE2-NEXT: pand %xmm1, %xmm2
319 ; SSE2-NEXT: pandn %xmm0, %xmm1
320 ; SSE2-NEXT: por %xmm2, %xmm1
321 ; SSE2-NEXT: movdqa %xmm1, %xmm0
324 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
326 ; SSE41-NEXT: movdqa %xmm0, %xmm1
327 ; SSE41-NEXT: pxor %xmm0, %xmm0
328 ; SSE41-NEXT: pxor %xmm3, %xmm3
329 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
330 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
331 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
332 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
333 ; SSE41-NEXT: pmullw %xmm0, %xmm3
334 ; SSE41-NEXT: psrlw $8, %xmm3
335 ; SSE41-NEXT: pmullw %xmm0, %xmm2
336 ; SSE41-NEXT: psrlw $8, %xmm2
337 ; SSE41-NEXT: packuswb %xmm3, %xmm2
338 ; SSE41-NEXT: paddb %xmm1, %xmm2
339 ; SSE41-NEXT: movdqa %xmm2, %xmm0
340 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
341 ; SSE41-NEXT: psraw $8, %xmm0
342 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
343 ; SSE41-NEXT: pmullw %xmm3, %xmm0
344 ; SSE41-NEXT: psrlw $8, %xmm0
345 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
346 ; SSE41-NEXT: psraw $8, %xmm2
347 ; SSE41-NEXT: pmullw %xmm3, %xmm2
348 ; SSE41-NEXT: psrlw $8, %xmm2
349 ; SSE41-NEXT: packuswb %xmm0, %xmm2
350 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
351 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
352 ; SSE41-NEXT: movdqa %xmm1, %xmm0
355 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
357 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
358 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
359 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
360 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2]
361 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
362 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
363 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
364 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
365 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
366 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
367 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
368 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
369 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
370 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
371 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
372 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
373 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
374 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
375 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
376 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
377 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
378 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
379 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
382 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
384 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
385 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
386 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
387 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
388 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
389 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
390 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
391 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
392 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
393 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
394 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
395 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
396 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
397 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
398 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
399 ; AVX2-NEXT: vzeroupper
402 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
404 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
405 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
406 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
407 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
408 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
409 ; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
410 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
411 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
412 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
413 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
414 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
415 ; AVX512F-NEXT: vzeroupper
418 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
420 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
421 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
422 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
423 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
424 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
425 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1
426 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
427 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm1, %ymm1
428 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
429 ; AVX512BW-NEXT: movw $257, %ax # imm = 0x101
430 ; AVX512BW-NEXT: kmovd %eax, %k1
431 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
432 ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
433 ; AVX512BW-NEXT: vzeroupper
434 ; AVX512BW-NEXT: retq
436 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
438 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
439 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
440 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
441 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
442 ; XOP-NEXT: vpshab {{.*}}(%rip), %xmm1, %xmm1
443 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
444 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
446 %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
450 define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
451 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
453 ; SSE2-NEXT: movdqa %xmm0, %xmm1
454 ; SSE2-NEXT: psraw $15, %xmm1
455 ; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1
456 ; SSE2-NEXT: paddw %xmm0, %xmm1
457 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
458 ; SSE2-NEXT: movdqa %xmm1, %xmm3
459 ; SSE2-NEXT: pand %xmm2, %xmm3
460 ; SSE2-NEXT: psraw $4, %xmm1
461 ; SSE2-NEXT: pandn %xmm1, %xmm2
462 ; SSE2-NEXT: por %xmm3, %xmm2
463 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
464 ; SSE2-NEXT: movdqa %xmm2, %xmm3
465 ; SSE2-NEXT: pand %xmm1, %xmm3
466 ; SSE2-NEXT: psraw $2, %xmm2
467 ; SSE2-NEXT: pandn %xmm2, %xmm1
468 ; SSE2-NEXT: por %xmm3, %xmm1
469 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
470 ; SSE2-NEXT: movdqa %xmm1, %xmm3
471 ; SSE2-NEXT: pand %xmm2, %xmm3
472 ; SSE2-NEXT: psraw $1, %xmm1
473 ; SSE2-NEXT: pandn %xmm1, %xmm2
474 ; SSE2-NEXT: por %xmm3, %xmm2
475 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
476 ; SSE2-NEXT: pand %xmm1, %xmm2
477 ; SSE2-NEXT: pandn %xmm0, %xmm1
478 ; SSE2-NEXT: por %xmm2, %xmm1
479 ; SSE2-NEXT: movdqa %xmm1, %xmm0
482 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
484 ; SSE41-NEXT: movdqa %xmm0, %xmm1
485 ; SSE41-NEXT: psraw $15, %xmm1
486 ; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1
487 ; SSE41-NEXT: paddw %xmm0, %xmm1
488 ; SSE41-NEXT: movdqa %xmm1, %xmm2
489 ; SSE41-NEXT: psraw $1, %xmm2
490 ; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm1
491 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
492 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
493 ; SSE41-NEXT: movdqa %xmm1, %xmm0
496 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
498 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
499 ; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
500 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
501 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
502 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1
503 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
504 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
507 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
509 ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
510 ; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
511 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
512 ; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2
513 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1
514 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
515 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
518 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
520 ; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1
521 ; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
522 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1
523 ; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
524 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
525 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
526 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
527 ; AVX512F-NEXT: vzeroupper
530 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
532 ; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1
533 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
534 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1
535 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm1, %xmm1
536 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
537 ; AVX512BW-NEXT: retq
539 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
541 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm1
542 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1
543 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1
544 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm1, %xmm1
545 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
547 %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
551 define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
552 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
554 ; SSE2-NEXT: movdqa %xmm0, %xmm3
555 ; SSE2-NEXT: psraw $15, %xmm0
556 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = <u,4,2,16,8,32,64,2>
557 ; SSE2-NEXT: pmulhuw %xmm8, %xmm0
558 ; SSE2-NEXT: paddw %xmm3, %xmm0
559 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
560 ; SSE2-NEXT: movdqa %xmm0, %xmm2
561 ; SSE2-NEXT: pand %xmm4, %xmm2
562 ; SSE2-NEXT: psraw $4, %xmm0
563 ; SSE2-NEXT: movdqa %xmm4, %xmm6
564 ; SSE2-NEXT: pandn %xmm0, %xmm6
565 ; SSE2-NEXT: por %xmm2, %xmm6
566 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
567 ; SSE2-NEXT: movdqa %xmm6, %xmm0
568 ; SSE2-NEXT: pand %xmm5, %xmm0
569 ; SSE2-NEXT: psraw $2, %xmm6
570 ; SSE2-NEXT: movdqa %xmm5, %xmm2
571 ; SSE2-NEXT: pandn %xmm6, %xmm2
572 ; SSE2-NEXT: por %xmm0, %xmm2
573 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0]
574 ; SSE2-NEXT: movdqa %xmm2, %xmm0
575 ; SSE2-NEXT: pand %xmm7, %xmm0
576 ; SSE2-NEXT: psraw $1, %xmm2
577 ; SSE2-NEXT: movdqa %xmm7, %xmm6
578 ; SSE2-NEXT: pandn %xmm2, %xmm6
579 ; SSE2-NEXT: por %xmm0, %xmm6
580 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
581 ; SSE2-NEXT: pand %xmm2, %xmm6
582 ; SSE2-NEXT: movdqa %xmm2, %xmm0
583 ; SSE2-NEXT: pandn %xmm3, %xmm0
584 ; SSE2-NEXT: por %xmm6, %xmm0
585 ; SSE2-NEXT: movdqa %xmm1, %xmm3
586 ; SSE2-NEXT: psraw $15, %xmm3
587 ; SSE2-NEXT: pmulhuw %xmm8, %xmm3
588 ; SSE2-NEXT: paddw %xmm1, %xmm3
589 ; SSE2-NEXT: movdqa %xmm3, %xmm6
590 ; SSE2-NEXT: pand %xmm4, %xmm6
591 ; SSE2-NEXT: psraw $4, %xmm3
592 ; SSE2-NEXT: pandn %xmm3, %xmm4
593 ; SSE2-NEXT: por %xmm6, %xmm4
594 ; SSE2-NEXT: movdqa %xmm4, %xmm3
595 ; SSE2-NEXT: pand %xmm5, %xmm3
596 ; SSE2-NEXT: psraw $2, %xmm4
597 ; SSE2-NEXT: pandn %xmm4, %xmm5
598 ; SSE2-NEXT: por %xmm3, %xmm5
599 ; SSE2-NEXT: movdqa %xmm5, %xmm3
600 ; SSE2-NEXT: pand %xmm7, %xmm3
601 ; SSE2-NEXT: psraw $1, %xmm5
602 ; SSE2-NEXT: pandn %xmm5, %xmm7
603 ; SSE2-NEXT: por %xmm3, %xmm7
604 ; SSE2-NEXT: pand %xmm2, %xmm7
605 ; SSE2-NEXT: pandn %xmm1, %xmm2
606 ; SSE2-NEXT: por %xmm7, %xmm2
607 ; SSE2-NEXT: movdqa %xmm2, %xmm1
610 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
612 ; SSE41-NEXT: movdqa %xmm0, %xmm2
613 ; SSE41-NEXT: psraw $15, %xmm2
614 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
615 ; SSE41-NEXT: pmulhuw %xmm4, %xmm2
616 ; SSE41-NEXT: paddw %xmm0, %xmm2
617 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768>
618 ; SSE41-NEXT: movdqa %xmm2, %xmm3
619 ; SSE41-NEXT: pmulhw %xmm5, %xmm3
620 ; SSE41-NEXT: psraw $1, %xmm2
621 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7]
622 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
623 ; SSE41-NEXT: movdqa %xmm1, %xmm3
624 ; SSE41-NEXT: psraw $15, %xmm3
625 ; SSE41-NEXT: pmulhuw %xmm4, %xmm3
626 ; SSE41-NEXT: paddw %xmm1, %xmm3
627 ; SSE41-NEXT: pmulhw %xmm3, %xmm5
628 ; SSE41-NEXT: psraw $1, %xmm3
629 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7]
630 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
631 ; SSE41-NEXT: movdqa %xmm2, %xmm0
632 ; SSE41-NEXT: movdqa %xmm3, %xmm1
635 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
637 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
638 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
639 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
640 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
641 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
642 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
643 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4
644 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
645 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
646 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4
647 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3
648 ; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3
649 ; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2
650 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
651 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
652 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
653 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
654 ; AVX1-NEXT: # ymm2 = mem[0,1,0,1]
655 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
656 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
657 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
660 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
662 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
663 ; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
664 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
665 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2
666 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm1, %ymm1
667 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
668 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
671 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
673 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
674 ; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
675 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
676 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
677 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
678 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
679 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
682 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
684 ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1
685 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
686 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1
687 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm1, %ymm1
688 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
689 ; AVX512BW-NEXT: retq
691 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
693 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
694 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2
695 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521>
696 ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2
697 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1
698 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535>
699 ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1
700 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4
701 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3
702 ; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3
703 ; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2
704 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
705 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
706 ; XOP-NEXT: # ymm2 = mem[0,1,0,1]
707 ; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
709 %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
713 define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
714 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
716 ; SSE2-NEXT: movdqa %xmm1, %xmm8
717 ; SSE2-NEXT: movdqa %xmm0, %xmm1
718 ; SSE2-NEXT: psraw $15, %xmm0
719 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
720 ; SSE2-NEXT: pmulhuw %xmm9, %xmm0
721 ; SSE2-NEXT: paddw %xmm1, %xmm0
722 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535]
723 ; SSE2-NEXT: movdqa %xmm0, %xmm4
724 ; SSE2-NEXT: pand %xmm11, %xmm4
725 ; SSE2-NEXT: psraw $4, %xmm0
726 ; SSE2-NEXT: movdqa %xmm11, %xmm5
727 ; SSE2-NEXT: pandn %xmm0, %xmm5
728 ; SSE2-NEXT: por %xmm4, %xmm5
729 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
730 ; SSE2-NEXT: movdqa %xmm5, %xmm0
731 ; SSE2-NEXT: pand %xmm7, %xmm0
732 ; SSE2-NEXT: psraw $2, %xmm5
733 ; SSE2-NEXT: movdqa %xmm7, %xmm4
734 ; SSE2-NEXT: pandn %xmm5, %xmm4
735 ; SSE2-NEXT: por %xmm0, %xmm4
736 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0]
737 ; SSE2-NEXT: movdqa %xmm4, %xmm0
738 ; SSE2-NEXT: pand %xmm10, %xmm0
739 ; SSE2-NEXT: psraw $1, %xmm4
740 ; SSE2-NEXT: movdqa %xmm10, %xmm5
741 ; SSE2-NEXT: pandn %xmm4, %xmm5
742 ; SSE2-NEXT: por %xmm0, %xmm5
743 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535]
744 ; SSE2-NEXT: pand %xmm12, %xmm5
745 ; SSE2-NEXT: movdqa %xmm12, %xmm0
746 ; SSE2-NEXT: pandn %xmm1, %xmm0
747 ; SSE2-NEXT: por %xmm5, %xmm0
748 ; SSE2-NEXT: movdqa %xmm8, %xmm1
749 ; SSE2-NEXT: psraw $15, %xmm1
750 ; SSE2-NEXT: pmulhuw %xmm9, %xmm1
751 ; SSE2-NEXT: paddw %xmm8, %xmm1
752 ; SSE2-NEXT: movdqa %xmm1, %xmm5
753 ; SSE2-NEXT: pand %xmm11, %xmm5
754 ; SSE2-NEXT: psraw $4, %xmm1
755 ; SSE2-NEXT: movdqa %xmm11, %xmm6
756 ; SSE2-NEXT: pandn %xmm1, %xmm6
757 ; SSE2-NEXT: por %xmm5, %xmm6
758 ; SSE2-NEXT: movdqa %xmm6, %xmm1
759 ; SSE2-NEXT: pand %xmm7, %xmm1
760 ; SSE2-NEXT: psraw $2, %xmm6
761 ; SSE2-NEXT: movdqa %xmm7, %xmm5
762 ; SSE2-NEXT: pandn %xmm6, %xmm5
763 ; SSE2-NEXT: por %xmm1, %xmm5
764 ; SSE2-NEXT: movdqa %xmm5, %xmm1
765 ; SSE2-NEXT: pand %xmm10, %xmm1
766 ; SSE2-NEXT: psraw $1, %xmm5
767 ; SSE2-NEXT: movdqa %xmm10, %xmm6
768 ; SSE2-NEXT: pandn %xmm5, %xmm6
769 ; SSE2-NEXT: por %xmm1, %xmm6
770 ; SSE2-NEXT: pand %xmm12, %xmm6
771 ; SSE2-NEXT: movdqa %xmm12, %xmm1
772 ; SSE2-NEXT: pandn %xmm8, %xmm1
773 ; SSE2-NEXT: por %xmm6, %xmm1
774 ; SSE2-NEXT: movdqa %xmm2, %xmm5
775 ; SSE2-NEXT: psraw $15, %xmm5
776 ; SSE2-NEXT: pmulhuw %xmm9, %xmm5
777 ; SSE2-NEXT: paddw %xmm2, %xmm5
778 ; SSE2-NEXT: movdqa %xmm5, %xmm6
779 ; SSE2-NEXT: pand %xmm11, %xmm6
780 ; SSE2-NEXT: psraw $4, %xmm5
781 ; SSE2-NEXT: movdqa %xmm11, %xmm4
782 ; SSE2-NEXT: pandn %xmm5, %xmm4
783 ; SSE2-NEXT: por %xmm6, %xmm4
784 ; SSE2-NEXT: movdqa %xmm4, %xmm5
785 ; SSE2-NEXT: pand %xmm7, %xmm5
786 ; SSE2-NEXT: psraw $2, %xmm4
787 ; SSE2-NEXT: movdqa %xmm7, %xmm6
788 ; SSE2-NEXT: pandn %xmm4, %xmm6
789 ; SSE2-NEXT: por %xmm5, %xmm6
790 ; SSE2-NEXT: movdqa %xmm6, %xmm4
791 ; SSE2-NEXT: pand %xmm10, %xmm4
792 ; SSE2-NEXT: psraw $1, %xmm6
793 ; SSE2-NEXT: movdqa %xmm10, %xmm5
794 ; SSE2-NEXT: pandn %xmm6, %xmm5
795 ; SSE2-NEXT: por %xmm4, %xmm5
796 ; SSE2-NEXT: pand %xmm12, %xmm5
797 ; SSE2-NEXT: movdqa %xmm12, %xmm8
798 ; SSE2-NEXT: pandn %xmm2, %xmm8
799 ; SSE2-NEXT: por %xmm5, %xmm8
800 ; SSE2-NEXT: movdqa %xmm3, %xmm2
801 ; SSE2-NEXT: psraw $15, %xmm2
802 ; SSE2-NEXT: pmulhuw %xmm9, %xmm2
803 ; SSE2-NEXT: paddw %xmm3, %xmm2
804 ; SSE2-NEXT: movdqa %xmm2, %xmm4
805 ; SSE2-NEXT: pand %xmm11, %xmm4
806 ; SSE2-NEXT: psraw $4, %xmm2
807 ; SSE2-NEXT: pandn %xmm2, %xmm11
808 ; SSE2-NEXT: por %xmm4, %xmm11
809 ; SSE2-NEXT: movdqa %xmm11, %xmm2
810 ; SSE2-NEXT: pand %xmm7, %xmm2
811 ; SSE2-NEXT: psraw $2, %xmm11
812 ; SSE2-NEXT: pandn %xmm11, %xmm7
813 ; SSE2-NEXT: por %xmm2, %xmm7
814 ; SSE2-NEXT: movdqa %xmm7, %xmm2
815 ; SSE2-NEXT: pand %xmm10, %xmm2
816 ; SSE2-NEXT: psraw $1, %xmm7
817 ; SSE2-NEXT: pandn %xmm7, %xmm10
818 ; SSE2-NEXT: por %xmm2, %xmm10
819 ; SSE2-NEXT: pand %xmm12, %xmm10
820 ; SSE2-NEXT: pandn %xmm3, %xmm12
821 ; SSE2-NEXT: por %xmm10, %xmm12
822 ; SSE2-NEXT: movdqa %xmm8, %xmm2
823 ; SSE2-NEXT: movdqa %xmm12, %xmm3
826 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
828 ; SSE41-NEXT: movdqa %xmm1, %xmm4
829 ; SSE41-NEXT: movdqa %xmm0, %xmm1
830 ; SSE41-NEXT: psraw $15, %xmm0
831 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
832 ; SSE41-NEXT: pmulhuw %xmm7, %xmm0
833 ; SSE41-NEXT: paddw %xmm1, %xmm0
834 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768>
835 ; SSE41-NEXT: movdqa %xmm0, %xmm5
836 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
837 ; SSE41-NEXT: psraw $1, %xmm0
838 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7]
839 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
840 ; SSE41-NEXT: movdqa %xmm4, %xmm1
841 ; SSE41-NEXT: psraw $15, %xmm1
842 ; SSE41-NEXT: pmulhuw %xmm7, %xmm1
843 ; SSE41-NEXT: paddw %xmm4, %xmm1
844 ; SSE41-NEXT: movdqa %xmm1, %xmm5
845 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
846 ; SSE41-NEXT: psraw $1, %xmm1
847 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
848 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
849 ; SSE41-NEXT: movdqa %xmm2, %xmm4
850 ; SSE41-NEXT: psraw $15, %xmm4
851 ; SSE41-NEXT: pmulhuw %xmm7, %xmm4
852 ; SSE41-NEXT: paddw %xmm2, %xmm4
853 ; SSE41-NEXT: movdqa %xmm4, %xmm5
854 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
855 ; SSE41-NEXT: psraw $1, %xmm4
856 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
857 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
858 ; SSE41-NEXT: movdqa %xmm3, %xmm5
859 ; SSE41-NEXT: psraw $15, %xmm5
860 ; SSE41-NEXT: pmulhuw %xmm7, %xmm5
861 ; SSE41-NEXT: paddw %xmm3, %xmm5
862 ; SSE41-NEXT: pmulhw %xmm5, %xmm6
863 ; SSE41-NEXT: psraw $1, %xmm5
864 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
865 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
866 ; SSE41-NEXT: movdqa %xmm4, %xmm2
867 ; SSE41-NEXT: movdqa %xmm5, %xmm3
870 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
872 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
873 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
874 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
875 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3
876 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
877 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
878 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5
879 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
880 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
881 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5
882 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5
883 ; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5
884 ; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6
885 ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5
886 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
887 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
888 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
889 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
890 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
891 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
892 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
893 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
894 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6
895 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6
896 ; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2
897 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm6
898 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
899 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7]
900 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6
901 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4
902 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4
903 ; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
904 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4
905 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
906 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
907 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
908 ; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1
909 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
912 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
914 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2
915 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
916 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
917 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
918 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2
919 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768]
920 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
921 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5
922 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
923 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15]
924 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
925 ; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2
926 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
927 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2
928 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3
929 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
930 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
931 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
934 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
936 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
937 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
938 ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
939 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
940 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
941 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
942 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
943 ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
944 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
945 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
946 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
947 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2
948 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
949 ; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2
950 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
951 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
952 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
953 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
956 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
958 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1
959 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
960 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
961 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm1, %zmm1
962 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
963 ; AVX512BW-NEXT: kmovd %eax, %k1
964 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
965 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
966 ; AVX512BW-NEXT: retq
968 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
970 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
971 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3
972 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,65522,65521,65524,65523,65525,65526,65521>
973 ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3
974 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2
975 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535>
976 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
977 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5
978 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5
979 ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5
980 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5
981 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
982 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
983 ; XOP-NEXT: # ymm5 = mem[0,1,0,1]
984 ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0
985 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
986 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6
987 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6
988 ; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2
989 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
990 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm6
991 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4
992 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4
993 ; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3
994 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
995 ; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1
997 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
1001 define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
1002 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1004 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1005 ; SSE2-NEXT: psrad $31, %xmm1
1006 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1007 ; SSE2-NEXT: psrld $28, %xmm2
1008 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1009 ; SSE2-NEXT: psrld $29, %xmm3
1010 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1011 ; SSE2-NEXT: psrld $30, %xmm1
1012 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1013 ; SSE2-NEXT: paddd %xmm0, %xmm1
1014 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1015 ; SSE2-NEXT: psrad $4, %xmm2
1016 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1017 ; SSE2-NEXT: psrad $3, %xmm3
1018 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1019 ; SSE2-NEXT: psrad $2, %xmm1
1020 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1021 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1022 ; SSE2-NEXT: movaps %xmm1, %xmm0
1025 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1027 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1028 ; SSE41-NEXT: psrad $31, %xmm1
1029 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1030 ; SSE41-NEXT: psrld $28, %xmm2
1031 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1032 ; SSE41-NEXT: psrld $30, %xmm3
1033 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1034 ; SSE41-NEXT: psrld $29, %xmm1
1035 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1036 ; SSE41-NEXT: paddd %xmm0, %xmm1
1037 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1038 ; SSE41-NEXT: psrad $4, %xmm2
1039 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1040 ; SSE41-NEXT: psrad $2, %xmm3
1041 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1042 ; SSE41-NEXT: psrad $3, %xmm1
1043 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1044 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1045 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1048 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1050 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
1051 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1052 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
1053 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1054 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
1055 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1056 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1057 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1058 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1059 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1060 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1061 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1062 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1065 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1066 ; AVX2ORLATER: # %bb.0:
1067 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
1068 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
1069 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1070 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
1071 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1072 ; AVX2ORLATER-NEXT: retq
1074 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1076 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
1077 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
1078 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1079 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
1080 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1082 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
1086 define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
1087 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1089 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1090 ; SSE2-NEXT: psrad $31, %xmm0
1091 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1092 ; SSE2-NEXT: psrld $28, %xmm3
1093 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1094 ; SSE2-NEXT: psrld $29, %xmm4
1095 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1096 ; SSE2-NEXT: psrld $30, %xmm0
1097 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1098 ; SSE2-NEXT: paddd %xmm2, %xmm0
1099 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1100 ; SSE2-NEXT: psrad $4, %xmm3
1101 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1102 ; SSE2-NEXT: psrad $3, %xmm4
1103 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1104 ; SSE2-NEXT: psrad $2, %xmm0
1105 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1106 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1107 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1108 ; SSE2-NEXT: psrad $31, %xmm2
1109 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1110 ; SSE2-NEXT: psrld $28, %xmm3
1111 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1112 ; SSE2-NEXT: psrld $29, %xmm4
1113 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1114 ; SSE2-NEXT: psrld $30, %xmm2
1115 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1116 ; SSE2-NEXT: paddd %xmm1, %xmm2
1117 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1118 ; SSE2-NEXT: psrad $4, %xmm3
1119 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1120 ; SSE2-NEXT: psrad $3, %xmm4
1121 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1122 ; SSE2-NEXT: psrad $2, %xmm2
1123 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1124 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1125 ; SSE2-NEXT: movaps %xmm2, %xmm1
1128 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1130 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1131 ; SSE41-NEXT: psrad $31, %xmm0
1132 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1133 ; SSE41-NEXT: psrld $28, %xmm3
1134 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1135 ; SSE41-NEXT: psrld $30, %xmm4
1136 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1137 ; SSE41-NEXT: psrld $29, %xmm0
1138 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1139 ; SSE41-NEXT: paddd %xmm2, %xmm0
1140 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1141 ; SSE41-NEXT: psrad $4, %xmm3
1142 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1143 ; SSE41-NEXT: psrad $2, %xmm4
1144 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1145 ; SSE41-NEXT: psrad $3, %xmm0
1146 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1147 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
1148 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1149 ; SSE41-NEXT: psrad $31, %xmm2
1150 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1151 ; SSE41-NEXT: psrld $28, %xmm3
1152 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1153 ; SSE41-NEXT: psrld $30, %xmm4
1154 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1155 ; SSE41-NEXT: psrld $29, %xmm2
1156 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1157 ; SSE41-NEXT: paddd %xmm1, %xmm2
1158 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1159 ; SSE41-NEXT: psrad $4, %xmm3
1160 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1161 ; SSE41-NEXT: psrad $2, %xmm4
1162 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1163 ; SSE41-NEXT: psrad $3, %xmm2
1164 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1165 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
1166 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1169 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1171 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1172 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
1173 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1174 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1175 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1176 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1177 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1178 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1179 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1180 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1181 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1182 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1183 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1184 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
1185 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1186 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1187 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1188 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1189 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1190 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
1191 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1192 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1193 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1194 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1195 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1196 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1197 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1200 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1201 ; AVX2ORLATER: # %bb.0:
1202 ; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1
1203 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
1204 ; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1
1205 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
1206 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1207 ; AVX2ORLATER-NEXT: retq
1209 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1211 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
1212 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2
1213 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268>
1214 ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2
1215 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1216 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292>
1217 ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1
1218 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4
1219 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3
1220 ; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1221 ; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2
1222 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1223 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1225 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1229 define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
1230 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1232 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1233 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1234 ; SSE2-NEXT: psrad $31, %xmm0
1235 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1236 ; SSE2-NEXT: psrld $28, %xmm5
1237 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1238 ; SSE2-NEXT: psrld $29, %xmm6
1239 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1240 ; SSE2-NEXT: psrld $30, %xmm0
1241 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1242 ; SSE2-NEXT: paddd %xmm1, %xmm0
1243 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1244 ; SSE2-NEXT: psrad $4, %xmm5
1245 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1246 ; SSE2-NEXT: psrad $3, %xmm6
1247 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1248 ; SSE2-NEXT: psrad $2, %xmm0
1249 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1250 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1251 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1252 ; SSE2-NEXT: psrad $31, %xmm1
1253 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1254 ; SSE2-NEXT: psrld $28, %xmm5
1255 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1256 ; SSE2-NEXT: psrld $29, %xmm6
1257 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1258 ; SSE2-NEXT: psrld $30, %xmm1
1259 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1260 ; SSE2-NEXT: paddd %xmm4, %xmm1
1261 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1262 ; SSE2-NEXT: psrad $4, %xmm5
1263 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1264 ; SSE2-NEXT: psrad $3, %xmm6
1265 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1266 ; SSE2-NEXT: psrad $2, %xmm1
1267 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1268 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
1269 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1270 ; SSE2-NEXT: psrad $31, %xmm4
1271 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1272 ; SSE2-NEXT: psrld $28, %xmm5
1273 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1274 ; SSE2-NEXT: psrld $29, %xmm6
1275 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1276 ; SSE2-NEXT: psrld $30, %xmm4
1277 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1278 ; SSE2-NEXT: paddd %xmm2, %xmm4
1279 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1280 ; SSE2-NEXT: psrad $4, %xmm5
1281 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1282 ; SSE2-NEXT: psrad $3, %xmm6
1283 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1284 ; SSE2-NEXT: psrad $2, %xmm4
1285 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1286 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1287 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1288 ; SSE2-NEXT: psrad $31, %xmm5
1289 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1290 ; SSE2-NEXT: psrld $28, %xmm2
1291 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1292 ; SSE2-NEXT: psrld $29, %xmm6
1293 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1294 ; SSE2-NEXT: psrld $30, %xmm5
1295 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1296 ; SSE2-NEXT: paddd %xmm3, %xmm5
1297 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1298 ; SSE2-NEXT: psrad $4, %xmm2
1299 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1300 ; SSE2-NEXT: psrad $3, %xmm6
1301 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1302 ; SSE2-NEXT: psrad $2, %xmm5
1303 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1304 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
1305 ; SSE2-NEXT: movaps %xmm4, %xmm2
1306 ; SSE2-NEXT: movaps %xmm5, %xmm3
1309 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1311 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1312 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1313 ; SSE41-NEXT: psrad $31, %xmm0
1314 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1315 ; SSE41-NEXT: psrld $28, %xmm5
1316 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1317 ; SSE41-NEXT: psrld $30, %xmm6
1318 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1319 ; SSE41-NEXT: psrld $29, %xmm0
1320 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1321 ; SSE41-NEXT: paddd %xmm1, %xmm0
1322 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1323 ; SSE41-NEXT: psrad $4, %xmm5
1324 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1325 ; SSE41-NEXT: psrad $2, %xmm6
1326 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1327 ; SSE41-NEXT: psrad $3, %xmm0
1328 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1329 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1330 ; SSE41-NEXT: movdqa %xmm4, %xmm1
1331 ; SSE41-NEXT: psrad $31, %xmm1
1332 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1333 ; SSE41-NEXT: psrld $28, %xmm5
1334 ; SSE41-NEXT: movdqa %xmm1, %xmm6
1335 ; SSE41-NEXT: psrld $30, %xmm6
1336 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1337 ; SSE41-NEXT: psrld $29, %xmm1
1338 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1339 ; SSE41-NEXT: paddd %xmm4, %xmm1
1340 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1341 ; SSE41-NEXT: psrad $4, %xmm5
1342 ; SSE41-NEXT: movdqa %xmm1, %xmm6
1343 ; SSE41-NEXT: psrad $2, %xmm6
1344 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1345 ; SSE41-NEXT: psrad $3, %xmm1
1346 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1347 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
1348 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1349 ; SSE41-NEXT: psrad $31, %xmm4
1350 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1351 ; SSE41-NEXT: psrld $28, %xmm5
1352 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1353 ; SSE41-NEXT: psrld $30, %xmm6
1354 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1355 ; SSE41-NEXT: psrld $29, %xmm4
1356 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1357 ; SSE41-NEXT: paddd %xmm2, %xmm4
1358 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1359 ; SSE41-NEXT: psrad $4, %xmm5
1360 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1361 ; SSE41-NEXT: psrad $2, %xmm6
1362 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1363 ; SSE41-NEXT: psrad $3, %xmm4
1364 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1365 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1366 ; SSE41-NEXT: movdqa %xmm3, %xmm5
1367 ; SSE41-NEXT: psrad $31, %xmm5
1368 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1369 ; SSE41-NEXT: psrld $28, %xmm2
1370 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1371 ; SSE41-NEXT: psrld $30, %xmm6
1372 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1373 ; SSE41-NEXT: psrld $29, %xmm5
1374 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1375 ; SSE41-NEXT: paddd %xmm3, %xmm5
1376 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1377 ; SSE41-NEXT: psrad $4, %xmm2
1378 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1379 ; SSE41-NEXT: psrad $2, %xmm6
1380 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1381 ; SSE41-NEXT: psrad $3, %xmm5
1382 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1383 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
1384 ; SSE41-NEXT: movdqa %xmm4, %xmm2
1385 ; SSE41-NEXT: movdqa %xmm5, %xmm3
1388 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1390 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1391 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1392 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1393 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1394 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1395 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1396 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1397 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1398 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1399 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1400 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1401 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1402 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1403 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
1404 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1405 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1406 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1407 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1408 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1409 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1410 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1411 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1412 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1413 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1414 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1415 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1416 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1417 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1418 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1419 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1420 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1421 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1422 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1423 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1424 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1425 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1426 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1427 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1428 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1429 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1430 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
1431 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1432 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1433 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1434 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1435 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1436 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
1437 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1438 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1439 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1440 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1441 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1442 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1443 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1446 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1448 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
1449 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28]
1450 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1451 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1452 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
1453 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
1454 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1455 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1456 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1457 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
1458 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1459 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2
1460 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1461 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1464 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1466 ; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1
1467 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1468 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1469 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
1470 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
1471 ; AVX512F-NEXT: kmovw %eax, %k1
1472 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1473 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1474 ; AVX512F-NEXT: retq
1476 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1477 ; AVX512BW: # %bb.0:
1478 ; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1
1479 ; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1480 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1481 ; AVX512BW-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
1482 ; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111
1483 ; AVX512BW-NEXT: kmovd %eax, %k1
1484 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1485 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1486 ; AVX512BW-NEXT: retq
1488 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1490 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1491 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3
1492 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268>
1493 ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3
1494 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1495 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292>
1496 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1497 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5
1498 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1499 ; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5
1500 ; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5
1501 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
1502 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1503 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1504 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm5
1505 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1506 ; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1507 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1508 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm5
1509 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4
1510 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4
1511 ; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3
1512 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1513 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1515 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
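; non-uniform <2 x i64> power-of-2 divisors <1, 4>: lowered to shifts/blends, with the divide-by-1 lane keeping the original element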
1519 define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
1520 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1522 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1523 ; SSE2-NEXT: psrad $31, %xmm1
1524 ; SSE2-NEXT: psrlq $62, %xmm1
1525 ; SSE2-NEXT: paddq %xmm0, %xmm1
1526 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1527 ; SSE2-NEXT: psrad $2, %xmm2
1528 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
1529 ; SSE2-NEXT: psrlq $2, %xmm1
1530 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1531 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1532 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1535 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1537 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1538 ; SSE41-NEXT: psrad $31, %xmm1
1539 ; SSE41-NEXT: psrlq $62, %xmm1
1540 ; SSE41-NEXT: paddq %xmm0, %xmm1
1541 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1542 ; SSE41-NEXT: psrad $2, %xmm2
1543 ; SSE41-NEXT: psrlq $2, %xmm1
1544 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1545 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1546 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1549 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1551 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1552 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1553 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1
1554 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1555 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm2
1556 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
1557 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1558 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1561 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1563 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1564 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1565 ; AVX2-NEXT: vpsrlq $62, %xmm1, %xmm1
1566 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1567 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm2
1568 ; AVX2-NEXT: vpsrlq $2, %xmm1, %xmm1
1569 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
1570 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1573 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1575 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1576 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1577 ; AVX512F-NEXT: vpsrlq $62, %xmm1, %xmm1
1578 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1579 ; AVX512F-NEXT: vpsraq $2, %zmm1, %zmm1
1580 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1581 ; AVX512F-NEXT: vzeroupper
1582 ; AVX512F-NEXT: retq
1584 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1585 ; AVX512BW: # %bb.0:
1586 ; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1
1587 ; AVX512BW-NEXT: vpsrlq $62, %xmm1, %xmm1
1588 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1589 ; AVX512BW-NEXT: vpsraq $2, %xmm1, %xmm1
1590 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1591 ; AVX512BW-NEXT: retq
1593 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1595 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm1
1596 ; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1
1597 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1598 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm1, %xmm1
1599 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1601 %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
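; non-uniform power-of-2 divisors <1, 4, 8, 16> on <4 x i64>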
1605 define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
1606 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1608 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1609 ; SSE2-NEXT: psrad $31, %xmm2
1610 ; SSE2-NEXT: psrlq $62, %xmm2
1611 ; SSE2-NEXT: paddq %xmm0, %xmm2
1612 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1613 ; SSE2-NEXT: psrad $2, %xmm3
1614 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
1615 ; SSE2-NEXT: psrlq $2, %xmm2
1616 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1617 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1618 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1619 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1620 ; SSE2-NEXT: psrad $31, %xmm2
1621 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1622 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1623 ; SSE2-NEXT: psrlq $61, %xmm3
1624 ; SSE2-NEXT: psrlq $60, %xmm2
1625 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
1626 ; SSE2-NEXT: paddq %xmm1, %xmm2
1627 ; SSE2-NEXT: movdqa %xmm2, %xmm1
1628 ; SSE2-NEXT: psrlq $3, %xmm1
1629 ; SSE2-NEXT: psrlq $4, %xmm2
1630 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1631 ; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1632 ; SSE2-NEXT: xorpd %xmm1, %xmm2
1633 ; SSE2-NEXT: psubq %xmm1, %xmm2
1634 ; SSE2-NEXT: movdqa %xmm2, %xmm1
1637 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1639 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1640 ; SSE41-NEXT: psrad $31, %xmm0
1641 ; SSE41-NEXT: psrlq $62, %xmm0
1642 ; SSE41-NEXT: paddq %xmm2, %xmm0
1643 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1644 ; SSE41-NEXT: psrad $2, %xmm3
1645 ; SSE41-NEXT: psrlq $2, %xmm0
1646 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1647 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
1648 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1649 ; SSE41-NEXT: psrad $31, %xmm2
1650 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1651 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1652 ; SSE41-NEXT: psrlq $60, %xmm3
1653 ; SSE41-NEXT: psrlq $61, %xmm2
1654 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1655 ; SSE41-NEXT: paddq %xmm1, %xmm2
1656 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1657 ; SSE41-NEXT: psrlq $4, %xmm1
1658 ; SSE41-NEXT: psrlq $3, %xmm2
1659 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1660 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1661 ; SSE41-NEXT: pxor %xmm1, %xmm2
1662 ; SSE41-NEXT: psubq %xmm1, %xmm2
1663 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1666 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1668 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1669 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1670 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
1671 ; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4
1672 ; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3
1673 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1674 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
1675 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
1676 ; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1
1677 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1678 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
1679 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
1680 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
1681 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
1682 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1683 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1684 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm3
1685 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1686 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1687 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1688 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1691 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1693 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1694 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
1695 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1696 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1697 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1698 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1699 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
1700 ; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1
1701 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1704 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1706 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1707 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,2,3,4>
1708 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2
1709 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %ymm2, %ymm2
1710 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
1711 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
1712 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1713 ; AVX512F-NEXT: retq
1715 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1716 ; AVX512BW: # %bb.0:
1717 ; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1
1718 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1719 ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1720 ; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %ymm1, %ymm1
1721 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1722 ; AVX512BW-NEXT: retq
1724 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1726 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
1727 ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2
1728 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2
1729 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1730 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm2, %xmm2
1731 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
1732 ; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1
1733 ; XOP-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
1734 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
1735 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm1, %xmm1
1736 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1737 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1739 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
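; the same <1, 4, 8, 16> power-of-2 divisors repeated across <8 x i64>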
1743 define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
1744 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1746 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1747 ; SSE2-NEXT: psrad $31, %xmm4
1748 ; SSE2-NEXT: psrlq $62, %xmm4
1749 ; SSE2-NEXT: paddq %xmm0, %xmm4
1750 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1751 ; SSE2-NEXT: psrad $2, %xmm5
1752 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1753 ; SSE2-NEXT: psrlq $2, %xmm4
1754 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1755 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1756 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
1757 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1758 ; SSE2-NEXT: psrad $31, %xmm4
1759 ; SSE2-NEXT: psrlq $62, %xmm4
1760 ; SSE2-NEXT: paddq %xmm2, %xmm4
1761 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1762 ; SSE2-NEXT: psrad $2, %xmm5
1763 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1764 ; SSE2-NEXT: psrlq $2, %xmm4
1765 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1766 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1767 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
1768 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1769 ; SSE2-NEXT: psrad $31, %xmm4
1770 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1771 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1772 ; SSE2-NEXT: psrlq $61, %xmm5
1773 ; SSE2-NEXT: psrlq $60, %xmm4
1774 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
1775 ; SSE2-NEXT: paddq %xmm1, %xmm4
1776 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1777 ; SSE2-NEXT: psrlq $3, %xmm1
1778 ; SSE2-NEXT: psrlq $4, %xmm4
1779 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
1780 ; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1781 ; SSE2-NEXT: xorpd %xmm1, %xmm4
1782 ; SSE2-NEXT: psubq %xmm1, %xmm4
1783 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1784 ; SSE2-NEXT: psrad $31, %xmm5
1785 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1786 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1787 ; SSE2-NEXT: psrlq $61, %xmm6
1788 ; SSE2-NEXT: psrlq $60, %xmm5
1789 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
1790 ; SSE2-NEXT: paddq %xmm3, %xmm5
1791 ; SSE2-NEXT: movdqa %xmm5, %xmm3
1792 ; SSE2-NEXT: psrlq $3, %xmm3
1793 ; SSE2-NEXT: psrlq $4, %xmm5
1794 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1795 ; SSE2-NEXT: xorpd %xmm1, %xmm5
1796 ; SSE2-NEXT: psubq %xmm1, %xmm5
1797 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1798 ; SSE2-NEXT: movdqa %xmm5, %xmm3
1801 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1803 ; SSE41-NEXT: movdqa %xmm2, %xmm5
1804 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1805 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1806 ; SSE41-NEXT: psrad $31, %xmm0
1807 ; SSE41-NEXT: psrlq $62, %xmm0
1808 ; SSE41-NEXT: paddq %xmm1, %xmm0
1809 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1810 ; SSE41-NEXT: psrad $2, %xmm2
1811 ; SSE41-NEXT: psrlq $2, %xmm0
1812 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1813 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1814 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1815 ; SSE41-NEXT: psrad $31, %xmm2
1816 ; SSE41-NEXT: psrlq $62, %xmm2
1817 ; SSE41-NEXT: paddq %xmm5, %xmm2
1818 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1819 ; SSE41-NEXT: psrad $2, %xmm1
1820 ; SSE41-NEXT: psrlq $2, %xmm2
1821 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1822 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
1823 ; SSE41-NEXT: movdqa %xmm4, %xmm1
1824 ; SSE41-NEXT: psrad $31, %xmm1
1825 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1826 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1827 ; SSE41-NEXT: psrlq $60, %xmm5
1828 ; SSE41-NEXT: psrlq $61, %xmm1
1829 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
1830 ; SSE41-NEXT: paddq %xmm4, %xmm1
1831 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1832 ; SSE41-NEXT: psrlq $4, %xmm4
1833 ; SSE41-NEXT: psrlq $3, %xmm1
1834 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1835 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
1836 ; SSE41-NEXT: pxor %xmm5, %xmm1
1837 ; SSE41-NEXT: psubq %xmm5, %xmm1
1838 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1839 ; SSE41-NEXT: psrad $31, %xmm4
1840 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1841 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1842 ; SSE41-NEXT: psrlq $60, %xmm6
1843 ; SSE41-NEXT: psrlq $61, %xmm4
1844 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
1845 ; SSE41-NEXT: paddq %xmm3, %xmm4
1846 ; SSE41-NEXT: movdqa %xmm4, %xmm3
1847 ; SSE41-NEXT: psrlq $4, %xmm3
1848 ; SSE41-NEXT: psrlq $3, %xmm4
1849 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1850 ; SSE41-NEXT: pxor %xmm5, %xmm4
1851 ; SSE41-NEXT: psubq %xmm5, %xmm4
1852 ; SSE41-NEXT: movdqa %xmm4, %xmm3
1855 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1857 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1858 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1859 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
1860 ; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5
1861 ; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4
1862 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1863 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
1864 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4
1865 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1866 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1867 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1868 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1869 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1870 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
1871 ; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5
1872 ; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5
1873 ; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6
1874 ; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5
1875 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1876 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1877 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1878 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1879 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5
1880 ; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6
1881 ; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5
1882 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1883 ; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
1884 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5
1885 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1886 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1887 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1888 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1889 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
1890 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1891 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2
1892 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1893 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1894 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1895 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1896 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1899 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1901 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1902 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1903 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,62,61,60>
1904 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3
1905 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3
1906 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,2,3,4>
1907 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3
1908 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1909 ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3
1910 ; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3
1911 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1912 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
1913 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2
1914 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2
1915 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2
1916 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
1917 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1918 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1921 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1923 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1924 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1925 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1926 ; AVX512F-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1
1927 ; AVX512F-NEXT: movb $17, %al
1928 ; AVX512F-NEXT: kmovw %eax, %k1
1929 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1930 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1931 ; AVX512F-NEXT: retq
1933 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1934 ; AVX512BW: # %bb.0:
1935 ; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1
1936 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1937 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1938 ; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1
1939 ; AVX512BW-NEXT: movb $17, %al
1940 ; AVX512BW-NEXT: kmovd %eax, %k1
1941 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1942 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1943 ; AVX512BW-NEXT: retq
1945 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1947 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1948 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
1949 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4
1950 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556]
1951 ; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4
1952 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1953 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
1954 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1955 ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6
1956 ; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6
1957 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6
1958 ; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = <u,18446744073709551614>
1959 ; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6
1960 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
1961 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
1962 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1963 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6
1964 ; XOP-NEXT: vpshlq %xmm5, %xmm6, %xmm5
1965 ; XOP-NEXT: vpaddq %xmm5, %xmm2, %xmm2
1966 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1967 ; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3
1968 ; XOP-NEXT: vpsrlq $62, %xmm3, %xmm3
1969 ; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3
1970 ; XOP-NEXT: vpshaq %xmm7, %xmm3, %xmm3
1971 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1972 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1974 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
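; mixed-sign power-of-2 divisors <1, -4, 8, -16>: shift lowering followed by negating the negative-divisor lanes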
1978 define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
1979 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
1981 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1982 ; SSE2-NEXT: psrad $31, %xmm0
1983 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1984 ; SSE2-NEXT: psrld $28, %xmm2
1985 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1986 ; SSE2-NEXT: psrld $29, %xmm3
1987 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1988 ; SSE2-NEXT: psrld $30, %xmm0
1989 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
1990 ; SSE2-NEXT: paddd %xmm1, %xmm0
1991 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1992 ; SSE2-NEXT: psrad $4, %xmm2
1993 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1994 ; SSE2-NEXT: psrad $3, %xmm3
1995 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1996 ; SSE2-NEXT: psrad $2, %xmm0
1997 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
1998 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1999 ; SSE2-NEXT: pxor %xmm1, %xmm1
2000 ; SSE2-NEXT: psubd %xmm0, %xmm1
2001 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
2002 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
2003 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2006 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2008 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2009 ; SSE41-NEXT: psrad $31, %xmm1
2010 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2011 ; SSE41-NEXT: psrld $28, %xmm2
2012 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2013 ; SSE41-NEXT: psrld $30, %xmm3
2014 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2015 ; SSE41-NEXT: psrld $29, %xmm1
2016 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2017 ; SSE41-NEXT: paddd %xmm0, %xmm1
2018 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2019 ; SSE41-NEXT: psrad $4, %xmm2
2020 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2021 ; SSE41-NEXT: psrad $2, %xmm3
2022 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2023 ; SSE41-NEXT: psrad $3, %xmm1
2024 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2025 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2026 ; SSE41-NEXT: pxor %xmm0, %xmm0
2027 ; SSE41-NEXT: psubd %xmm1, %xmm0
2028 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
2029 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2032 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2034 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
2035 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2036 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
2037 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2038 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
2039 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2040 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2041 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
2042 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
2043 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2044 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
2045 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2046 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2047 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2048 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2049 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
2052 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2053 ; AVX2ORLATER: # %bb.0:
2054 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
2055 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2056 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2057 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
2058 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2059 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2060 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2061 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
2062 ; AVX2ORLATER-NEXT: retq
2064 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2066 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
2067 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2068 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2069 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
2070 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2071 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2072 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2073 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
2075 %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
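; negative power-of-2 divisors with undef elements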
2079 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
2080 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
2083 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
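; positive power-of-2 divisors with undef elements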
2087 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
2088 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
2091 %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
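; mixed-sign power-of-2 divisors with undef elements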
2095 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
2096 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
2099 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
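; divisors are only 1 and -1: negate the -1 lanes and blend with the original vector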
2104 define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
2105 ; SSE2-LABEL: non_splat_minus_one_divisor_0:
2107 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2108 ; SSE2-NEXT: pxor %xmm1, %xmm0
2109 ; SSE2-NEXT: psubb %xmm0, %xmm1
2110 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2113 ; SSE41-LABEL: non_splat_minus_one_divisor_0:
2115 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2116 ; SSE41-NEXT: pxor %xmm2, %xmm2
2117 ; SSE41-NEXT: psubb %xmm0, %xmm2
2118 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2119 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
2120 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2123 ; AVX1-LABEL: non_splat_minus_one_divisor_0:
2125 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2126 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2127 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2128 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2131 ; AVX2-LABEL: non_splat_minus_one_divisor_0:
2133 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2134 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2135 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2136 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2139 ; AVX512F-LABEL: non_splat_minus_one_divisor_0:
2141 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2142 ; AVX512F-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2143 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2144 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2145 ; AVX512F-NEXT: retq
2147 ; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
2148 ; AVX512BW: # %bb.0:
2149 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2150 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2151 ; AVX512BW-NEXT: kmovd %eax, %k1
2152 ; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1}
2153 ; AVX512BW-NEXT: retq
2155 ; XOP-LABEL: non_splat_minus_one_divisor_0:
2157 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2158 ; XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2159 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2160 ; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2162 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
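; divisors mix -1, 2 and -128: power-of-2 shift lowering plus a final negate/blend for the negative-divisor lanes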
2166 define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
2167 ; SSE2-LABEL: non_splat_minus_one_divisor_1:
2169 ; SSE2-NEXT: pxor %xmm1, %xmm1
2170 ; SSE2-NEXT: pxor %xmm2, %xmm2
2171 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
2172 ; SSE2-NEXT: movdqa %xmm2, %xmm3
2173 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2174 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
2175 ; SSE2-NEXT: psrlw $8, %xmm3
2176 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2177 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
2178 ; SSE2-NEXT: psrlw $8, %xmm2
2179 ; SSE2-NEXT: packuswb %xmm3, %xmm2
2180 ; SSE2-NEXT: paddb %xmm0, %xmm2
2181 ; SSE2-NEXT: movdqa %xmm2, %xmm1
2182 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2183 ; SSE2-NEXT: psraw $8, %xmm1
2184 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
2185 ; SSE2-NEXT: psrlw $8, %xmm1
2186 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2187 ; SSE2-NEXT: psraw $8, %xmm2
2188 ; SSE2-NEXT: psllw $7, %xmm2
2189 ; SSE2-NEXT: psrlw $8, %xmm2
2190 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2191 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2192 ; SSE2-NEXT: pand %xmm1, %xmm2
2193 ; SSE2-NEXT: pandn %xmm0, %xmm1
2194 ; SSE2-NEXT: por %xmm2, %xmm1
2195 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2196 ; SSE2-NEXT: pxor %xmm0, %xmm1
2197 ; SSE2-NEXT: psubb %xmm0, %xmm1
2198 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2201 ; SSE41-LABEL: non_splat_minus_one_divisor_1:
2203 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2204 ; SSE41-NEXT: pxor %xmm2, %xmm2
2205 ; SSE41-NEXT: pxor %xmm0, %xmm0
2206 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm0
2207 ; SSE41-NEXT: pxor %xmm4, %xmm4
2208 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2209 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2210 ; SSE41-NEXT: psllw $1, %xmm3
2211 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4,5],xmm3[6],xmm4[7]
2212 ; SSE41-NEXT: psrlw $8, %xmm3
2213 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2214 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
2215 ; SSE41-NEXT: psrlw $8, %xmm0
2216 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2217 ; SSE41-NEXT: paddb %xmm1, %xmm3
2218 ; SSE41-NEXT: movdqa %xmm3, %xmm0
2219 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2220 ; SSE41-NEXT: psraw $8, %xmm0
2221 ; SSE41-NEXT: movdqa %xmm0, %xmm4
2222 ; SSE41-NEXT: psllw $1, %xmm4
2223 ; SSE41-NEXT: psllw $7, %xmm0
2224 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5],xmm0[6],xmm4[7]
2225 ; SSE41-NEXT: psrlw $8, %xmm0
2226 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2227 ; SSE41-NEXT: psraw $8, %xmm3
2228 ; SSE41-NEXT: psllw $7, %xmm3
2229 ; SSE41-NEXT: psrlw $8, %xmm3
2230 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2231 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2232 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
2233 ; SSE41-NEXT: psubb %xmm1, %xmm2
2234 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2235 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
2236 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2239 ; AVX1-LABEL: non_splat_minus_one_divisor_1:
2241 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2242 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2243 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2244 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2245 ; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
2246 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2247 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2248 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2249 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2250 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2251 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2252 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2253 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2254 ; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
2255 ; AVX1-NEXT: vpsllw $1, %xmm3, %xmm4
2256 ; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
2257 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5],xmm3[6],xmm4[7]
2258 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2259 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2260 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
2261 ; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
2262 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2263 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2264 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2265 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2266 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2267 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2268 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2271 ; AVX2-LABEL: non_splat_minus_one_divisor_1:
2273 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2274 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2275 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2276 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2277 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2278 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2279 ; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2280 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2281 ; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
2282 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2283 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2284 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2285 ; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2286 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2287 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2288 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2289 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2290 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2291 ; AVX2-NEXT: vzeroupper
2294 ; AVX512F-LABEL: non_splat_minus_one_divisor_1:
2296 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2297 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2298 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2299 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2300 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
2301 ; AVX512F-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2302 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
2303 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm2, %zmm2
2304 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
2305 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2306 ; AVX512F-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2307 ; AVX512F-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2308 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2309 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2310 ; AVX512F-NEXT: vzeroupper
2311 ; AVX512F-NEXT: retq
2313 ; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
2314 ; AVX512BW: # %bb.0:
2315 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2316 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2317 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2318 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2319 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2320 ; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2321 ; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2
2322 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm2, %ymm2
2323 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2324 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2325 ; AVX512BW-NEXT: kmovd %eax, %k1
2326 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
2327 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0
2328 ; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44
2329 ; AVX512BW-NEXT: kmovd %eax, %k1
2330 ; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1}
2331 ; AVX512BW-NEXT: vzeroupper
2332 ; AVX512BW-NEXT: retq
2334 ; XOP-LABEL: non_splat_minus_one_divisor_1:
2336 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2337 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2338 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm2, %xmm2
2339 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2340 ; XOP-NEXT: vpshab {{.*}}(%rip), %xmm2, %xmm2
2341 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2342 ; XOP-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2343 ; XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2344 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2345 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2347 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
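; divisors <-1, 1, 2, -2>: a single shift sequence handles the divide-by-2 lanes, then the negative-divisor lanes are negated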
2351 define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
2352 ; SSE2-LABEL: non_splat_minus_one_divisor_2:
2354 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2355 ; SSE2-NEXT: psrld $31, %xmm1
2356 ; SSE2-NEXT: paddd %xmm0, %xmm1
2357 ; SSE2-NEXT: psrad $1, %xmm1
2358 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2359 ; SSE2-NEXT: pxor %xmm0, %xmm0
2360 ; SSE2-NEXT: psubd %xmm1, %xmm0
2361 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2362 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2365 ; SSE41-LABEL: non_splat_minus_one_divisor_2:
2367 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2368 ; SSE41-NEXT: psrld $31, %xmm1
2369 ; SSE41-NEXT: paddd %xmm0, %xmm1
2370 ; SSE41-NEXT: psrad $1, %xmm1
2371 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2372 ; SSE41-NEXT: pxor %xmm0, %xmm0
2373 ; SSE41-NEXT: psubd %xmm1, %xmm0
2374 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
2375 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2378 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
2380 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
2381 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2382 ; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
2383 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2384 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2385 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2386 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2389 ; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
2390 ; AVX2ORLATER: # %bb.0:
2391 ; AVX2ORLATER-NEXT: vpsrld $31, %xmm0, %xmm1
2392 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2393 ; AVX2ORLATER-NEXT: vpsrad $1, %xmm1, %xmm1
2394 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2395 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2396 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2397 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
2398 ; AVX2ORLATER-NEXT: retq
2400 ; XOP-LABEL: non_splat_minus_one_divisor_2:
2402 ; XOP-NEXT: vpsrld $31, %xmm0, %xmm1
2403 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2404 ; XOP-NEXT: vpsrad $1, %xmm1, %xmm1
2405 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2406 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2407 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2408 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2410 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
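; divisors 3 and 22: one pmulhw with a constant vector plus a shift/add handles both halves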
2414 define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) {
2415 ; SSE-LABEL: combine_vec_sdiv_nonuniform:
2417 ; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
2418 ; SSE-NEXT: movdqa %xmm0, %xmm1
2419 ; SSE-NEXT: psrlw $15, %xmm1
2420 ; SSE-NEXT: paddw %xmm0, %xmm1
2421 ; SSE-NEXT: movdqa %xmm1, %xmm0
2424 ; AVX-LABEL: combine_vec_sdiv_nonuniform:
2426 ; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2427 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
2428 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2430 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22>
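; divisors 24 and 25: pmulhw followed by different arithmetic shift amounts for each half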
2434 define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
2435 ; SSE2-LABEL: combine_vec_sdiv_nonuniform2:
2437 ; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
2438 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2439 ; SSE2-NEXT: psraw $2, %xmm1
2440 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2441 ; SSE2-NEXT: psraw $1, %xmm2
2442 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
2443 ; SSE2-NEXT: psrlw $15, %xmm0
2444 ; SSE2-NEXT: paddw %xmm2, %xmm0
2447 ; SSE41-LABEL: combine_vec_sdiv_nonuniform2:
2449 ; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
2450 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2451 ; SSE41-NEXT: psraw $1, %xmm1
2452 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2453 ; SSE41-NEXT: psraw $2, %xmm2
2454 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2455 ; SSE41-NEXT: psrlw $15, %xmm0
2456 ; SSE41-NEXT: paddw %xmm2, %xmm0
2459 ; AVX1-LABEL: combine_vec_sdiv_nonuniform2:
2461 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2462 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
2463 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
2464 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2465 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2466 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2469 ; AVX2-LABEL: combine_vec_sdiv_nonuniform2:
2471 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2472 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1
2473 ; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2
2474 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2475 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2476 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2479 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform2:
2481 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2482 ; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1
2483 ; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2
2484 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2485 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2486 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2487 ; AVX512F-NEXT: retq
2489 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2:
2490 ; AVX512BW: # %bb.0:
2491 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2492 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2493 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2494 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2495 ; AVX512BW-NEXT: retq
2497 ; XOP-LABEL: combine_vec_sdiv_nonuniform2:
2499 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2500 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2501 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2502 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2504 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25>
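; divisors 23 and 511: the multiply-high result needs the dividend added back before shifting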
2508 define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
2509 ; SSE2-LABEL: combine_vec_sdiv_nonuniform3:
2511 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2512 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2513 ; SSE2-NEXT: paddw %xmm0, %xmm1
2514 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2515 ; SSE2-NEXT: psraw $4, %xmm0
2516 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2517 ; SSE2-NEXT: psraw $8, %xmm2
2518 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2519 ; SSE2-NEXT: psrlw $15, %xmm1
2520 ; SSE2-NEXT: paddw %xmm2, %xmm1
2521 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2524 ; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
2526 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2527 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2528 ; SSE41-NEXT: paddw %xmm0, %xmm1
2529 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2530 ; SSE41-NEXT: psraw $8, %xmm0
2531 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2532 ; SSE41-NEXT: psraw $4, %xmm2
2533 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2534 ; SSE41-NEXT: psrlw $15, %xmm1
2535 ; SSE41-NEXT: paddw %xmm2, %xmm1
2536 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2539 ; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
2541 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2542 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2543 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2544 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2545 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2546 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2547 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2550 ; AVX2-LABEL: combine_vec_sdiv_nonuniform3:
2552 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2553 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2554 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2555 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2556 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2557 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2558 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2561 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform3:
2563 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2564 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2565 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2566 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2567 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2568 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2569 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2570 ; AVX512F-NEXT: retq
2572 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3:
2573 ; AVX512BW: # %bb.0:
2574 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2575 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2576 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2577 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2578 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2579 ; AVX512BW-NEXT: retq
2581 ; XOP-LABEL: combine_vec_sdiv_nonuniform3:
2583 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2584 ; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2585 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2586 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2587 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2589 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511>
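; divisors -23 and -510: the multiply-high result needs the dividend subtracted before shifting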
2593 define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
2594 ; SSE2-LABEL: combine_vec_sdiv_nonuniform4:
2596 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2597 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2598 ; SSE2-NEXT: psubw %xmm0, %xmm1
2599 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2600 ; SSE2-NEXT: psraw $4, %xmm0
2601 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2602 ; SSE2-NEXT: psraw $8, %xmm2
2603 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2604 ; SSE2-NEXT: psrlw $15, %xmm1
2605 ; SSE2-NEXT: paddw %xmm2, %xmm1
2606 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2609 ; SSE41-LABEL: combine_vec_sdiv_nonuniform4:
2611 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2612 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2613 ; SSE41-NEXT: psubw %xmm0, %xmm1
2614 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2615 ; SSE41-NEXT: psraw $8, %xmm0
2616 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2617 ; SSE41-NEXT: psraw $4, %xmm2
2618 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2619 ; SSE41-NEXT: psrlw $15, %xmm1
2620 ; SSE41-NEXT: paddw %xmm2, %xmm1
2621 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2624 ; AVX1-LABEL: combine_vec_sdiv_nonuniform4:
2626 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2627 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2628 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2629 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2630 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2631 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2632 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2635 ; AVX2-LABEL: combine_vec_sdiv_nonuniform4:
2637 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2638 ; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2639 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2640 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2641 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2642 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2643 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2646 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform4:
2648 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2649 ; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2650 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2651 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2652 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2653 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2654 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2655 ; AVX512F-NEXT: retq
2657 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4:
2658 ; AVX512BW: # %bb.0:
2659 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2660 ; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2661 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2662 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2663 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2664 ; AVX512BW-NEXT: retq
2666 ; XOP-LABEL: combine_vec_sdiv_nonuniform4:
2668 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2669 ; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2670 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2671 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2672 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2674 %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510>
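; eight distinct divisors: the per-lane add/subtract of the dividend is folded into a pmullw by -1/0/+1 before the shifts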
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256>
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: paddw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
  ret <8 x i16> %1
}

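; Divisors cover the i16 extremes (-32768, -1, +1, 32767) alongside ordinary
; constants, so some lanes take the multiply-high path while others pass through.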
define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8>
; SSE41-NEXT: pmulhw %xmm0, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
  ret <8 x i16> %1
}

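; All divisors are +1 or -1, so the division reduces to negating the low four
; lanes and blending them with the untouched high lanes.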
define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %1
}

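; PR38658: only the final i8 lane has a non-trivial divisor (7); every other
; lane divides by 1.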
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38658:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $6, %xmm2
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: pr38658:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pr38658:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: pr38658:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pr38658:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; XOP-LABEL: pr38658:
; XOP: # %bb.0:
; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; XOP-NEXT: vpmovsxbw %xmm1, %xmm1
; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
  ret <16 x i8> %1
}

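; fold i1 sdiv: any well-defined i1 divisor is all-ones, so the quotient is the numerator.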
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %r = sdiv i1 %x, %y
  ret i1 %r
}

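; vector form of the bool sdiv fold.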
define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %r = sdiv <4 x i1> %x, %y
  ret <4 x i1> %r
}