; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP
; fold (sdiv x, 1) -> x
define i32 @combine_sdiv_by_one(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_one:
; CHECK-NEXT: movl %edi, %eax
define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_one:
%1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
; fold (sdiv x, -1) -> 0 - x
define i32 @combine_sdiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_negone:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: negl %eax
define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX-LABEL: combine_vec_sdiv_by_negone:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
%1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
define i32 @combine_sdiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_minsigned:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: negl %edi
; CHECK-NEXT: seto %al
%1 = sdiv i32 %x, -2147483648
define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_minsigned:
; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: psrld $31, %xmm0
; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512BW-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1]
; XOP-LABEL: combine_vec_sdiv_by_minsigned:
; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrld $31, %xmm0, %xmm0
%1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
; fold (sdiv 0, x) -> 0
define i32 @combine_sdiv_zero(i32 %x) {
; CHECK-LABEL: combine_sdiv_zero:
; CHECK-NEXT: xorl %eax, %eax
define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_zero:
; SSE-NEXT: xorps %xmm0, %xmm0
; AVX-LABEL: combine_vec_sdiv_zero:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
%1 = sdiv <4 x i32> zeroinitializer, %x
; fold (sdiv x, x) -> 1
define i32 @combine_sdiv_dupe(i32 %x) {
; CHECK-LABEL: combine_sdiv_dupe:
; CHECK-NEXT: movl $1, %eax
define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_dupe:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; AVX-LABEL: combine_vec_sdiv_dupe:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
%1 = sdiv <4 x i32> %x, %x
; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX1-LABEL: combine_vec_sdiv_by_pos0:
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-LABEL: combine_vec_sdiv_by_pos0:
; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-LABEL: combine_vec_sdiv_by_pos0:
; AVX512F-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512BW-LABEL: combine_vec_sdiv_by_pos0:
; AVX512BW-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512BW-NEXT: retq
; XOP-LABEL: combine_vec_sdiv_by_pos0:
; XOP-NEXT: vpsrld $2, %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
%1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
%2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pos1:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $3, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE41-LABEL: combine_vec_sdiv_by_pos1:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $4, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX2-LABEL: combine_vec_sdiv_by_pos1:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-LABEL: combine_vec_sdiv_by_pos1:
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-LABEL: combine_vec_sdiv_by_pos1:
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
; XOP-LABEL: combine_vec_sdiv_by_pos1:
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
%1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
%2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
; fold (sdiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: psrad $2, %xmm0
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
%1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
%1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
317 define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
318 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
320 ; SSE2-NEXT: pxor %xmm1, %xmm1
321 ; SSE2-NEXT: pxor %xmm2, %xmm2
322 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
323 ; SSE2-NEXT: movdqa %xmm2, %xmm3
324 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
325 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
326 ; SSE2-NEXT: pmullw %xmm4, %xmm3
327 ; SSE2-NEXT: psrlw $8, %xmm3
328 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
329 ; SSE2-NEXT: pmullw %xmm4, %xmm2
330 ; SSE2-NEXT: psrlw $8, %xmm2
331 ; SSE2-NEXT: packuswb %xmm3, %xmm2
332 ; SSE2-NEXT: paddb %xmm0, %xmm2
333 ; SSE2-NEXT: movdqa %xmm2, %xmm1
334 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
335 ; SSE2-NEXT: psraw $8, %xmm1
336 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
337 ; SSE2-NEXT: pmullw %xmm3, %xmm1
338 ; SSE2-NEXT: psrlw $8, %xmm1
339 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
340 ; SSE2-NEXT: psraw $8, %xmm2
341 ; SSE2-NEXT: pmullw %xmm3, %xmm2
342 ; SSE2-NEXT: psrlw $8, %xmm2
343 ; SSE2-NEXT: packuswb %xmm1, %xmm2
344 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
345 ; SSE2-NEXT: pand %xmm1, %xmm2
346 ; SSE2-NEXT: pandn %xmm0, %xmm1
347 ; SSE2-NEXT: por %xmm2, %xmm1
348 ; SSE2-NEXT: movdqa %xmm1, %xmm0
351 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
353 ; SSE41-NEXT: movdqa %xmm0, %xmm1
354 ; SSE41-NEXT: pxor %xmm0, %xmm0
355 ; SSE41-NEXT: pxor %xmm3, %xmm3
356 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
357 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
358 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
359 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
360 ; SSE41-NEXT: pmullw %xmm0, %xmm3
361 ; SSE41-NEXT: psrlw $8, %xmm3
362 ; SSE41-NEXT: pmullw %xmm0, %xmm2
363 ; SSE41-NEXT: psrlw $8, %xmm2
364 ; SSE41-NEXT: packuswb %xmm3, %xmm2
365 ; SSE41-NEXT: paddb %xmm1, %xmm2
366 ; SSE41-NEXT: movdqa %xmm2, %xmm0
367 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
368 ; SSE41-NEXT: psraw $8, %xmm0
369 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
370 ; SSE41-NEXT: pmullw %xmm3, %xmm0
371 ; SSE41-NEXT: psrlw $8, %xmm0
372 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
373 ; SSE41-NEXT: psraw $8, %xmm2
374 ; SSE41-NEXT: pmullw %xmm3, %xmm2
375 ; SSE41-NEXT: psrlw $8, %xmm2
376 ; SSE41-NEXT: packuswb %xmm0, %xmm2
377 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
378 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
379 ; SSE41-NEXT: movdqa %xmm1, %xmm0
382 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
384 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
385 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
386 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
387 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2]
388 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
389 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
390 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
391 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
392 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
393 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
394 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
395 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
396 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
397 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
398 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
399 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
400 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
401 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
402 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
403 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
404 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
405 ; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
406 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
409 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
411 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
412 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
413 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
414 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,4,2,16,8,32,64,2,256,4,2,16,8,32,64,2]
415 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
416 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
417 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
418 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
419 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
420 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,64,128,16,32,8,4,128,256,64,128,16,32,8,4,128]
421 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
422 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
423 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
424 ; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
425 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
426 ; AVX2-NEXT: vzeroupper
429 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
431 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
432 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
433 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
434 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
435 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
436 ; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
437 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
438 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
439 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
440 ; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
441 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
442 ; AVX512F-NEXT: vzeroupper
445 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
447 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
448 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
449 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
450 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
451 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
452 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1
453 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
454 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
455 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
456 ; AVX512BW-NEXT: movw $257, %ax # imm = 0x101
457 ; AVX512BW-NEXT: kmovd %eax, %k1
458 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
459 ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
460 ; AVX512BW-NEXT: vzeroupper
461 ; AVX512BW-NEXT: retq
463 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
465 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
466 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
467 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
468 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
469 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
470 ; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
471 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
473 %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
477 define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
478 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
480 ; SSE2-NEXT: movdqa %xmm0, %xmm1
481 ; SSE2-NEXT: psraw $15, %xmm1
482 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,4,2,16,8,32,64,2]
483 ; SSE2-NEXT: paddw %xmm0, %xmm1
484 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
485 ; SSE2-NEXT: movdqa %xmm1, %xmm3
486 ; SSE2-NEXT: pand %xmm2, %xmm3
487 ; SSE2-NEXT: psraw $4, %xmm1
488 ; SSE2-NEXT: pandn %xmm1, %xmm2
489 ; SSE2-NEXT: por %xmm3, %xmm2
490 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
491 ; SSE2-NEXT: movdqa %xmm2, %xmm3
492 ; SSE2-NEXT: pand %xmm1, %xmm3
493 ; SSE2-NEXT: psraw $2, %xmm2
494 ; SSE2-NEXT: pandn %xmm2, %xmm1
495 ; SSE2-NEXT: por %xmm3, %xmm1
496 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
497 ; SSE2-NEXT: movdqa %xmm1, %xmm3
498 ; SSE2-NEXT: pand %xmm2, %xmm3
499 ; SSE2-NEXT: psraw $1, %xmm1
500 ; SSE2-NEXT: pandn %xmm1, %xmm2
501 ; SSE2-NEXT: por %xmm3, %xmm2
502 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
503 ; SSE2-NEXT: pand %xmm1, %xmm2
504 ; SSE2-NEXT: pandn %xmm0, %xmm1
505 ; SSE2-NEXT: por %xmm2, %xmm1
506 ; SSE2-NEXT: movdqa %xmm1, %xmm0
509 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
511 ; SSE41-NEXT: movdqa %xmm0, %xmm1
512 ; SSE41-NEXT: psraw $15, %xmm1
513 ; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,4,2,16,8,32,64,2]
514 ; SSE41-NEXT: paddw %xmm0, %xmm1
515 ; SSE41-NEXT: movdqa %xmm1, %xmm2
516 ; SSE41-NEXT: psraw $1, %xmm2
517 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,16384,u,4096,8192,2048,1024,u]
518 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
519 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
522 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
524 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
525 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2]
526 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
527 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
528 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,16384,u,4096,8192,2048,1024,u]
529 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
530 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
533 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
535 ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
536 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2]
537 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
538 ; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2
539 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,16384,u,4096,8192,2048,1024,u]
540 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
541 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
544 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
546 ; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1
547 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2]
548 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1
549 ; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
550 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
551 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
552 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
553 ; AVX512F-NEXT: vzeroupper
556 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
558 ; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1
559 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
560 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1
561 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
562 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
563 ; AVX512BW-NEXT: retq
565 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
567 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm1
568 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
569 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1
570 ; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
571 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
573 %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
577 define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
578 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
580 ; SSE2-NEXT: movdqa %xmm0, %xmm3
581 ; SSE2-NEXT: psraw $15, %xmm0
582 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [u,4,2,16,8,32,64,2]
583 ; SSE2-NEXT: pmulhuw %xmm7, %xmm0
584 ; SSE2-NEXT: paddw %xmm3, %xmm0
585 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
586 ; SSE2-NEXT: movdqa %xmm0, %xmm2
587 ; SSE2-NEXT: pand %xmm4, %xmm2
588 ; SSE2-NEXT: psraw $4, %xmm0
589 ; SSE2-NEXT: movdqa %xmm4, %xmm6
590 ; SSE2-NEXT: pandn %xmm0, %xmm6
591 ; SSE2-NEXT: por %xmm2, %xmm6
592 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
593 ; SSE2-NEXT: movdqa %xmm6, %xmm0
594 ; SSE2-NEXT: pand %xmm5, %xmm0
595 ; SSE2-NEXT: psraw $2, %xmm6
596 ; SSE2-NEXT: movdqa %xmm5, %xmm2
597 ; SSE2-NEXT: pandn %xmm6, %xmm2
598 ; SSE2-NEXT: por %xmm0, %xmm2
599 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,0]
600 ; SSE2-NEXT: movdqa %xmm2, %xmm0
601 ; SSE2-NEXT: pand %xmm6, %xmm0
602 ; SSE2-NEXT: psraw $1, %xmm2
603 ; SSE2-NEXT: movdqa %xmm6, %xmm8
604 ; SSE2-NEXT: pandn %xmm2, %xmm8
605 ; SSE2-NEXT: por %xmm0, %xmm8
606 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
607 ; SSE2-NEXT: pand %xmm2, %xmm8
608 ; SSE2-NEXT: movdqa %xmm2, %xmm0
609 ; SSE2-NEXT: pandn %xmm3, %xmm0
610 ; SSE2-NEXT: por %xmm8, %xmm0
611 ; SSE2-NEXT: movdqa %xmm1, %xmm3
612 ; SSE2-NEXT: psraw $15, %xmm3
613 ; SSE2-NEXT: pmulhuw %xmm7, %xmm3
614 ; SSE2-NEXT: paddw %xmm1, %xmm3
615 ; SSE2-NEXT: movdqa %xmm3, %xmm7
616 ; SSE2-NEXT: pand %xmm4, %xmm7
617 ; SSE2-NEXT: psraw $4, %xmm3
618 ; SSE2-NEXT: pandn %xmm3, %xmm4
619 ; SSE2-NEXT: por %xmm7, %xmm4
620 ; SSE2-NEXT: movdqa %xmm4, %xmm3
621 ; SSE2-NEXT: pand %xmm5, %xmm3
622 ; SSE2-NEXT: psraw $2, %xmm4
623 ; SSE2-NEXT: pandn %xmm4, %xmm5
624 ; SSE2-NEXT: por %xmm3, %xmm5
625 ; SSE2-NEXT: movdqa %xmm5, %xmm3
626 ; SSE2-NEXT: pand %xmm6, %xmm3
627 ; SSE2-NEXT: psraw $1, %xmm5
628 ; SSE2-NEXT: pandn %xmm5, %xmm6
629 ; SSE2-NEXT: por %xmm3, %xmm6
630 ; SSE2-NEXT: pand %xmm2, %xmm6
631 ; SSE2-NEXT: pandn %xmm1, %xmm2
632 ; SSE2-NEXT: por %xmm6, %xmm2
633 ; SSE2-NEXT: movdqa %xmm2, %xmm1
636 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
638 ; SSE41-NEXT: movdqa %xmm0, %xmm2
639 ; SSE41-NEXT: psraw $15, %xmm2
640 ; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2]
641 ; SSE41-NEXT: pmulhuw %xmm3, %xmm2
642 ; SSE41-NEXT: paddw %xmm0, %xmm2
643 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768]
644 ; SSE41-NEXT: movdqa %xmm2, %xmm5
645 ; SSE41-NEXT: pmulhw %xmm4, %xmm5
646 ; SSE41-NEXT: psraw $1, %xmm2
647 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
648 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
649 ; SSE41-NEXT: movdqa %xmm1, %xmm2
650 ; SSE41-NEXT: psraw $15, %xmm2
651 ; SSE41-NEXT: pmulhuw %xmm3, %xmm2
652 ; SSE41-NEXT: paddw %xmm1, %xmm2
653 ; SSE41-NEXT: pmulhw %xmm2, %xmm4
654 ; SSE41-NEXT: psraw $1, %xmm2
655 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6],xmm2[7]
656 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
659 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
661 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
662 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
663 ; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2]
664 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
665 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
666 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,16384,32768,4096,8192,2048,1024,32768]
667 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4
668 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
669 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
670 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4
671 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3
672 ; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3
673 ; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2
674 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
675 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
676 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
677 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
678 ; AVX1-NEXT: # ymm2 = mem[0,1,0,1]
679 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
680 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
681 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
684 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
686 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
687 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,4,2,16,8,32,64,2,u,4,2,16,8,32,64,2]
688 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
689 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2
690 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,16384,u,4096,8192,2048,1024,u,u,16384,u,4096,8192,2048,1024,u]
691 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
692 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
695 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
697 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
698 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,4,2,16,8,32,64,2,u,4,2,16,8,32,64,2]
699 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
700 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
701 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
702 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
703 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
706 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
708 ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1
709 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
710 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1
711 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
712 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
713 ; AVX512BW-NEXT: retq
715 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
717 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
718 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2
719 ; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65522,65521,65524,65523,65525,65526,65521]
720 ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2
721 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1
722 ; XOP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,65534,65535,65532,65533,65531,65530,65535]
723 ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1
724 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4
725 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3
726 ; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3
727 ; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2
728 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
729 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
730 ; XOP-NEXT: # ymm2 = mem[0,1,0,1]
731 ; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
733 %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
737 define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
738 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
740 ; SSE2-NEXT: movdqa %xmm1, %xmm5
741 ; SSE2-NEXT: movdqa %xmm0, %xmm1
742 ; SSE2-NEXT: psraw $15, %xmm0
743 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [u,4,2,16,8,32,64,2]
744 ; SSE2-NEXT: pmulhuw %xmm9, %xmm0
745 ; SSE2-NEXT: paddw %xmm1, %xmm0
746 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,0,0,65535]
747 ; SSE2-NEXT: movdqa %xmm0, %xmm4
748 ; SSE2-NEXT: pand %xmm6, %xmm4
749 ; SSE2-NEXT: psraw $4, %xmm0
750 ; SSE2-NEXT: movdqa %xmm6, %xmm8
751 ; SSE2-NEXT: pandn %xmm0, %xmm8
752 ; SSE2-NEXT: por %xmm4, %xmm8
753 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
754 ; SSE2-NEXT: movdqa %xmm8, %xmm0
755 ; SSE2-NEXT: pand %xmm7, %xmm0
756 ; SSE2-NEXT: psraw $2, %xmm8
757 ; SSE2-NEXT: movdqa %xmm7, %xmm4
758 ; SSE2-NEXT: pandn %xmm8, %xmm4
759 ; SSE2-NEXT: por %xmm0, %xmm4
760 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,0,65535,0]
761 ; SSE2-NEXT: movdqa %xmm4, %xmm0
762 ; SSE2-NEXT: pand %xmm8, %xmm0
763 ; SSE2-NEXT: psraw $1, %xmm4
764 ; SSE2-NEXT: movdqa %xmm8, %xmm10
765 ; SSE2-NEXT: pandn %xmm4, %xmm10
766 ; SSE2-NEXT: por %xmm0, %xmm10
767 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,65535]
768 ; SSE2-NEXT: pand %xmm4, %xmm10
769 ; SSE2-NEXT: movdqa %xmm4, %xmm0
770 ; SSE2-NEXT: pandn %xmm1, %xmm0
771 ; SSE2-NEXT: por %xmm10, %xmm0
772 ; SSE2-NEXT: movdqa %xmm5, %xmm1
773 ; SSE2-NEXT: psraw $15, %xmm1
774 ; SSE2-NEXT: pmulhuw %xmm9, %xmm1
775 ; SSE2-NEXT: paddw %xmm5, %xmm1
776 ; SSE2-NEXT: movdqa %xmm1, %xmm10
777 ; SSE2-NEXT: pand %xmm6, %xmm10
778 ; SSE2-NEXT: psraw $4, %xmm1
779 ; SSE2-NEXT: movdqa %xmm6, %xmm11
780 ; SSE2-NEXT: pandn %xmm1, %xmm11
781 ; SSE2-NEXT: por %xmm10, %xmm11
782 ; SSE2-NEXT: movdqa %xmm11, %xmm1
783 ; SSE2-NEXT: pand %xmm7, %xmm1
784 ; SSE2-NEXT: psraw $2, %xmm11
785 ; SSE2-NEXT: movdqa %xmm7, %xmm10
786 ; SSE2-NEXT: pandn %xmm11, %xmm10
787 ; SSE2-NEXT: por %xmm1, %xmm10
788 ; SSE2-NEXT: movdqa %xmm10, %xmm1
789 ; SSE2-NEXT: pand %xmm8, %xmm1
790 ; SSE2-NEXT: psraw $1, %xmm10
791 ; SSE2-NEXT: movdqa %xmm8, %xmm11
792 ; SSE2-NEXT: pandn %xmm10, %xmm11
793 ; SSE2-NEXT: por %xmm1, %xmm11
794 ; SSE2-NEXT: pand %xmm4, %xmm11
795 ; SSE2-NEXT: movdqa %xmm4, %xmm1
796 ; SSE2-NEXT: pandn %xmm5, %xmm1
797 ; SSE2-NEXT: por %xmm11, %xmm1
798 ; SSE2-NEXT: movdqa %xmm2, %xmm5
799 ; SSE2-NEXT: psraw $15, %xmm5
800 ; SSE2-NEXT: pmulhuw %xmm9, %xmm5
801 ; SSE2-NEXT: paddw %xmm2, %xmm5
802 ; SSE2-NEXT: movdqa %xmm5, %xmm10
803 ; SSE2-NEXT: pand %xmm6, %xmm10
804 ; SSE2-NEXT: psraw $4, %xmm5
805 ; SSE2-NEXT: movdqa %xmm6, %xmm11
806 ; SSE2-NEXT: pandn %xmm5, %xmm11
807 ; SSE2-NEXT: por %xmm10, %xmm11
808 ; SSE2-NEXT: movdqa %xmm11, %xmm5
809 ; SSE2-NEXT: pand %xmm7, %xmm5
810 ; SSE2-NEXT: psraw $2, %xmm11
811 ; SSE2-NEXT: movdqa %xmm7, %xmm10
812 ; SSE2-NEXT: pandn %xmm11, %xmm10
813 ; SSE2-NEXT: por %xmm5, %xmm10
814 ; SSE2-NEXT: movdqa %xmm10, %xmm5
815 ; SSE2-NEXT: pand %xmm8, %xmm5
816 ; SSE2-NEXT: psraw $1, %xmm10
817 ; SSE2-NEXT: movdqa %xmm8, %xmm11
818 ; SSE2-NEXT: pandn %xmm10, %xmm11
819 ; SSE2-NEXT: por %xmm5, %xmm11
820 ; SSE2-NEXT: pand %xmm4, %xmm11
821 ; SSE2-NEXT: movdqa %xmm4, %xmm5
822 ; SSE2-NEXT: pandn %xmm2, %xmm5
823 ; SSE2-NEXT: por %xmm11, %xmm5
824 ; SSE2-NEXT: movdqa %xmm3, %xmm2
825 ; SSE2-NEXT: psraw $15, %xmm2
826 ; SSE2-NEXT: pmulhuw %xmm9, %xmm2
827 ; SSE2-NEXT: paddw %xmm3, %xmm2
828 ; SSE2-NEXT: movdqa %xmm2, %xmm9
829 ; SSE2-NEXT: pand %xmm6, %xmm9
830 ; SSE2-NEXT: psraw $4, %xmm2
831 ; SSE2-NEXT: pandn %xmm2, %xmm6
832 ; SSE2-NEXT: por %xmm9, %xmm6
833 ; SSE2-NEXT: movdqa %xmm6, %xmm2
834 ; SSE2-NEXT: pand %xmm7, %xmm2
835 ; SSE2-NEXT: psraw $2, %xmm6
836 ; SSE2-NEXT: pandn %xmm6, %xmm7
837 ; SSE2-NEXT: por %xmm2, %xmm7
838 ; SSE2-NEXT: movdqa %xmm7, %xmm2
839 ; SSE2-NEXT: pand %xmm8, %xmm2
840 ; SSE2-NEXT: psraw $1, %xmm7
841 ; SSE2-NEXT: pandn %xmm7, %xmm8
842 ; SSE2-NEXT: por %xmm2, %xmm8
843 ; SSE2-NEXT: pand %xmm4, %xmm8
844 ; SSE2-NEXT: pandn %xmm3, %xmm4
845 ; SSE2-NEXT: por %xmm8, %xmm4
846 ; SSE2-NEXT: movdqa %xmm5, %xmm2
847 ; SSE2-NEXT: movdqa %xmm4, %xmm3
850 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
852 ; SSE41-NEXT: movdqa %xmm0, %xmm6
853 ; SSE41-NEXT: psraw $15, %xmm6
854 ; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [0,4,2,16,8,32,64,2]
855 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
856 ; SSE41-NEXT: paddw %xmm0, %xmm6
857 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768]
858 ; SSE41-NEXT: movdqa %xmm6, %xmm7
859 ; SSE41-NEXT: pmulhw %xmm4, %xmm7
860 ; SSE41-NEXT: psraw $1, %xmm6
861 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
862 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7]
863 ; SSE41-NEXT: movdqa %xmm1, %xmm6
864 ; SSE41-NEXT: psraw $15, %xmm6
865 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
866 ; SSE41-NEXT: paddw %xmm1, %xmm6
867 ; SSE41-NEXT: movdqa %xmm6, %xmm7
868 ; SSE41-NEXT: pmulhw %xmm4, %xmm7
869 ; SSE41-NEXT: psraw $1, %xmm6
870 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
871 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7]
872 ; SSE41-NEXT: movdqa %xmm2, %xmm6
873 ; SSE41-NEXT: psraw $15, %xmm6
874 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
875 ; SSE41-NEXT: paddw %xmm2, %xmm6
876 ; SSE41-NEXT: movdqa %xmm6, %xmm7
877 ; SSE41-NEXT: pmulhw %xmm4, %xmm7
878 ; SSE41-NEXT: psraw $1, %xmm6
879 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
880 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7]
881 ; SSE41-NEXT: movdqa %xmm3, %xmm6
882 ; SSE41-NEXT: psraw $15, %xmm6
883 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
884 ; SSE41-NEXT: paddw %xmm3, %xmm6
885 ; SSE41-NEXT: pmulhw %xmm6, %xmm4
886 ; SSE41-NEXT: psraw $1, %xmm6
887 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7]
888 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3,4,5,6,7]
891 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
893 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
894 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
895 ; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,4,2,16,8,32,64,2]
896 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3
897 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
898 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,16384,32768,4096,8192,2048,1024,32768]
899 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5
900 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
901 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
902 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5
903 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5
904 ; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5
905 ; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6
906 ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5
907 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
908 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
909 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
910 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
911 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
912 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
913 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
914 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
915 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6
916 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6
917 ; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2
918 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm6
919 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
920 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7]
921 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6
922 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4
923 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4
924 ; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
925 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4
926 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
927 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
928 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
929 ; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1
930 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
933 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
935 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2
936 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
937 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
938 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
939 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2
940 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768]
941 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
942 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5
943 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
944 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15]
945 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
946 ; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2
947 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
948 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2
949 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3
950 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
951 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
952 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
955 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
957 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
958 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
959 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
960 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
961 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
962 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
963 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
964 ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
965 ; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1
966 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
967 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
968 ; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5
969 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm5, %ymm2
970 ; AVX512F-NEXT: vpaddw %ymm2, %ymm4, %ymm2
971 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
972 ; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2
973 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
974 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
975 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
976 ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
977 ; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
980 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
982 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1
983 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
984 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
985 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
986 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
987 ; AVX512BW-NEXT: kmovd %eax, %k1
988 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
989 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
990 ; AVX512BW-NEXT: retq
992 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
994 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
995 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3
996 ; XOP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,65522,65521,65524,65523,65525,65526,65521]
997 ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3
998 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2
999 ; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65534,65535,65532,65533,65531,65530,65535]
1000 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
1001 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5
1002 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5
1003 ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5
1004 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5
1005 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
1006 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1007 ; XOP-NEXT: # ymm5 = mem[0,1,0,1]
1008 ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0
1009 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1010 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6
1011 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6
1012 ; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2
1013 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
1014 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm6
1015 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4
1016 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4
1017 ; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3
1018 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1019 ; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1
1021 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
1025 define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
1026 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1028 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1029 ; SSE2-NEXT: psrad $31, %xmm1
1030 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1031 ; SSE2-NEXT: psrld $28, %xmm2
1032 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1033 ; SSE2-NEXT: psrld $29, %xmm3
1034 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1035 ; SSE2-NEXT: psrld $30, %xmm1
1036 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1037 ; SSE2-NEXT: paddd %xmm0, %xmm1
1038 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1039 ; SSE2-NEXT: psrad $4, %xmm2
1040 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1041 ; SSE2-NEXT: psrad $3, %xmm3
1042 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1043 ; SSE2-NEXT: psrad $2, %xmm1
1044 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1045 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1046 ; SSE2-NEXT: movaps %xmm1, %xmm0
1049 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1051 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1052 ; SSE41-NEXT: psrad $31, %xmm1
1053 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1054 ; SSE41-NEXT: psrld $28, %xmm2
1055 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1056 ; SSE41-NEXT: psrld $30, %xmm3
1057 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1058 ; SSE41-NEXT: psrld $29, %xmm1
1059 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1060 ; SSE41-NEXT: paddd %xmm0, %xmm1
1061 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1062 ; SSE41-NEXT: psrad $4, %xmm2
1063 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1064 ; SSE41-NEXT: psrad $2, %xmm3
1065 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1066 ; SSE41-NEXT: psrad $3, %xmm1
1067 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1068 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1071 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1073 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
1074 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1075 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
1076 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1077 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
1078 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1079 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1080 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1081 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1082 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1083 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1084 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1085 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1088 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1089 ; AVX2ORLATER: # %bb.0:
1090 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
1091 ; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1092 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1093 ; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1094 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1095 ; AVX2ORLATER-NEXT: retq
1097 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1099 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
1100 ; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1101 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1102 ; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1103 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1105 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
1109 define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
1110 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1112 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1113 ; SSE2-NEXT: psrad $31, %xmm0
1114 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1115 ; SSE2-NEXT: psrld $28, %xmm3
1116 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1117 ; SSE2-NEXT: psrld $29, %xmm4
1118 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1119 ; SSE2-NEXT: psrld $30, %xmm0
1120 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1121 ; SSE2-NEXT: paddd %xmm2, %xmm0
1122 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1123 ; SSE2-NEXT: psrad $4, %xmm3
1124 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1125 ; SSE2-NEXT: psrad $3, %xmm4
1126 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1127 ; SSE2-NEXT: psrad $2, %xmm0
1128 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1129 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1130 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1131 ; SSE2-NEXT: psrad $31, %xmm2
1132 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1133 ; SSE2-NEXT: psrld $28, %xmm3
1134 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1135 ; SSE2-NEXT: psrld $29, %xmm4
1136 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1137 ; SSE2-NEXT: psrld $30, %xmm2
1138 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1139 ; SSE2-NEXT: paddd %xmm1, %xmm2
1140 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1141 ; SSE2-NEXT: psrad $4, %xmm3
1142 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1143 ; SSE2-NEXT: psrad $3, %xmm4
1144 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1145 ; SSE2-NEXT: psrad $2, %xmm2
1146 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1147 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1148 ; SSE2-NEXT: movaps %xmm2, %xmm1
1151 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1153 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1154 ; SSE41-NEXT: psrad $31, %xmm2
1155 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1156 ; SSE41-NEXT: psrld $28, %xmm3
1157 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1158 ; SSE41-NEXT: psrld $30, %xmm4
1159 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1160 ; SSE41-NEXT: psrld $29, %xmm2
1161 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1162 ; SSE41-NEXT: paddd %xmm0, %xmm2
1163 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1164 ; SSE41-NEXT: psrad $4, %xmm3
1165 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1166 ; SSE41-NEXT: psrad $2, %xmm4
1167 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1168 ; SSE41-NEXT: psrad $3, %xmm2
1169 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1170 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
1171 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1172 ; SSE41-NEXT: psrad $31, %xmm2
1173 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1174 ; SSE41-NEXT: psrld $28, %xmm3
1175 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1176 ; SSE41-NEXT: psrld $30, %xmm4
1177 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1178 ; SSE41-NEXT: psrld $29, %xmm2
1179 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1180 ; SSE41-NEXT: paddd %xmm1, %xmm2
1181 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1182 ; SSE41-NEXT: psrad $4, %xmm3
1183 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1184 ; SSE41-NEXT: psrad $2, %xmm4
1185 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1186 ; SSE41-NEXT: psrad $3, %xmm2
1187 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1188 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
1191 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1193 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1194 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
1195 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1196 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1197 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1198 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1199 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1200 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1201 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1202 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1203 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1204 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1205 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1206 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
1207 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1208 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1209 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1210 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1211 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1212 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
1213 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1214 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1215 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1216 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1217 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1218 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1219 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1222 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1223 ; AVX2ORLATER: # %bb.0:
1224 ; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1
1225 ; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1226 ; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1
1227 ; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1228 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1229 ; AVX2ORLATER-NEXT: retq
1231 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1233 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
1234 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2
1235 ; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967266,4294967267,4294967268]
1236 ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2
1237 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1238 ; XOP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292]
1239 ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1
1240 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4
1241 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3
1242 ; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1243 ; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2
1244 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1245 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1247 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1251 define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
1252 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1254 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1255 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1256 ; SSE2-NEXT: psrad $31, %xmm0
1257 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1258 ; SSE2-NEXT: psrld $28, %xmm5
1259 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1260 ; SSE2-NEXT: psrld $29, %xmm6
1261 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1262 ; SSE2-NEXT: psrld $30, %xmm0
1263 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1264 ; SSE2-NEXT: paddd %xmm1, %xmm0
1265 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1266 ; SSE2-NEXT: psrad $4, %xmm5
1267 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1268 ; SSE2-NEXT: psrad $3, %xmm6
1269 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1270 ; SSE2-NEXT: psrad $2, %xmm0
1271 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1272 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1273 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1274 ; SSE2-NEXT: psrad $31, %xmm1
1275 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1276 ; SSE2-NEXT: psrld $28, %xmm5
1277 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1278 ; SSE2-NEXT: psrld $29, %xmm6
1279 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1280 ; SSE2-NEXT: psrld $30, %xmm1
1281 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1282 ; SSE2-NEXT: paddd %xmm4, %xmm1
1283 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1284 ; SSE2-NEXT: psrad $4, %xmm5
1285 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1286 ; SSE2-NEXT: psrad $3, %xmm6
1287 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1288 ; SSE2-NEXT: psrad $2, %xmm1
1289 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1290 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
1291 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1292 ; SSE2-NEXT: psrad $31, %xmm4
1293 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1294 ; SSE2-NEXT: psrld $28, %xmm5
1295 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1296 ; SSE2-NEXT: psrld $29, %xmm6
1297 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1298 ; SSE2-NEXT: psrld $30, %xmm4
1299 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1300 ; SSE2-NEXT: paddd %xmm2, %xmm4
1301 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1302 ; SSE2-NEXT: psrad $4, %xmm5
1303 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1304 ; SSE2-NEXT: psrad $3, %xmm6
1305 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1306 ; SSE2-NEXT: psrad $2, %xmm4
1307 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1308 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1309 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1310 ; SSE2-NEXT: psrad $31, %xmm5
1311 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1312 ; SSE2-NEXT: psrld $28, %xmm2
1313 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1314 ; SSE2-NEXT: psrld $29, %xmm6
1315 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1316 ; SSE2-NEXT: psrld $30, %xmm5
1317 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1318 ; SSE2-NEXT: paddd %xmm3, %xmm5
1319 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1320 ; SSE2-NEXT: psrad $4, %xmm2
1321 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1322 ; SSE2-NEXT: psrad $3, %xmm6
1323 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1324 ; SSE2-NEXT: psrad $2, %xmm5
1325 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1326 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
1327 ; SSE2-NEXT: movaps %xmm4, %xmm2
1328 ; SSE2-NEXT: movaps %xmm5, %xmm3
1331 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1333 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1334 ; SSE41-NEXT: psrad $31, %xmm4
1335 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1336 ; SSE41-NEXT: psrld $28, %xmm5
1337 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1338 ; SSE41-NEXT: psrld $30, %xmm6
1339 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1340 ; SSE41-NEXT: psrld $29, %xmm4
1341 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1342 ; SSE41-NEXT: paddd %xmm0, %xmm4
1343 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1344 ; SSE41-NEXT: psrad $4, %xmm5
1345 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1346 ; SSE41-NEXT: psrad $2, %xmm6
1347 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1348 ; SSE41-NEXT: psrad $3, %xmm4
1349 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1350 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5,6,7]
1351 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1352 ; SSE41-NEXT: psrad $31, %xmm4
1353 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1354 ; SSE41-NEXT: psrld $28, %xmm5
1355 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1356 ; SSE41-NEXT: psrld $30, %xmm6
1357 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1358 ; SSE41-NEXT: psrld $29, %xmm4
1359 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1360 ; SSE41-NEXT: paddd %xmm1, %xmm4
1361 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1362 ; SSE41-NEXT: psrad $4, %xmm5
1363 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1364 ; SSE41-NEXT: psrad $2, %xmm6
1365 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1366 ; SSE41-NEXT: psrad $3, %xmm4
1367 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1368 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5,6,7]
1369 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1370 ; SSE41-NEXT: psrad $31, %xmm4
1371 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1372 ; SSE41-NEXT: psrld $28, %xmm5
1373 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1374 ; SSE41-NEXT: psrld $30, %xmm6
1375 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1376 ; SSE41-NEXT: psrld $29, %xmm4
1377 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1378 ; SSE41-NEXT: paddd %xmm2, %xmm4
1379 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1380 ; SSE41-NEXT: psrad $4, %xmm5
1381 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1382 ; SSE41-NEXT: psrad $2, %xmm6
1383 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1384 ; SSE41-NEXT: psrad $3, %xmm4
1385 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1386 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1387 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1388 ; SSE41-NEXT: psrad $31, %xmm4
1389 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1390 ; SSE41-NEXT: psrld $28, %xmm5
1391 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1392 ; SSE41-NEXT: psrld $30, %xmm6
1393 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1394 ; SSE41-NEXT: psrld $29, %xmm4
1395 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1396 ; SSE41-NEXT: paddd %xmm3, %xmm4
1397 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1398 ; SSE41-NEXT: psrad $4, %xmm5
1399 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1400 ; SSE41-NEXT: psrad $2, %xmm6
1401 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1402 ; SSE41-NEXT: psrad $3, %xmm4
1403 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1404 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
1407 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1409 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1410 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1411 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1412 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1413 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1414 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1415 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1416 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1417 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1418 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1419 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1420 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1421 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1422 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
1423 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1424 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1425 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1426 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1427 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1428 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1429 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1430 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1431 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1432 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1433 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1434 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1435 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1436 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1437 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1438 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1439 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1440 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1441 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1442 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1443 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1444 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1445 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1446 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1447 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1448 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1449 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
1450 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1451 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1452 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1453 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1454 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1455 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
1456 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1457 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1458 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1459 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1460 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1461 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1462 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1465 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1467 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
1468 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28]
1469 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1470 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1471 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
1472 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
1473 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1474 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1475 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1476 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
1477 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1478 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2
1479 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1480 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1483 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1485 ; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1
1486 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1487 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1488 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1489 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
1490 ; AVX512F-NEXT: kmovw %eax, %k1
1491 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1492 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1493 ; AVX512F-NEXT: retq
1495 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1496 ; AVX512BW: # %bb.0:
1497 ; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1
1498 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1499 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1500 ; AVX512BW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1501 ; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111
1502 ; AVX512BW-NEXT: kmovd %eax, %k1
1503 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1504 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1505 ; AVX512BW-NEXT: retq
1507 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1509 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1510 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3
1511 ; XOP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,4294967266,4294967267,4294967268]
1512 ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3
1513 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1514 ; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967294,4294967293,4294967292]
1515 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1516 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5
1517 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1518 ; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5
1519 ; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5
1520 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
1521 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1522 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1523 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm5
1524 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1525 ; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1526 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1527 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm5
1528 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4
1529 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4
1530 ; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3
1531 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1532 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1534 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
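
; fold (sdiv x, (1 << c)) -> (sra (add x, (srl (sra x, 63), 64-c)), c)
; Same non-uniform power-of-2 lowering, now on 64-bit lanes: for the divide-by-4 lane,
; add the bias ((x >> 63) >>u 62) and arithmetic-shift right by 2; the divide-by-1 lane
; is blended back in unchanged.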
1538 define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
1539 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1541 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1542 ; SSE2-NEXT: psrad $31, %xmm1
1543 ; SSE2-NEXT: psrlq $62, %xmm1
1544 ; SSE2-NEXT: paddq %xmm0, %xmm1
1545 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1546 ; SSE2-NEXT: psrad $2, %xmm2
1547 ; SSE2-NEXT: psrlq $2, %xmm1
1548 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1549 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1550 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1553 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1555 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1556 ; SSE41-NEXT: psrad $31, %xmm1
1557 ; SSE41-NEXT: psrlq $62, %xmm1
1558 ; SSE41-NEXT: paddq %xmm0, %xmm1
1559 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1560 ; SSE41-NEXT: psrad $2, %xmm2
1561 ; SSE41-NEXT: psrlq $2, %xmm1
1562 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1563 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1566 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1568 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1569 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1570 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1
1571 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1572 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm2
1573 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
1574 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1575 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1578 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1580 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1581 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1582 ; AVX2-NEXT: vpsrlq $62, %xmm1, %xmm1
1583 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1584 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm2
1585 ; AVX2-NEXT: vpsrlq $2, %xmm1, %xmm1
1586 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
1587 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1590 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1592 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1593 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1594 ; AVX512F-NEXT: vpsrlq $62, %xmm1, %xmm1
1595 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1596 ; AVX512F-NEXT: vpsraq $2, %zmm1, %zmm1
1597 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1598 ; AVX512F-NEXT: vzeroupper
1599 ; AVX512F-NEXT: retq
1601 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1602 ; AVX512BW: # %bb.0:
1603 ; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1
1604 ; AVX512BW-NEXT: vpsrlq $62, %xmm1, %xmm1
1605 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1606 ; AVX512BW-NEXT: vpsraq $2, %xmm1, %xmm1
1607 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1608 ; AVX512BW-NEXT: retq
1610 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1612 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1613 ; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1
1614 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1615 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1616 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1618 %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
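
; As above with <4 x i64> divisors <1, 4, 8, 16>. Targets without a 64-bit arithmetic
; shift emulate the final sra with a logical shift followed by an xor/sub against
; 1 << (63 - c) to restore the sign bits.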
1622 define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
1623 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1625 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1626 ; SSE2-NEXT: psrad $31, %xmm2
1627 ; SSE2-NEXT: psrlq $62, %xmm2
1628 ; SSE2-NEXT: paddq %xmm0, %xmm2
1629 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
1630 ; SSE2-NEXT: psrad $2, %xmm3
1631 ; SSE2-NEXT: psrlq $2, %xmm2
1632 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1633 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1634 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1635 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1636 ; SSE2-NEXT: psrad $31, %xmm2
1637 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1638 ; SSE2-NEXT: psrlq $61, %xmm3
1639 ; SSE2-NEXT: psrlq $60, %xmm2
1640 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
1641 ; SSE2-NEXT: paddq %xmm2, %xmm1
1642 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1643 ; SSE2-NEXT: psrlq $3, %xmm2
1644 ; SSE2-NEXT: psrlq $4, %xmm1
1645 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1646 ; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
1647 ; SSE2-NEXT: xorpd %xmm2, %xmm1
1648 ; SSE2-NEXT: psubq %xmm2, %xmm1
1651 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1653 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1654 ; SSE41-NEXT: psrad $31, %xmm2
1655 ; SSE41-NEXT: psrlq $62, %xmm2
1656 ; SSE41-NEXT: paddq %xmm0, %xmm2
1657 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1658 ; SSE41-NEXT: psrad $2, %xmm3
1659 ; SSE41-NEXT: psrlq $2, %xmm2
1660 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1661 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1662 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1663 ; SSE41-NEXT: psrad $31, %xmm2
1664 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1665 ; SSE41-NEXT: psrlq $60, %xmm3
1666 ; SSE41-NEXT: psrlq $61, %xmm2
1667 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1668 ; SSE41-NEXT: paddq %xmm2, %xmm1
1669 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1670 ; SSE41-NEXT: psrlq $4, %xmm2
1671 ; SSE41-NEXT: psrlq $3, %xmm1
1672 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1673 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
1674 ; SSE41-NEXT: pxor %xmm2, %xmm1
1675 ; SSE41-NEXT: psubq %xmm2, %xmm1
1678 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1680 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1681 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1682 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
1683 ; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4
1684 ; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3
1685 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1686 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
1687 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
1688 ; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1
1689 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1690 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
1691 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
1692 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
1693 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
1694 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1695 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1696 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm3
1697 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1698 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1699 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1700 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1703 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1705 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1706 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
1707 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1708 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1709 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1710 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,2305843009213693952,1152921504606846976,576460752303423488]
1711 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
1712 ; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1
1713 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1716 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1718 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1719 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,3,4]
1720 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2
1721 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
1722 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
1723 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
1724 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1725 ; AVX512F-NEXT: retq
1727 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1728 ; AVX512BW: # %bb.0:
1729 ; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1
1730 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1731 ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1732 ; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1733 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1734 ; AVX512BW-NEXT: retq
1736 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1738 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
1739 ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2
1740 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2
1741 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1742 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1743 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
1744 ; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1
1745 ; XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1746 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
1747 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1748 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1749 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1751 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
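
; Same lowering widened to <8 x i64>. AVX-512 keeps the whole computation in zmm
; registers and merges the divide-by-1 lanes (0 and 4, mask 0x11) back with a masked move.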
1755 define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
1756 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1758 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1759 ; SSE2-NEXT: psrad $31, %xmm4
1760 ; SSE2-NEXT: psrlq $62, %xmm4
1761 ; SSE2-NEXT: paddq %xmm0, %xmm4
1762 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
1763 ; SSE2-NEXT: psrad $2, %xmm5
1764 ; SSE2-NEXT: psrlq $2, %xmm4
1765 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1766 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1767 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
1768 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1769 ; SSE2-NEXT: psrad $31, %xmm4
1770 ; SSE2-NEXT: psrlq $62, %xmm4
1771 ; SSE2-NEXT: paddq %xmm2, %xmm4
1772 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
1773 ; SSE2-NEXT: psrad $2, %xmm5
1774 ; SSE2-NEXT: psrlq $2, %xmm4
1775 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1776 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1777 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
1778 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1779 ; SSE2-NEXT: psrad $31, %xmm4
1780 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1781 ; SSE2-NEXT: psrlq $61, %xmm5
1782 ; SSE2-NEXT: psrlq $60, %xmm4
1783 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
1784 ; SSE2-NEXT: paddq %xmm4, %xmm1
1785 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1786 ; SSE2-NEXT: psrlq $3, %xmm4
1787 ; SSE2-NEXT: psrlq $4, %xmm1
1788 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
1789 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1790 ; SSE2-NEXT: xorpd %xmm4, %xmm1
1791 ; SSE2-NEXT: psubq %xmm4, %xmm1
1792 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1793 ; SSE2-NEXT: psrad $31, %xmm5
1794 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1795 ; SSE2-NEXT: psrlq $61, %xmm6
1796 ; SSE2-NEXT: psrlq $60, %xmm5
1797 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
1798 ; SSE2-NEXT: paddq %xmm5, %xmm3
1799 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1800 ; SSE2-NEXT: psrlq $3, %xmm5
1801 ; SSE2-NEXT: psrlq $4, %xmm3
1802 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1]
1803 ; SSE2-NEXT: xorpd %xmm4, %xmm3
1804 ; SSE2-NEXT: psubq %xmm4, %xmm3
1807 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1809 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1810 ; SSE41-NEXT: psrad $31, %xmm4
1811 ; SSE41-NEXT: psrlq $62, %xmm4
1812 ; SSE41-NEXT: paddq %xmm0, %xmm4
1813 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1814 ; SSE41-NEXT: psrad $2, %xmm5
1815 ; SSE41-NEXT: psrlq $2, %xmm4
1816 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
1817 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
1818 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1819 ; SSE41-NEXT: psrad $31, %xmm4
1820 ; SSE41-NEXT: psrlq $62, %xmm4
1821 ; SSE41-NEXT: paddq %xmm2, %xmm4
1822 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1823 ; SSE41-NEXT: psrad $2, %xmm5
1824 ; SSE41-NEXT: psrlq $2, %xmm4
1825 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
1826 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1827 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1828 ; SSE41-NEXT: psrad $31, %xmm4
1829 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1830 ; SSE41-NEXT: psrlq $60, %xmm5
1831 ; SSE41-NEXT: psrlq $61, %xmm4
1832 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1833 ; SSE41-NEXT: paddq %xmm4, %xmm1
1834 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1835 ; SSE41-NEXT: psrlq $4, %xmm4
1836 ; SSE41-NEXT: psrlq $3, %xmm1
1837 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1838 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1839 ; SSE41-NEXT: pxor %xmm4, %xmm1
1840 ; SSE41-NEXT: psubq %xmm4, %xmm1
1841 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1842 ; SSE41-NEXT: psrad $31, %xmm5
1843 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1844 ; SSE41-NEXT: psrlq $60, %xmm6
1845 ; SSE41-NEXT: psrlq $61, %xmm5
1846 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1847 ; SSE41-NEXT: paddq %xmm5, %xmm3
1848 ; SSE41-NEXT: movdqa %xmm3, %xmm5
1849 ; SSE41-NEXT: psrlq $4, %xmm5
1850 ; SSE41-NEXT: psrlq $3, %xmm3
1851 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1852 ; SSE41-NEXT: pxor %xmm4, %xmm3
1853 ; SSE41-NEXT: psubq %xmm4, %xmm3
1856 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1858 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1859 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1860 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
1861 ; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5
1862 ; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4
1863 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1864 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
1865 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4
1866 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1867 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1868 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1869 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1870 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1871 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
1872 ; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5
1873 ; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5
1874 ; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6
1875 ; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5
1876 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1877 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1878 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1879 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1880 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5
1881 ; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6
1882 ; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5
1883 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1884 ; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
1885 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5
1886 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1887 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1888 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1889 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1890 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
1891 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1892 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2
1893 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1894 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1895 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1896 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1897 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1900 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1902 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1903 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1904 ; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,62,61,60]
1905 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3
1906 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3
1907 ; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,2,3,4]
1908 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3
1909 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,2305843009213693952,1152921504606846976,576460752303423488]
1910 ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3
1911 ; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3
1912 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1913 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
1914 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2
1915 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2
1916 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2
1917 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
1918 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1919 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1922 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1924 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1925 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1926 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1927 ; AVX512F-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1928 ; AVX512F-NEXT: movb $17, %al
1929 ; AVX512F-NEXT: kmovw %eax, %k1
1930 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1931 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1932 ; AVX512F-NEXT: retq
1934 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1935 ; AVX512BW: # %bb.0:
1936 ; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1
1937 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1938 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1939 ; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1940 ; AVX512BW-NEXT: movb $17, %al
1941 ; AVX512BW-NEXT: kmovd %eax, %k1
1942 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1943 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1944 ; AVX512BW-NEXT: retq
1946 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1948 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1949 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
1950 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4
1951 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556]
1952 ; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4
1953 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1954 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
1955 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1956 ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6
1957 ; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6
1958 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6
1959 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614]
1960 ; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6
1961 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
1962 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
1963 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1964 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6
1965 ; XOP-NEXT: vpshlq %xmm5, %xmm6, %xmm5
1966 ; XOP-NEXT: vpaddq %xmm5, %xmm2, %xmm2
1967 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1968 ; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3
1969 ; XOP-NEXT: vpsrlq $62, %xmm3, %xmm3
1970 ; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3
1971 ; XOP-NEXT: vpshaq %xmm7, %xmm3, %xmm3
1972 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1973 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1975 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
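
; Power-of-2 divisors of mixed sign <1, -4, 8, -16>: compute the positive power-of-2
; quotient as above, then negate (0 - q) the lanes with negative divisors before the
; final blend.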
1979 define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
1980 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
1982 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1983 ; SSE2-NEXT: psrad $31, %xmm1
1984 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1985 ; SSE2-NEXT: psrld $28, %xmm2
1986 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1987 ; SSE2-NEXT: psrld $29, %xmm3
1988 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1989 ; SSE2-NEXT: psrld $30, %xmm1
1990 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1991 ; SSE2-NEXT: paddd %xmm0, %xmm1
1992 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1993 ; SSE2-NEXT: psrad $4, %xmm2
1994 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1995 ; SSE2-NEXT: psrad $3, %xmm3
1996 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1997 ; SSE2-NEXT: psrad $2, %xmm1
1998 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1999 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2000 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
2001 ; SSE2-NEXT: pxor %xmm2, %xmm2
2002 ; SSE2-NEXT: psubd %xmm1, %xmm2
2003 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
2004 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2007 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2009 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2010 ; SSE41-NEXT: psrad $31, %xmm1
2011 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2012 ; SSE41-NEXT: psrld $28, %xmm2
2013 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2014 ; SSE41-NEXT: psrld $30, %xmm3
2015 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2016 ; SSE41-NEXT: psrld $29, %xmm1
2017 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2018 ; SSE41-NEXT: paddd %xmm0, %xmm1
2019 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2020 ; SSE41-NEXT: psrad $4, %xmm2
2021 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2022 ; SSE41-NEXT: psrad $2, %xmm3
2023 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2024 ; SSE41-NEXT: pxor %xmm2, %xmm2
2025 ; SSE41-NEXT: psubd %xmm3, %xmm2
2026 ; SSE41-NEXT: psrad $3, %xmm1
2027 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2028 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2031 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2033 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
2034 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2035 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
2036 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2037 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
2038 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2039 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2040 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
2041 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
2042 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2043 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2044 ; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
2045 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
2046 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2047 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2050 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2051 ; AVX2ORLATER: # %bb.0:
2052 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
2053 ; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2054 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2055 ; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2056 ; AVX2ORLATER-NEXT: vpxor %xmm2, %xmm2, %xmm2
2057 ; AVX2ORLATER-NEXT: vpsubd %xmm1, %xmm2, %xmm2
2058 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2059 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
2060 ; AVX2ORLATER-NEXT: retq
2062 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2064 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
2065 ; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2066 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2067 ; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2068 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
2069 ; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm2
2070 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2071 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2073 %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
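
; With undef elements in the divisor the whole sdiv should fold away, leaving only a
; return on every target.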
2077 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
2078 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
2081 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
2085 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
2086 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
2089 %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
2093 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
2094 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
2097 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
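
; An all +/-1 divisor needs no division: the divide-by-negone lanes are negated via an
; xor/sub with a byte mask (or a masked psubb on AVX512BW) and the rest pass through.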
2102 define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
2103 ; SSE-LABEL: non_splat_minus_one_divisor_0:
2105 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2106 ; SSE-NEXT: pxor %xmm1, %xmm0
2107 ; SSE-NEXT: psubb %xmm1, %xmm0
2110 ; AVX1-LABEL: non_splat_minus_one_divisor_0:
2112 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2113 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2114 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2117 ; AVX2-LABEL: non_splat_minus_one_divisor_0:
2119 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2120 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
2121 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2124 ; AVX512F-LABEL: non_splat_minus_one_divisor_0:
2126 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2127 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
2128 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2129 ; AVX512F-NEXT: retq
2131 ; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
2132 ; AVX512BW: # %bb.0:
2133 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2134 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2135 ; AVX512BW-NEXT: kmovd %eax, %k1
2136 ; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1}
2137 ; AVX512BW-NEXT: retq
2139 ; XOP-LABEL: non_splat_minus_one_divisor_0:
2141 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2142 ; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
2143 ; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2145 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
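
; Divisors mixing -1, 2 and -128: the power-of-2 lanes use the add-bias-and-shift
; sequence, widened to 16- or 32-bit lanes on targets without per-byte vector shifts,
; and the negative-divisor lanes are negated before the final blend.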
2149 define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
2150 ; SSE2-LABEL: non_splat_minus_one_divisor_1:
2152 ; SSE2-NEXT: pxor %xmm1, %xmm1
2153 ; SSE2-NEXT: pxor %xmm2, %xmm2
2154 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
2155 ; SSE2-NEXT: movdqa %xmm2, %xmm3
2156 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2157 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
2158 ; SSE2-NEXT: psrlw $8, %xmm3
2159 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2160 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,256,2,256,256,256,2,256]
2161 ; SSE2-NEXT: psrlw $8, %xmm2
2162 ; SSE2-NEXT: packuswb %xmm3, %xmm2
2163 ; SSE2-NEXT: paddb %xmm0, %xmm2
2164 ; SSE2-NEXT: movdqa %xmm2, %xmm1
2165 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2166 ; SSE2-NEXT: psraw $8, %xmm1
2167 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,128,128,128,128,2,128,2]
2168 ; SSE2-NEXT: psrlw $8, %xmm1
2169 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2170 ; SSE2-NEXT: psraw $8, %xmm2
2171 ; SSE2-NEXT: psllw $7, %xmm2
2172 ; SSE2-NEXT: psrlw $8, %xmm2
2173 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2174 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2175 ; SSE2-NEXT: pand %xmm1, %xmm2
2176 ; SSE2-NEXT: pandn %xmm0, %xmm1
2177 ; SSE2-NEXT: por %xmm2, %xmm1
2178 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2179 ; SSE2-NEXT: pxor %xmm0, %xmm1
2180 ; SSE2-NEXT: psubb %xmm0, %xmm1
2181 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2184 ; SSE41-LABEL: non_splat_minus_one_divisor_1:
2186 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2187 ; SSE41-NEXT: pxor %xmm0, %xmm0
2188 ; SSE41-NEXT: pxor %xmm3, %xmm3
2189 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
2190 ; SSE41-NEXT: pxor %xmm4, %xmm4
2191 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2192 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2193 ; SSE41-NEXT: psllw $1, %xmm2
2194 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
2195 ; SSE41-NEXT: psrlw $8, %xmm2
2196 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2197 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
2198 ; SSE41-NEXT: psrlw $8, %xmm3
2199 ; SSE41-NEXT: packuswb %xmm3, %xmm2
2200 ; SSE41-NEXT: paddb %xmm1, %xmm2
2201 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2202 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2203 ; SSE41-NEXT: psraw $8, %xmm0
2204 ; SSE41-NEXT: movdqa %xmm0, %xmm3
2205 ; SSE41-NEXT: psllw $1, %xmm3
2206 ; SSE41-NEXT: psllw $7, %xmm0
2207 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
2208 ; SSE41-NEXT: psrlw $8, %xmm0
2209 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2210 ; SSE41-NEXT: psraw $8, %xmm2
2211 ; SSE41-NEXT: psllw $7, %xmm2
2212 ; SSE41-NEXT: psrlw $8, %xmm2
2213 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2214 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2215 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
2216 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2217 ; SSE41-NEXT: pxor %xmm0, %xmm1
2218 ; SSE41-NEXT: psubb %xmm0, %xmm1
2219 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2222 ; AVX1-LABEL: non_splat_minus_one_divisor_1:
2224 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2225 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2226 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2227 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2228 ; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
2229 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2230 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2231 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2232 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128]
2233 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2234 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
2235 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2236 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2237 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
2238 ; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3
2239 ; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
2240 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
2241 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2242 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2243 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
2244 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
2245 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2246 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2247 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2248 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2249 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2250 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2251 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2254 ; AVX2-LABEL: non_splat_minus_one_divisor_1:
2256 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2257 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
2258 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2259 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,2,256,256,256,2,256,256,2,2,2,2,128,2,128]
2260 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2261 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2262 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2263 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2264 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
2265 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,128,256,256,256,128,256,256,128,128,128,128,2,128,2]
2266 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2267 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2268 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2269 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2270 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2271 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2272 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
2273 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2274 ; AVX2-NEXT: vzeroupper
2277 ; AVX512F-LABEL: non_splat_minus_one_divisor_1:
2279 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2280 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
2281 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2282 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2283 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
2284 ; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2285 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
2286 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2287 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
2288 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2289 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2290 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2291 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
2292 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2293 ; AVX512F-NEXT: vzeroupper
2294 ; AVX512F-NEXT: retq
2296 ; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
2297 ; AVX512BW: # %bb.0:
2298 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2299 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2300 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2301 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2302 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2303 ; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2304 ; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2
2305 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2306 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2307 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2308 ; AVX512BW-NEXT: kmovd %eax, %k1
2309 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
2310 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0
2311 ; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44
2312 ; AVX512BW-NEXT: kmovd %eax, %k1
2313 ; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1}
2314 ; AVX512BW-NEXT: vzeroupper
2315 ; AVX512BW-NEXT: retq
2317 ; XOP-LABEL: non_splat_minus_one_divisor_1:
2319 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2320 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
2321 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2322 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2323 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2324 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2325 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2326 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2327 ; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
2328 ; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2330 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
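
; Divisors <-1, 1, 2, -2>: only the +/-2 lanes need the (x + (x >>u 31)) >>s 1
; sequence; the +/-1 lanes reuse x, and the negative-divisor lanes (0 and 3) are
; negated before the blend.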
2334 define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
2335 ; SSE2-LABEL: non_splat_minus_one_divisor_2:
2337 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2338 ; SSE2-NEXT: psrld $31, %xmm1
2339 ; SSE2-NEXT: paddd %xmm0, %xmm1
2340 ; SSE2-NEXT: psrad $1, %xmm1
2341 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2342 ; SSE2-NEXT: pxor %xmm0, %xmm0
2343 ; SSE2-NEXT: psubd %xmm1, %xmm0
2344 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2345 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2348 ; SSE41-LABEL: non_splat_minus_one_divisor_2:
2350 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2351 ; SSE41-NEXT: psrld $31, %xmm1
2352 ; SSE41-NEXT: paddd %xmm0, %xmm1
2353 ; SSE41-NEXT: psrad $1, %xmm1
2354 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2355 ; SSE41-NEXT: pxor %xmm1, %xmm1
2356 ; SSE41-NEXT: psubd %xmm0, %xmm1
2357 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2360 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
2362 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
2363 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2364 ; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
2365 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2366 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2367 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2368 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2371 ; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
2372 ; AVX2ORLATER: # %bb.0:
2373 ; AVX2ORLATER-NEXT: vpsrld $31, %xmm0, %xmm1
2374 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2375 ; AVX2ORLATER-NEXT: vpsrad $1, %xmm1, %xmm1
2376 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2377 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2378 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2379 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
2380 ; AVX2ORLATER-NEXT: retq
2382 ; XOP-LABEL: non_splat_minus_one_divisor_2:
2384 ; XOP-NEXT: vpsrld $31, %xmm0, %xmm1
2385 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2386 ; XOP-NEXT: vpsrad $1, %xmm1, %xmm1
2387 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2388 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2389 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2390 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2392 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
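
; Non-power-of-2 constant divisors use the fixed-point multiply lowering: take the high
; half of a multiply by a per-lane magic constant (pmulhw), then add back the shifted-out
; sign bit (psrlw $15 + paddw) to round the quotient towards zero.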
2396 define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) {
2397 ; SSE-LABEL: combine_vec_sdiv_nonuniform:
2399 ; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [21846,21846,21846,21846,2979,2979,2979,2979]
2400 ; SSE-NEXT: movdqa %xmm0, %xmm1
2401 ; SSE-NEXT: psrlw $15, %xmm1
2402 ; SSE-NEXT: paddw %xmm1, %xmm0
2405 ; AVX-LABEL: combine_vec_sdiv_nonuniform:
2407 ; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [21846,21846,21846,21846,2979,2979,2979,2979]
2408 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
2409 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2411 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22>
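
; For divisors 24 and 25 the magic multiply also needs a per-lane arithmetic post-shift
; (by 2 and by 1 respectively) before the sign-bit correction; AVX512BW/XOP fold the two
; shift amounts into a single variable shift.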
2415 define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
2416 ; SSE2-LABEL: combine_vec_sdiv_nonuniform2:
2418 ; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243]
2419 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2420 ; SSE2-NEXT: psraw $2, %xmm1
2421 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2422 ; SSE2-NEXT: psraw $1, %xmm2
2423 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
2424 ; SSE2-NEXT: psrlw $15, %xmm0
2425 ; SSE2-NEXT: paddw %xmm2, %xmm0
2428 ; SSE41-LABEL: combine_vec_sdiv_nonuniform2:
2430 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243]
2431 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2432 ; SSE41-NEXT: psraw $1, %xmm1
2433 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2434 ; SSE41-NEXT: psraw $2, %xmm2
2435 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2436 ; SSE41-NEXT: psrlw $15, %xmm0
2437 ; SSE41-NEXT: paddw %xmm2, %xmm0
2440 ; AVX1-LABEL: combine_vec_sdiv_nonuniform2:
2442 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243]
2443 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
2444 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
2445 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2446 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2447 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2450 ; AVX2-LABEL: combine_vec_sdiv_nonuniform2:
2452 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243]
2453 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1
2454 ; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2
2455 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2456 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2457 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2460 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform2:
2462 ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243]
2463 ; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1
2464 ; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2
2465 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2466 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2467 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2468 ; AVX512F-NEXT: retq
2470 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2:
2471 ; AVX512BW: # %bb.0:
2472 ; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243]
2473 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2474 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2475 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2476 ; AVX512BW-NEXT: retq
2478 ; XOP-LABEL: combine_vec_sdiv_nonuniform2:
2480 ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10923,10923,10923,10923,5243,5243,5243,5243]
2481 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2482 ; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2483 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2485 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25>
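
; For divisors 23 and 511 the magic constant does not fit in a signed i16, so x is added
; back after the high multiply and then shifted arithmetically by 4 and 8 respectively
; before the sign-bit add.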
2489 define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
2490 ; SSE2-LABEL: combine_vec_sdiv_nonuniform3:
2492 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2493 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2494 ; SSE2-NEXT: paddw %xmm1, %xmm0
2495 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2496 ; SSE2-NEXT: psraw $4, %xmm1
2497 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2498 ; SSE2-NEXT: psraw $8, %xmm2
2499 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
2500 ; SSE2-NEXT: psrlw $15, %xmm0
2501 ; SSE2-NEXT: paddw %xmm2, %xmm0
2504 ; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
2506 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2507 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2508 ; SSE41-NEXT: paddw %xmm1, %xmm0
2509 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2510 ; SSE41-NEXT: psraw $8, %xmm1
2511 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2512 ; SSE41-NEXT: psraw $4, %xmm2
2513 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2514 ; SSE41-NEXT: psrlw $15, %xmm0
2515 ; SSE41-NEXT: paddw %xmm2, %xmm0
2518 ; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
2520 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833]
2521 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2522 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2523 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2524 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2525 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2526 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2529 ; AVX2-LABEL: combine_vec_sdiv_nonuniform3:
2531 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833]
2532 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2533 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2534 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2535 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2536 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2537 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2540 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform3:
2542 ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833]
2543 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2544 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2545 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2546 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2547 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2548 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2549 ; AVX512F-NEXT: retq
2551 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3:
2552 ; AVX512BW: # %bb.0:
2553 ; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833]
2554 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2555 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2556 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2557 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2558 ; AVX512BW-NEXT: retq
2560 ; XOP-LABEL: combine_vec_sdiv_nonuniform3:
2562 ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45591,45591,45591,45591,32833,32833,32833,32833]
2563 ; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2564 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2565 ; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2566 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2568 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511>
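
; Negative divisors -23 and -510 use the same scheme but subtract x from the high
; multiply instead of adding it, producing a negative quotient directly.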
define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
; SSE2-NEXT: pmulhw %xmm0, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psraw $4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: psrlw $15, %xmm1
; SSE2-NEXT: paddw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform4:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $4, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: psrlw $15, %xmm1
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639]
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639]
; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639]
; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639]
; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform4:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [19945,19945,19945,19945,32639,32639,32639,32639]
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510>
  ret <8 x i16> %1
}
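; Mixed positive and negative divisors: a per-element add/subtract-of-x fixup (pmullw by -1/0/+1) follows the pmulhw by the magic constants, then per-element shifts.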
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $4, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [256,16384,4096,u,u,u,512,256]
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: paddw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [256,16384,4096,u,u,u,512,256]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [256,16384,4096,u,u,u,512,256]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32639,54613,19945,21846,2979,5243,32897,32833]
; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
  ret <8 x i16> %1
}
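; Divisors cover the i16 edge cases (-32768, -1, 1 and 32767) alongside ordinary constants.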
define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $6, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psraw $12, %xmm5
; SSE2-NEXT: pandn %xmm5, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psraw $1, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,256,256,u,u,512,256,8]
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,256,256,u,u,512,256,8]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,256,256,u,u,512,256,8]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32767,32767,32703,0,0,32897,32769,16385]
; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
  ret <8 x i16> %1
}
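; With only -1 and 1 as divisors, no multiply is needed: the result is a blend of 0-x and x.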
define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %1
}
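; PR38658: every lane divides by 1 except the last, which divides by 7, so only that lane needs the multiply/shift sequence.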
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,256,256,256,256,256,256,64]
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38658:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT: psraw $8, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $6, %xmm2
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: psrlw $7, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: pr38658:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pr38658:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,65427]
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,64]
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: pr38658:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,65427]
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pr38658:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,65427]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; XOP-LABEL: pr38658:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
  ret <16 x i8> %1
}
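; An i1 sdiv is a no-op: a non-zero i1 divisor is -1, and negating an i1 value leaves it unchanged.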
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %r = sdiv i1 %x, %y
  ret i1 %r
}
define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %r = sdiv <4 x i1> %x, %y
  ret <4 x i1> %r
}
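; Scalar sdiv by positive or negative powers of two should lower to a sign fixup plus shifts, never a real divide.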
define i32 @combine_sdiv_two(i32 %x) {
; CHECK-LABEL: combine_sdiv_two:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, 2
  ret i32 %1
}
define i32 @combine_sdiv_negtwo(i32 %x) {
; CHECK-LABEL: combine_sdiv_negtwo:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -2
  ret i32 %1
}
define i8 @combine_i8_sdiv_pow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $4, %al
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $4, %al
; CHECK-NEXT: retq
  %1 = sdiv i8 %x, 16
  ret i8 %1
}
define i8 @combine_i8_sdiv_negpow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $2, %al
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $6, %al
; CHECK-NEXT: negb %al
; CHECK-NEXT: retq
  %1 = sdiv i8 %x, -64
  ret i8 %1
}
define i16 @combine_i16_sdiv_pow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: cwtl
; CHECK-NEXT: shrl $4, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
  %1 = sdiv i16 %x, 16
  ret i16 %1
}
define i16 @combine_i16_sdiv_negpow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: cwtl
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
  %1 = sdiv i16 %x, -256
  ret i16 %1
}
define i32 @combine_i32_sdiv_pow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $4, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, 16
  ret i32 %1
}
define i32 @combine_i32_sdiv_negpow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -256
  ret i32 %1
}
define i64 @combine_i64_sdiv_pow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_pow2:
; CHECK: # %bb.0:
; CHECK-NEXT: leaq 15(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $4, %rax
; CHECK-NEXT: retq
  %1 = sdiv i64 %x, 16
  ret i64 %1
}
define i64 @combine_i64_sdiv_negpow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_negpow2:
; CHECK: # %bb.0:
; CHECK-NEXT: leaq 255(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $8, %rax
; CHECK-NEXT: negq %rax
; CHECK-NEXT: retq
  %1 = sdiv i64 %x, -256
  ret i64 %1
}