1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP
10 ; fold (sdiv x, 1) -> x
11 define i32 @combine_sdiv_by_one(i32 %x) {
12 ; CHECK-LABEL: combine_sdiv_by_one:
14 ; CHECK-NEXT: movl %edi, %eax
20 define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
21 ; CHECK-LABEL: combine_vec_sdiv_by_one:
24 %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
28 ; fold (sdiv x, -1) -> 0 - x
29 define i32 @combine_sdiv_by_negone(i32 %x) {
30 ; CHECK-LABEL: combine_sdiv_by_negone:
32 ; CHECK-NEXT: movl %edi, %eax
33 ; CHECK-NEXT: negl %eax
39 define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
40 ; SSE-LABEL: combine_vec_sdiv_by_negone:
42 ; SSE-NEXT: pxor %xmm1, %xmm1
43 ; SSE-NEXT: psubd %xmm0, %xmm1
44 ; SSE-NEXT: movdqa %xmm1, %xmm0
47 ; AVX-LABEL: combine_vec_sdiv_by_negone:
49 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
50 ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
52 %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
56 ; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
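; Only x == INT_MIN yields a nonzero quotient here, since |x| < |INT_MIN| for
; every other i32 value:
;   sdiv INT_MIN, INT_MIN = 1
;   sdiv       7, INT_MIN = 0
;   sdiv      -7, INT_MIN = 0
; Hence the scalar cmp/sete below, and the vector pcmpeqd followed by a
; logical shift of the all-ones mask down to 0 or 1.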
57 define i32 @combine_sdiv_by_minsigned(i32 %x) {
58 ; CHECK-LABEL: combine_sdiv_by_minsigned:
60 ; CHECK-NEXT: xorl %eax, %eax
61 ; CHECK-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
62 ; CHECK-NEXT: sete %al
64 %1 = sdiv i32 %x, -2147483648
68 define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
69 ; SSE-LABEL: combine_vec_sdiv_by_minsigned:
71 ; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
72 ; SSE-NEXT: psrld $31, %xmm0
75 ; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
77 ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
78 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
81 ; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
83 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
84 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
85 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
88 ; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
90 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
91 ; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
92 ; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
95 ; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
97 ; AVX512BW-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
98 ; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
101 ; XOP-LABEL: combine_vec_sdiv_by_minsigned:
103 ; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
104 ; XOP-NEXT: vpsrld $31, %xmm0, %xmm0
106 %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
110 ; fold (sdiv 0, x) -> 0
111 define i32 @combine_sdiv_zero(i32 %x) {
112 ; CHECK-LABEL: combine_sdiv_zero:
114 ; CHECK-NEXT: xorl %eax, %eax
120 define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
121 ; SSE-LABEL: combine_vec_sdiv_zero:
123 ; SSE-NEXT: xorps %xmm0, %xmm0
126 ; AVX-LABEL: combine_vec_sdiv_zero:
128 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
130 %1 = sdiv <4 x i32> zeroinitializer, %x
134 ; fold (sdiv x, x) -> 1
135 define i32 @combine_sdiv_dupe(i32 %x) {
136 ; CHECK-LABEL: combine_sdiv_dupe:
138 ; CHECK-NEXT: movl $1, %eax
144 define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
145 ; SSE-LABEL: combine_vec_sdiv_dupe:
147 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
150 ; AVX-LABEL: combine_vec_sdiv_dupe:
152 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
154 %1 = sdiv <4 x i32> %x, %x
158 ; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
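; The tests below mask the dividend with 255 first, so its sign bit is known
; zero and the signed division by 4 can use a plain logical shift; the checked
; output shifts by 2 and then re-masks with 63. Illustrative scalar sketch
; (not part of the test):
;   %m = and i32 %x, 255   ; value now in [0, 255]
;   %d = udiv i32 %m, 4    ; same result as sdiv %m, 4; lowers to lshr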
159 define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
160 ; SSE-LABEL: combine_vec_sdiv_by_pos0:
162 ; SSE-NEXT: psrld $2, %xmm0
163 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
166 ; AVX1-LABEL: combine_vec_sdiv_by_pos0:
168 ; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
169 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
172 ; AVX2-LABEL: combine_vec_sdiv_by_pos0:
174 ; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
175 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
176 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
179 ; AVX512F-LABEL: combine_vec_sdiv_by_pos0:
181 ; AVX512F-NEXT: vpsrld $2, %xmm0, %xmm0
182 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
183 ; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
186 ; AVX512BW-LABEL: combine_vec_sdiv_by_pos0:
188 ; AVX512BW-NEXT: vpsrld $2, %xmm0, %xmm0
189 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
190 ; AVX512BW-NEXT: retq
192 ; XOP-LABEL: combine_vec_sdiv_by_pos0:
194 ; XOP-NEXT: vpsrld $2, %xmm0, %xmm0
195 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
197 %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
198 %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
202 define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
203 ; SSE2-LABEL: combine_vec_sdiv_by_pos1:
205 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
206 ; SSE2-NEXT: movdqa %xmm0, %xmm1
207 ; SSE2-NEXT: psrld $4, %xmm1
208 ; SSE2-NEXT: movdqa %xmm0, %xmm2
209 ; SSE2-NEXT: psrld $3, %xmm2
210 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
211 ; SSE2-NEXT: movdqa %xmm0, %xmm1
212 ; SSE2-NEXT: psrld $2, %xmm1
213 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
214 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
217 ; SSE41-LABEL: combine_vec_sdiv_by_pos1:
219 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
220 ; SSE41-NEXT: movdqa %xmm0, %xmm1
221 ; SSE41-NEXT: psrld $4, %xmm1
222 ; SSE41-NEXT: movdqa %xmm0, %xmm2
223 ; SSE41-NEXT: psrld $2, %xmm2
224 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
225 ; SSE41-NEXT: movdqa %xmm0, %xmm1
226 ; SSE41-NEXT: psrld $3, %xmm1
227 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
228 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
231 ; AVX1-LABEL: combine_vec_sdiv_by_pos1:
233 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
234 ; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
235 ; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
236 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
237 ; AVX1-NEXT: vpsrld $3, %xmm0, %xmm2
238 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
239 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
242 ; AVX2-LABEL: combine_vec_sdiv_by_pos1:
244 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
245 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
248 ; AVX512F-LABEL: combine_vec_sdiv_by_pos1:
250 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
251 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
254 ; AVX512BW-LABEL: combine_vec_sdiv_by_pos1:
256 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
257 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
258 ; AVX512BW-NEXT: retq
260 ; XOP-LABEL: combine_vec_sdiv_by_pos1:
262 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
263 ; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
265 %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
266 %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
270 ; fold (sdiv x, (1 << c)) -> (x + ((x >>s (bw-1)) >>u (bw-c))) >>s c
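; Worked i32 example for x = -7, divisor 4 (c = 2), matching the
; psrad/psrld/paddd/psrad sequence checked below:
;   sign = x >>s 31         = -1 (all ones)
;   bias = sign >>u (32-2)  = 3
;   q    = (x + bias) >>s 2 = (-7 + 3) >>s 2 = -1   ; trunc(-7/4)
; Without the bias a plain arithmetic shift would give -2, i.e. it would
; round towards negative infinity instead of towards zero.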
271 define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
272 ; SSE-LABEL: combine_vec_sdiv_by_pow2a:
274 ; SSE-NEXT: movdqa %xmm0, %xmm1
275 ; SSE-NEXT: psrad $31, %xmm1
276 ; SSE-NEXT: psrld $30, %xmm1
277 ; SSE-NEXT: paddd %xmm1, %xmm0
278 ; SSE-NEXT: psrad $2, %xmm0
281 ; AVX-LABEL: combine_vec_sdiv_by_pow2a:
283 ; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
284 ; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
285 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
286 ; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
288 %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
292 define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
293 ; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
295 ; SSE-NEXT: movdqa %xmm0, %xmm1
296 ; SSE-NEXT: psrad $31, %xmm1
297 ; SSE-NEXT: psrld $30, %xmm1
298 ; SSE-NEXT: paddd %xmm0, %xmm1
299 ; SSE-NEXT: psrad $2, %xmm1
300 ; SSE-NEXT: pxor %xmm0, %xmm0
301 ; SSE-NEXT: psubd %xmm1, %xmm0
304 ; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
306 ; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
307 ; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
308 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
309 ; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
310 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
311 ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
313 %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
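; The pow2b tests use a different power-of-two divisor per lane, so both the
; bias and the final shift become per-lane variable shifts. For the v16i8
; case below, targets without variable byte shifts emulate them in the
; checked output by widening to i16 and multiplying by per-lane powers of two
; (pmullw with [256,4,2,16,...]) followed by psrlw $8; the divisor-1 lanes
; (elements 0 and 8) are blended back from the unmodified input.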
317 define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
318 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
320 ; SSE2-NEXT: pxor %xmm1, %xmm1
321 ; SSE2-NEXT: pxor %xmm2, %xmm2
322 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
323 ; SSE2-NEXT: movdqa %xmm2, %xmm3
324 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
325 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
326 ; SSE2-NEXT: pmullw %xmm4, %xmm3
327 ; SSE2-NEXT: psrlw $8, %xmm3
328 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
329 ; SSE2-NEXT: pmullw %xmm4, %xmm2
330 ; SSE2-NEXT: psrlw $8, %xmm2
331 ; SSE2-NEXT: packuswb %xmm3, %xmm2
332 ; SSE2-NEXT: paddb %xmm0, %xmm2
333 ; SSE2-NEXT: movdqa %xmm2, %xmm1
334 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
335 ; SSE2-NEXT: psraw $8, %xmm1
336 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
337 ; SSE2-NEXT: pmullw %xmm3, %xmm1
338 ; SSE2-NEXT: psrlw $8, %xmm1
339 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
340 ; SSE2-NEXT: psraw $8, %xmm2
341 ; SSE2-NEXT: pmullw %xmm3, %xmm2
342 ; SSE2-NEXT: psrlw $8, %xmm2
343 ; SSE2-NEXT: packuswb %xmm1, %xmm2
344 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
345 ; SSE2-NEXT: pand %xmm1, %xmm2
346 ; SSE2-NEXT: pandn %xmm0, %xmm1
347 ; SSE2-NEXT: por %xmm2, %xmm1
348 ; SSE2-NEXT: movdqa %xmm1, %xmm0
351 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
353 ; SSE41-NEXT: movdqa %xmm0, %xmm1
354 ; SSE41-NEXT: pxor %xmm0, %xmm0
355 ; SSE41-NEXT: pxor %xmm3, %xmm3
356 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
357 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
358 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
359 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
360 ; SSE41-NEXT: pmullw %xmm0, %xmm3
361 ; SSE41-NEXT: psrlw $8, %xmm3
362 ; SSE41-NEXT: pmullw %xmm0, %xmm2
363 ; SSE41-NEXT: psrlw $8, %xmm2
364 ; SSE41-NEXT: packuswb %xmm3, %xmm2
365 ; SSE41-NEXT: paddb %xmm1, %xmm2
366 ; SSE41-NEXT: movdqa %xmm2, %xmm0
367 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
368 ; SSE41-NEXT: psraw $8, %xmm0
369 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
370 ; SSE41-NEXT: pmullw %xmm3, %xmm0
371 ; SSE41-NEXT: psrlw $8, %xmm0
372 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
373 ; SSE41-NEXT: psraw $8, %xmm2
374 ; SSE41-NEXT: pmullw %xmm3, %xmm2
375 ; SSE41-NEXT: psrlw $8, %xmm2
376 ; SSE41-NEXT: packuswb %xmm0, %xmm2
377 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
378 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
379 ; SSE41-NEXT: movdqa %xmm1, %xmm0
382 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
384 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
385 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
386 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
387 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2]
388 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
389 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
390 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
391 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
392 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
393 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
394 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
395 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
396 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
397 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
398 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
399 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
400 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
401 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
402 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
403 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
404 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
405 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
406 ; AVX1-NEXT: # xmm2 = mem[0,0]
407 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
410 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
412 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
413 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
414 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
415 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
416 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
417 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
418 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
419 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
420 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
421 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
422 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
423 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
424 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
425 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
426 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
427 ; AVX2-NEXT: vzeroupper
430 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
432 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
433 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
434 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
435 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
436 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
437 ; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
438 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
439 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
440 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
441 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
442 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
443 ; AVX512F-NEXT: vzeroupper
446 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
448 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
449 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
450 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
451 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
452 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
453 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1
454 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
455 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
456 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
457 ; AVX512BW-NEXT: movw $257, %ax # imm = 0x101
458 ; AVX512BW-NEXT: kmovd %eax, %k1
459 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
460 ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
461 ; AVX512BW-NEXT: vzeroupper
462 ; AVX512BW-NEXT: retq
464 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
466 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
467 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
468 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
469 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
470 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
471 ; XOP-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
472 ; XOP-NEXT: # xmm2 = mem[0,0]
473 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
475 %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
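; For i16 lanes the bias comes from a single pmulhuw: the psraw $15 sign mask
; is 0xFFFF for negative inputs, and (0xFFFF * d) >> 16 = d - 1 for a
; power-of-two divisor d, which is exactly the rounding bias. Worked lane,
; d = 16: 0xFFFF * 16 = 0xFFFF0, high half = 0x000F = 15. The final
; arithmetic shift is a signed pmulhw by 2^(16-c), with psraw $1 blended in
; for the divisor-2 lanes, whose multiplier would otherwise be 0x8000.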
479 define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
480 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
482 ; SSE2-NEXT: movdqa %xmm0, %xmm1
483 ; SSE2-NEXT: psraw $15, %xmm1
484 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
485 ; SSE2-NEXT: paddw %xmm0, %xmm1
486 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
487 ; SSE2-NEXT: movdqa %xmm1, %xmm3
488 ; SSE2-NEXT: pand %xmm2, %xmm3
489 ; SSE2-NEXT: psraw $4, %xmm1
490 ; SSE2-NEXT: pandn %xmm1, %xmm2
491 ; SSE2-NEXT: por %xmm3, %xmm2
492 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
493 ; SSE2-NEXT: movdqa %xmm2, %xmm3
494 ; SSE2-NEXT: pand %xmm1, %xmm3
495 ; SSE2-NEXT: psraw $2, %xmm2
496 ; SSE2-NEXT: pandn %xmm2, %xmm1
497 ; SSE2-NEXT: por %xmm3, %xmm1
498 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
499 ; SSE2-NEXT: movdqa %xmm1, %xmm3
500 ; SSE2-NEXT: pand %xmm2, %xmm3
501 ; SSE2-NEXT: psraw $1, %xmm1
502 ; SSE2-NEXT: pandn %xmm1, %xmm2
503 ; SSE2-NEXT: por %xmm3, %xmm2
504 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
505 ; SSE2-NEXT: pand %xmm1, %xmm2
506 ; SSE2-NEXT: pandn %xmm0, %xmm1
507 ; SSE2-NEXT: por %xmm2, %xmm1
508 ; SSE2-NEXT: movdqa %xmm1, %xmm0
511 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
513 ; SSE41-NEXT: movdqa %xmm0, %xmm1
514 ; SSE41-NEXT: psraw $15, %xmm1
515 ; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
516 ; SSE41-NEXT: paddw %xmm0, %xmm1
517 ; SSE41-NEXT: movdqa %xmm1, %xmm2
518 ; SSE41-NEXT: psraw $1, %xmm2
519 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
520 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
521 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
524 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
526 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
527 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
528 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
529 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
530 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
531 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
532 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
535 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
537 ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
538 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
539 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
540 ; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2
541 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
542 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
543 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
546 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
548 ; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1
549 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
550 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1
551 ; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
552 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
553 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
554 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
555 ; AVX512F-NEXT: vzeroupper
558 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
560 ; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1
561 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
562 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1
563 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
564 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
565 ; AVX512BW-NEXT: retq
567 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
569 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm1
570 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
571 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1
572 ; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
573 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
575 %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
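; The wider i16 tests below repeat the v8i16 pattern per 128-bit half; the
; divisor-1 lanes (elements 0, 8, ...) are restored from the original vector,
; via pblendw/vpblendw, an and/andn/or (or vpcmov) against the
; [0,65535,...] mask on AVX1/XOP, or a masked vmovdqu16 on AVX512BW.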
579 define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
580 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
582 ; SSE2-NEXT: movdqa %xmm0, %xmm3
583 ; SSE2-NEXT: psraw $15, %xmm0
584 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
585 ; SSE2-NEXT: pmulhuw %xmm7, %xmm0
586 ; SSE2-NEXT: paddw %xmm3, %xmm0
587 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
588 ; SSE2-NEXT: movdqa %xmm0, %xmm2
589 ; SSE2-NEXT: pand %xmm4, %xmm2
590 ; SSE2-NEXT: psraw $4, %xmm0
591 ; SSE2-NEXT: movdqa %xmm4, %xmm6
592 ; SSE2-NEXT: pandn %xmm0, %xmm6
593 ; SSE2-NEXT: por %xmm2, %xmm6
594 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
595 ; SSE2-NEXT: movdqa %xmm6, %xmm0
596 ; SSE2-NEXT: pand %xmm5, %xmm0
597 ; SSE2-NEXT: psraw $2, %xmm6
598 ; SSE2-NEXT: movdqa %xmm5, %xmm2
599 ; SSE2-NEXT: pandn %xmm6, %xmm2
600 ; SSE2-NEXT: por %xmm0, %xmm2
601 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,0]
602 ; SSE2-NEXT: movdqa %xmm2, %xmm0
603 ; SSE2-NEXT: pand %xmm6, %xmm0
604 ; SSE2-NEXT: psraw $1, %xmm2
605 ; SSE2-NEXT: movdqa %xmm6, %xmm8
606 ; SSE2-NEXT: pandn %xmm2, %xmm8
607 ; SSE2-NEXT: por %xmm0, %xmm8
608 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
609 ; SSE2-NEXT: pand %xmm2, %xmm8
610 ; SSE2-NEXT: movdqa %xmm2, %xmm0
611 ; SSE2-NEXT: pandn %xmm3, %xmm0
612 ; SSE2-NEXT: por %xmm8, %xmm0
613 ; SSE2-NEXT: movdqa %xmm1, %xmm3
614 ; SSE2-NEXT: psraw $15, %xmm3
615 ; SSE2-NEXT: pmulhuw %xmm7, %xmm3
616 ; SSE2-NEXT: paddw %xmm1, %xmm3
617 ; SSE2-NEXT: movdqa %xmm3, %xmm7
618 ; SSE2-NEXT: pand %xmm4, %xmm7
619 ; SSE2-NEXT: psraw $4, %xmm3
620 ; SSE2-NEXT: pandn %xmm3, %xmm4
621 ; SSE2-NEXT: por %xmm7, %xmm4
622 ; SSE2-NEXT: movdqa %xmm4, %xmm3
623 ; SSE2-NEXT: pand %xmm5, %xmm3
624 ; SSE2-NEXT: psraw $2, %xmm4
625 ; SSE2-NEXT: pandn %xmm4, %xmm5
626 ; SSE2-NEXT: por %xmm3, %xmm5
627 ; SSE2-NEXT: movdqa %xmm5, %xmm3
628 ; SSE2-NEXT: pand %xmm6, %xmm3
629 ; SSE2-NEXT: psraw $1, %xmm5
630 ; SSE2-NEXT: pandn %xmm5, %xmm6
631 ; SSE2-NEXT: por %xmm3, %xmm6
632 ; SSE2-NEXT: pand %xmm2, %xmm6
633 ; SSE2-NEXT: pandn %xmm1, %xmm2
634 ; SSE2-NEXT: por %xmm6, %xmm2
635 ; SSE2-NEXT: movdqa %xmm2, %xmm1
638 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
640 ; SSE41-NEXT: movdqa %xmm0, %xmm2
641 ; SSE41-NEXT: psraw $15, %xmm2
642 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
643 ; SSE41-NEXT: pmulhuw %xmm3, %xmm2
644 ; SSE41-NEXT: paddw %xmm0, %xmm2
645 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,16384,32768,4096,8192,2048,1024,32768>
646 ; SSE41-NEXT: movdqa %xmm2, %xmm5
647 ; SSE41-NEXT: pmulhw %xmm4, %xmm5
648 ; SSE41-NEXT: psraw $1, %xmm2
649 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
650 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
651 ; SSE41-NEXT: movdqa %xmm1, %xmm2
652 ; SSE41-NEXT: psraw $15, %xmm2
653 ; SSE41-NEXT: pmulhuw %xmm3, %xmm2
654 ; SSE41-NEXT: paddw %xmm1, %xmm2
655 ; SSE41-NEXT: pmulhw %xmm2, %xmm4
656 ; SSE41-NEXT: psraw $1, %xmm2
657 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6],xmm2[7]
658 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
661 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
663 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
664 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
665 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
666 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
667 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
668 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
669 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4
670 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
671 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
672 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4
673 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3
674 ; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3
675 ; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2
676 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
677 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
678 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
679 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
680 ; AVX1-NEXT: # ymm2 = mem[0,1,0,1]
681 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
682 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
683 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
686 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
688 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
689 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
690 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
691 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2
692 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
693 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
694 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
697 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
699 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
700 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
701 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
702 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
703 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
704 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
705 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
708 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
710 ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1
711 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
712 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1
713 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
714 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
715 ; AVX512BW-NEXT: retq
717 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
719 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
720 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2
721 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521>
722 ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2
723 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1
724 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535>
725 ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1
726 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4
727 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3
728 ; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3
729 ; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2
730 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
731 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
732 ; XOP-NEXT: # ymm2 = mem[0,1,0,1]
733 ; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
735 %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
739 define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
740 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
742 ; SSE2-NEXT: movdqa %xmm1, %xmm5
743 ; SSE2-NEXT: movdqa %xmm0, %xmm1
744 ; SSE2-NEXT: psraw $15, %xmm0
745 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
746 ; SSE2-NEXT: pmulhuw %xmm9, %xmm0
747 ; SSE2-NEXT: paddw %xmm1, %xmm0
748 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,0,0,65535]
749 ; SSE2-NEXT: movdqa %xmm0, %xmm4
750 ; SSE2-NEXT: pand %xmm6, %xmm4
751 ; SSE2-NEXT: psraw $4, %xmm0
752 ; SSE2-NEXT: movdqa %xmm6, %xmm8
753 ; SSE2-NEXT: pandn %xmm0, %xmm8
754 ; SSE2-NEXT: por %xmm4, %xmm8
755 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
756 ; SSE2-NEXT: movdqa %xmm8, %xmm0
757 ; SSE2-NEXT: pand %xmm7, %xmm0
758 ; SSE2-NEXT: psraw $2, %xmm8
759 ; SSE2-NEXT: movdqa %xmm7, %xmm4
760 ; SSE2-NEXT: pandn %xmm8, %xmm4
761 ; SSE2-NEXT: por %xmm0, %xmm4
762 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,0,65535,0]
763 ; SSE2-NEXT: movdqa %xmm4, %xmm0
764 ; SSE2-NEXT: pand %xmm8, %xmm0
765 ; SSE2-NEXT: psraw $1, %xmm4
766 ; SSE2-NEXT: movdqa %xmm8, %xmm10
767 ; SSE2-NEXT: pandn %xmm4, %xmm10
768 ; SSE2-NEXT: por %xmm0, %xmm10
769 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,65535]
770 ; SSE2-NEXT: pand %xmm4, %xmm10
771 ; SSE2-NEXT: movdqa %xmm4, %xmm0
772 ; SSE2-NEXT: pandn %xmm1, %xmm0
773 ; SSE2-NEXT: por %xmm10, %xmm0
774 ; SSE2-NEXT: movdqa %xmm5, %xmm1
775 ; SSE2-NEXT: psraw $15, %xmm1
776 ; SSE2-NEXT: pmulhuw %xmm9, %xmm1
777 ; SSE2-NEXT: paddw %xmm5, %xmm1
778 ; SSE2-NEXT: movdqa %xmm1, %xmm10
779 ; SSE2-NEXT: pand %xmm6, %xmm10
780 ; SSE2-NEXT: psraw $4, %xmm1
781 ; SSE2-NEXT: movdqa %xmm6, %xmm11
782 ; SSE2-NEXT: pandn %xmm1, %xmm11
783 ; SSE2-NEXT: por %xmm10, %xmm11
784 ; SSE2-NEXT: movdqa %xmm11, %xmm1
785 ; SSE2-NEXT: pand %xmm7, %xmm1
786 ; SSE2-NEXT: psraw $2, %xmm11
787 ; SSE2-NEXT: movdqa %xmm7, %xmm10
788 ; SSE2-NEXT: pandn %xmm11, %xmm10
789 ; SSE2-NEXT: por %xmm1, %xmm10
790 ; SSE2-NEXT: movdqa %xmm10, %xmm1
791 ; SSE2-NEXT: pand %xmm8, %xmm1
792 ; SSE2-NEXT: psraw $1, %xmm10
793 ; SSE2-NEXT: movdqa %xmm8, %xmm11
794 ; SSE2-NEXT: pandn %xmm10, %xmm11
795 ; SSE2-NEXT: por %xmm1, %xmm11
796 ; SSE2-NEXT: pand %xmm4, %xmm11
797 ; SSE2-NEXT: movdqa %xmm4, %xmm1
798 ; SSE2-NEXT: pandn %xmm5, %xmm1
799 ; SSE2-NEXT: por %xmm11, %xmm1
800 ; SSE2-NEXT: movdqa %xmm2, %xmm5
801 ; SSE2-NEXT: psraw $15, %xmm5
802 ; SSE2-NEXT: pmulhuw %xmm9, %xmm5
803 ; SSE2-NEXT: paddw %xmm2, %xmm5
804 ; SSE2-NEXT: movdqa %xmm5, %xmm10
805 ; SSE2-NEXT: pand %xmm6, %xmm10
806 ; SSE2-NEXT: psraw $4, %xmm5
807 ; SSE2-NEXT: movdqa %xmm6, %xmm11
808 ; SSE2-NEXT: pandn %xmm5, %xmm11
809 ; SSE2-NEXT: por %xmm10, %xmm11
810 ; SSE2-NEXT: movdqa %xmm11, %xmm5
811 ; SSE2-NEXT: pand %xmm7, %xmm5
812 ; SSE2-NEXT: psraw $2, %xmm11
813 ; SSE2-NEXT: movdqa %xmm7, %xmm10
814 ; SSE2-NEXT: pandn %xmm11, %xmm10
815 ; SSE2-NEXT: por %xmm5, %xmm10
816 ; SSE2-NEXT: movdqa %xmm10, %xmm5
817 ; SSE2-NEXT: pand %xmm8, %xmm5
818 ; SSE2-NEXT: psraw $1, %xmm10
819 ; SSE2-NEXT: movdqa %xmm8, %xmm11
820 ; SSE2-NEXT: pandn %xmm10, %xmm11
821 ; SSE2-NEXT: por %xmm5, %xmm11
822 ; SSE2-NEXT: pand %xmm4, %xmm11
823 ; SSE2-NEXT: movdqa %xmm4, %xmm5
824 ; SSE2-NEXT: pandn %xmm2, %xmm5
825 ; SSE2-NEXT: por %xmm11, %xmm5
826 ; SSE2-NEXT: movdqa %xmm3, %xmm2
827 ; SSE2-NEXT: psraw $15, %xmm2
828 ; SSE2-NEXT: pmulhuw %xmm9, %xmm2
829 ; SSE2-NEXT: paddw %xmm3, %xmm2
830 ; SSE2-NEXT: movdqa %xmm2, %xmm9
831 ; SSE2-NEXT: pand %xmm6, %xmm9
832 ; SSE2-NEXT: psraw $4, %xmm2
833 ; SSE2-NEXT: pandn %xmm2, %xmm6
834 ; SSE2-NEXT: por %xmm9, %xmm6
835 ; SSE2-NEXT: movdqa %xmm6, %xmm2
836 ; SSE2-NEXT: pand %xmm7, %xmm2
837 ; SSE2-NEXT: psraw $2, %xmm6
838 ; SSE2-NEXT: pandn %xmm6, %xmm7
839 ; SSE2-NEXT: por %xmm2, %xmm7
840 ; SSE2-NEXT: movdqa %xmm7, %xmm2
841 ; SSE2-NEXT: pand %xmm8, %xmm2
842 ; SSE2-NEXT: psraw $1, %xmm7
843 ; SSE2-NEXT: pandn %xmm7, %xmm8
844 ; SSE2-NEXT: por %xmm2, %xmm8
845 ; SSE2-NEXT: pand %xmm4, %xmm8
846 ; SSE2-NEXT: pandn %xmm3, %xmm4
847 ; SSE2-NEXT: por %xmm8, %xmm4
848 ; SSE2-NEXT: movdqa %xmm5, %xmm2
849 ; SSE2-NEXT: movdqa %xmm4, %xmm3
852 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
854 ; SSE41-NEXT: movdqa %xmm0, %xmm6
855 ; SSE41-NEXT: psraw $15, %xmm6
856 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,4,2,16,8,32,64,2>
857 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
858 ; SSE41-NEXT: paddw %xmm0, %xmm6
859 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,16384,32768,4096,8192,2048,1024,32768>
860 ; SSE41-NEXT: movdqa %xmm6, %xmm7
861 ; SSE41-NEXT: pmulhw %xmm4, %xmm7
862 ; SSE41-NEXT: psraw $1, %xmm6
863 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
864 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7]
865 ; SSE41-NEXT: movdqa %xmm1, %xmm6
866 ; SSE41-NEXT: psraw $15, %xmm6
867 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
868 ; SSE41-NEXT: paddw %xmm1, %xmm6
869 ; SSE41-NEXT: movdqa %xmm6, %xmm7
870 ; SSE41-NEXT: pmulhw %xmm4, %xmm7
871 ; SSE41-NEXT: psraw $1, %xmm6
872 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
873 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7]
874 ; SSE41-NEXT: movdqa %xmm2, %xmm6
875 ; SSE41-NEXT: psraw $15, %xmm6
876 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
877 ; SSE41-NEXT: paddw %xmm2, %xmm6
878 ; SSE41-NEXT: movdqa %xmm6, %xmm7
879 ; SSE41-NEXT: pmulhw %xmm4, %xmm7
880 ; SSE41-NEXT: psraw $1, %xmm6
881 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
882 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7]
883 ; SSE41-NEXT: movdqa %xmm3, %xmm6
884 ; SSE41-NEXT: psraw $15, %xmm6
885 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
886 ; SSE41-NEXT: paddw %xmm3, %xmm6
887 ; SSE41-NEXT: pmulhw %xmm6, %xmm4
888 ; SSE41-NEXT: psraw $1, %xmm6
889 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7]
890 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3,4,5,6,7]
893 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
895 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
896 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
897 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
898 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3
899 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
900 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
901 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5
902 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
903 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
904 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5
905 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5
906 ; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5
907 ; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6
908 ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5
909 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
910 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
911 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
912 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
913 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
914 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
915 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
916 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
917 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6
918 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6
919 ; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2
920 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm6
921 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
922 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7]
923 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6
924 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4
925 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4
926 ; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
927 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4
928 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
929 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
930 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
931 ; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1
932 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
935 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
937 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2
938 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
939 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
940 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
941 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2
942 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768]
943 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
944 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5
945 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
946 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15]
947 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
948 ; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2
949 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
950 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2
951 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3
952 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
953 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
954 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
957 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
959 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
960 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
961 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
962 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
963 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
964 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
965 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
966 ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
967 ; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1
968 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
969 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
970 ; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5
971 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm5, %ymm2
972 ; AVX512F-NEXT: vpaddw %ymm2, %ymm4, %ymm2
973 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
974 ; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2
975 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
976 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
977 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
978 ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
979 ; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
982 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
984 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1
985 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
986 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
987 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
988 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
989 ; AVX512BW-NEXT: kmovd %eax, %k1
990 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
991 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
992 ; AVX512BW-NEXT: retq
994 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
996 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
997 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3
998 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,65522,65521,65524,65523,65525,65526,65521>
999 ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3
1000 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2
1001 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535>
1002 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
1003 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5
1004 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5
1005 ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5
1006 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5
1007 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
1008 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1009 ; XOP-NEXT: # ymm5 = mem[0,1,0,1]
1010 ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0
1011 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1012 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6
1013 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6
1014 ; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2
1015 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
1016 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm6
1017 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4
1018 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4
1019 ; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3
1020 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1021 ; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1
1023 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
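; For the i32 pow2b tests below the bias is the psrad $31 sign word shifted
; right logically by (32 - c) per lane (psrld $30/$29/$28, or a single
; vpsrlvd/vpshld where available), the quotient is a per-lane arithmetic
; shift, and the divisor-1 lane is blended back from the unmodified input.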
1027 define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
1028 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1030 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1031 ; SSE2-NEXT: psrad $31, %xmm1
1032 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1033 ; SSE2-NEXT: psrld $28, %xmm2
1034 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1035 ; SSE2-NEXT: psrld $29, %xmm3
1036 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1037 ; SSE2-NEXT: psrld $30, %xmm1
1038 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1039 ; SSE2-NEXT: paddd %xmm0, %xmm1
1040 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1041 ; SSE2-NEXT: psrad $4, %xmm2
1042 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1043 ; SSE2-NEXT: psrad $3, %xmm3
1044 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1045 ; SSE2-NEXT: psrad $2, %xmm1
1046 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1047 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1048 ; SSE2-NEXT: movaps %xmm1, %xmm0
1051 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1053 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1054 ; SSE41-NEXT: psrad $31, %xmm1
1055 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1056 ; SSE41-NEXT: psrld $28, %xmm2
1057 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1058 ; SSE41-NEXT: psrld $30, %xmm3
1059 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1060 ; SSE41-NEXT: psrld $29, %xmm1
1061 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1062 ; SSE41-NEXT: paddd %xmm0, %xmm1
1063 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1064 ; SSE41-NEXT: psrad $4, %xmm2
1065 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1066 ; SSE41-NEXT: psrad $2, %xmm3
1067 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1068 ; SSE41-NEXT: psrad $3, %xmm1
1069 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1070 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1073 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1075 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
1076 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1077 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
1078 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1079 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
1080 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1081 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1082 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1083 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1084 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1085 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1086 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1087 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1090 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1091 ; AVX2ORLATER: # %bb.0:
1092 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
1093 ; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1094 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1095 ; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1096 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1097 ; AVX2ORLATER-NEXT: retq
1099 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1101 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
1102 ; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1103 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1104 ; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1105 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1107 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
1111 define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
1112 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1114 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1115 ; SSE2-NEXT: psrad $31, %xmm0
1116 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1117 ; SSE2-NEXT: psrld $28, %xmm3
1118 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1119 ; SSE2-NEXT: psrld $29, %xmm4
1120 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1121 ; SSE2-NEXT: psrld $30, %xmm0
1122 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1123 ; SSE2-NEXT: paddd %xmm2, %xmm0
1124 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1125 ; SSE2-NEXT: psrad $4, %xmm3
1126 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1127 ; SSE2-NEXT: psrad $3, %xmm4
1128 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1129 ; SSE2-NEXT: psrad $2, %xmm0
1130 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1131 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1132 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1133 ; SSE2-NEXT: psrad $31, %xmm2
1134 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1135 ; SSE2-NEXT: psrld $28, %xmm3
1136 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1137 ; SSE2-NEXT: psrld $29, %xmm4
1138 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1139 ; SSE2-NEXT: psrld $30, %xmm2
1140 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1141 ; SSE2-NEXT: paddd %xmm1, %xmm2
1142 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1143 ; SSE2-NEXT: psrad $4, %xmm3
1144 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1145 ; SSE2-NEXT: psrad $3, %xmm4
1146 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1147 ; SSE2-NEXT: psrad $2, %xmm2
1148 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1149 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1150 ; SSE2-NEXT: movaps %xmm2, %xmm1
1153 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1155 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1156 ; SSE41-NEXT: psrad $31, %xmm2
1157 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1158 ; SSE41-NEXT: psrld $28, %xmm3
1159 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1160 ; SSE41-NEXT: psrld $30, %xmm4
1161 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1162 ; SSE41-NEXT: psrld $29, %xmm2
1163 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1164 ; SSE41-NEXT: paddd %xmm0, %xmm2
1165 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1166 ; SSE41-NEXT: psrad $4, %xmm3
1167 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1168 ; SSE41-NEXT: psrad $2, %xmm4
1169 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1170 ; SSE41-NEXT: psrad $3, %xmm2
1171 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1172 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
1173 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1174 ; SSE41-NEXT: psrad $31, %xmm2
1175 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1176 ; SSE41-NEXT: psrld $28, %xmm3
1177 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1178 ; SSE41-NEXT: psrld $30, %xmm4
1179 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1180 ; SSE41-NEXT: psrld $29, %xmm2
1181 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1182 ; SSE41-NEXT: paddd %xmm1, %xmm2
1183 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1184 ; SSE41-NEXT: psrad $4, %xmm3
1185 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1186 ; SSE41-NEXT: psrad $2, %xmm4
1187 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1188 ; SSE41-NEXT: psrad $3, %xmm2
1189 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1190 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
1193 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1195 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1196 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
1197 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1198 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1199 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1200 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1201 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1202 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1203 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1204 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1205 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1206 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1207 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1208 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
1209 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1210 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1211 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1212 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1213 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1214 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
1215 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1216 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1217 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1218 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1219 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1220 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1221 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1224 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1225 ; AVX2ORLATER: # %bb.0:
1226 ; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1
1227 ; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1228 ; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1
1229 ; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1230 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1231 ; AVX2ORLATER-NEXT: retq
1233 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1235 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
1236 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2
1237 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268>
1238 ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2
1239 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1240 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292>
1241 ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1
1242 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4
1243 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3
1244 ; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1245 ; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2
1246 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1247 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1249 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1253 define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
1254 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1256 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1257 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1258 ; SSE2-NEXT: psrad $31, %xmm0
1259 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1260 ; SSE2-NEXT: psrld $28, %xmm5
1261 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1262 ; SSE2-NEXT: psrld $29, %xmm6
1263 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1264 ; SSE2-NEXT: psrld $30, %xmm0
1265 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1266 ; SSE2-NEXT: paddd %xmm1, %xmm0
1267 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1268 ; SSE2-NEXT: psrad $4, %xmm5
1269 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1270 ; SSE2-NEXT: psrad $3, %xmm6
1271 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1272 ; SSE2-NEXT: psrad $2, %xmm0
1273 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1274 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1275 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1276 ; SSE2-NEXT: psrad $31, %xmm1
1277 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1278 ; SSE2-NEXT: psrld $28, %xmm5
1279 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1280 ; SSE2-NEXT: psrld $29, %xmm6
1281 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1282 ; SSE2-NEXT: psrld $30, %xmm1
1283 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1284 ; SSE2-NEXT: paddd %xmm4, %xmm1
1285 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1286 ; SSE2-NEXT: psrad $4, %xmm5
1287 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1288 ; SSE2-NEXT: psrad $3, %xmm6
1289 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1290 ; SSE2-NEXT: psrad $2, %xmm1
1291 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1292 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
1293 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1294 ; SSE2-NEXT: psrad $31, %xmm4
1295 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1296 ; SSE2-NEXT: psrld $28, %xmm5
1297 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1298 ; SSE2-NEXT: psrld $29, %xmm6
1299 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1300 ; SSE2-NEXT: psrld $30, %xmm4
1301 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1302 ; SSE2-NEXT: paddd %xmm2, %xmm4
1303 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1304 ; SSE2-NEXT: psrad $4, %xmm5
1305 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1306 ; SSE2-NEXT: psrad $3, %xmm6
1307 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1308 ; SSE2-NEXT: psrad $2, %xmm4
1309 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1310 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1311 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1312 ; SSE2-NEXT: psrad $31, %xmm5
1313 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1314 ; SSE2-NEXT: psrld $28, %xmm2
1315 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1316 ; SSE2-NEXT: psrld $29, %xmm6
1317 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1318 ; SSE2-NEXT: psrld $30, %xmm5
1319 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1320 ; SSE2-NEXT: paddd %xmm3, %xmm5
1321 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1322 ; SSE2-NEXT: psrad $4, %xmm2
1323 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1324 ; SSE2-NEXT: psrad $3, %xmm6
1325 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1326 ; SSE2-NEXT: psrad $2, %xmm5
1327 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1328 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
1329 ; SSE2-NEXT: movaps %xmm4, %xmm2
1330 ; SSE2-NEXT: movaps %xmm5, %xmm3
1333 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1335 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1336 ; SSE41-NEXT: psrad $31, %xmm4
1337 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1338 ; SSE41-NEXT: psrld $28, %xmm5
1339 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1340 ; SSE41-NEXT: psrld $30, %xmm6
1341 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1342 ; SSE41-NEXT: psrld $29, %xmm4
1343 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1344 ; SSE41-NEXT: paddd %xmm0, %xmm4
1345 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1346 ; SSE41-NEXT: psrad $4, %xmm5
1347 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1348 ; SSE41-NEXT: psrad $2, %xmm6
1349 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1350 ; SSE41-NEXT: psrad $3, %xmm4
1351 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1352 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5,6,7]
1353 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1354 ; SSE41-NEXT: psrad $31, %xmm4
1355 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1356 ; SSE41-NEXT: psrld $28, %xmm5
1357 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1358 ; SSE41-NEXT: psrld $30, %xmm6
1359 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1360 ; SSE41-NEXT: psrld $29, %xmm4
1361 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1362 ; SSE41-NEXT: paddd %xmm1, %xmm4
1363 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1364 ; SSE41-NEXT: psrad $4, %xmm5
1365 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1366 ; SSE41-NEXT: psrad $2, %xmm6
1367 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1368 ; SSE41-NEXT: psrad $3, %xmm4
1369 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1370 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5,6,7]
1371 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1372 ; SSE41-NEXT: psrad $31, %xmm4
1373 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1374 ; SSE41-NEXT: psrld $28, %xmm5
1375 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1376 ; SSE41-NEXT: psrld $30, %xmm6
1377 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1378 ; SSE41-NEXT: psrld $29, %xmm4
1379 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1380 ; SSE41-NEXT: paddd %xmm2, %xmm4
1381 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1382 ; SSE41-NEXT: psrad $4, %xmm5
1383 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1384 ; SSE41-NEXT: psrad $2, %xmm6
1385 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1386 ; SSE41-NEXT: psrad $3, %xmm4
1387 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1388 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1389 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1390 ; SSE41-NEXT: psrad $31, %xmm4
1391 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1392 ; SSE41-NEXT: psrld $28, %xmm5
1393 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1394 ; SSE41-NEXT: psrld $30, %xmm6
1395 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1396 ; SSE41-NEXT: psrld $29, %xmm4
1397 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1398 ; SSE41-NEXT: paddd %xmm3, %xmm4
1399 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1400 ; SSE41-NEXT: psrad $4, %xmm5
1401 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1402 ; SSE41-NEXT: psrad $2, %xmm6
1403 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1404 ; SSE41-NEXT: psrad $3, %xmm4
1405 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1406 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
1409 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1411 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1412 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1413 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1414 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1415 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1416 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1417 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1418 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1419 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1420 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1421 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1422 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1423 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1424 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
1425 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1426 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1427 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1428 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1429 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1430 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1431 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1432 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1433 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1434 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1435 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1436 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1437 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1438 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1439 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1440 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1441 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1442 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1443 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1444 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1445 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1446 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1447 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1448 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1449 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1450 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1451 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
1452 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1453 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1454 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1455 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1456 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1457 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
1458 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1459 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1460 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1461 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1462 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1463 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1464 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1467 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1469 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
1470 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28]
1471 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1472 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1473 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
1474 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
1475 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1476 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1477 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1478 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
1479 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1480 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2
1481 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1482 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1485 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1487 ; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1
1488 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1489 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1490 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1491 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
1492 ; AVX512F-NEXT: kmovw %eax, %k1
1493 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1494 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1495 ; AVX512F-NEXT: retq
1497 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1498 ; AVX512BW: # %bb.0:
1499 ; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1
1500 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1501 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1502 ; AVX512BW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1503 ; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111
1504 ; AVX512BW-NEXT: kmovd %eax, %k1
1505 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1506 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1507 ; AVX512BW-NEXT: retq
1509 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1511 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1512 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3
1513 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268>
1514 ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3
1515 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1516 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292>
1517 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1518 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5
1519 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1520 ; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5
1521 ; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5
1522 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
1523 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1524 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1525 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm5
1526 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1527 ; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1528 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1529 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm5
1530 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4
1531 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4
1532 ; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3
1533 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1534 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1536 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1537 ret <16 x i32> %1
1538 }
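; 64-bit element power-of-two divisors: targets without a native 64-bit arithmetic right shift emulate it below with psrad/psrlq combinations or an xor/sub sign fixup, while AVX512 (vpsraq) and XOP (vpshaq) can shift directly.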
1540 define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
1541 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1543 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1544 ; SSE2-NEXT: psrad $31, %xmm1
1545 ; SSE2-NEXT: psrlq $62, %xmm1
1546 ; SSE2-NEXT: paddq %xmm0, %xmm1
1547 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1548 ; SSE2-NEXT: psrad $2, %xmm2
1549 ; SSE2-NEXT: psrlq $2, %xmm1
1550 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1551 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1552 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1555 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1557 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1558 ; SSE41-NEXT: psrad $31, %xmm1
1559 ; SSE41-NEXT: psrlq $62, %xmm1
1560 ; SSE41-NEXT: paddq %xmm0, %xmm1
1561 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1562 ; SSE41-NEXT: psrad $2, %xmm2
1563 ; SSE41-NEXT: psrlq $2, %xmm1
1564 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1565 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1568 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1570 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1571 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1572 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1
1573 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1574 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm2
1575 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
1576 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1577 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1580 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1582 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1583 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1584 ; AVX2-NEXT: vpsrlq $62, %xmm1, %xmm1
1585 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1586 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm2
1587 ; AVX2-NEXT: vpsrlq $2, %xmm1, %xmm1
1588 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
1589 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1592 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1594 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1595 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1596 ; AVX512F-NEXT: vpsrlq $62, %xmm1, %xmm1
1597 ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1598 ; AVX512F-NEXT: vpsraq $2, %zmm1, %zmm1
1599 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1600 ; AVX512F-NEXT: vzeroupper
1601 ; AVX512F-NEXT: retq
1603 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1604 ; AVX512BW: # %bb.0:
1605 ; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1
1606 ; AVX512BW-NEXT: vpsrlq $62, %xmm1, %xmm1
1607 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1608 ; AVX512BW-NEXT: vpsraq $2, %xmm1, %xmm1
1609 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1610 ; AVX512BW-NEXT: retq
1612 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1614 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1615 ; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1
1616 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1617 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1618 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1620 %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
1621 ret <2 x i64> %1
1622 }
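; The wider 256-bit and 512-bit variants below follow the same pattern: AVX1 and XOP split the vector into 128-bit halves, AVX2 uses variable shifts (vpsrlvq) plus the xor/sub sign fixup, and AVX512 uses vpsraq/vpsravq.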
1624 define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
1625 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1627 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1628 ; SSE2-NEXT: psrad $31, %xmm2
1629 ; SSE2-NEXT: psrlq $62, %xmm2
1630 ; SSE2-NEXT: paddq %xmm0, %xmm2
1631 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
1632 ; SSE2-NEXT: psrad $2, %xmm3
1633 ; SSE2-NEXT: psrlq $2, %xmm2
1634 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1635 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1636 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1637 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1638 ; SSE2-NEXT: psrad $31, %xmm2
1639 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1640 ; SSE2-NEXT: psrlq $61, %xmm3
1641 ; SSE2-NEXT: psrlq $60, %xmm2
1642 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
1643 ; SSE2-NEXT: paddq %xmm2, %xmm1
1644 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1645 ; SSE2-NEXT: psrlq $3, %xmm2
1646 ; SSE2-NEXT: psrlq $4, %xmm1
1647 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1648 ; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
1649 ; SSE2-NEXT: xorpd %xmm2, %xmm1
1650 ; SSE2-NEXT: psubq %xmm2, %xmm1
1653 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1655 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1656 ; SSE41-NEXT: psrad $31, %xmm2
1657 ; SSE41-NEXT: psrlq $62, %xmm2
1658 ; SSE41-NEXT: paddq %xmm0, %xmm2
1659 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1660 ; SSE41-NEXT: psrad $2, %xmm3
1661 ; SSE41-NEXT: psrlq $2, %xmm2
1662 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1663 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1664 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1665 ; SSE41-NEXT: psrad $31, %xmm2
1666 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1667 ; SSE41-NEXT: psrlq $60, %xmm3
1668 ; SSE41-NEXT: psrlq $61, %xmm2
1669 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1670 ; SSE41-NEXT: paddq %xmm2, %xmm1
1671 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1672 ; SSE41-NEXT: psrlq $4, %xmm2
1673 ; SSE41-NEXT: psrlq $3, %xmm1
1674 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1675 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
1676 ; SSE41-NEXT: pxor %xmm2, %xmm1
1677 ; SSE41-NEXT: psubq %xmm2, %xmm1
1680 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1682 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1683 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1684 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
1685 ; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4
1686 ; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3
1687 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1688 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
1689 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
1690 ; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1
1691 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1692 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
1693 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
1694 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
1695 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
1696 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1697 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1698 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm3
1699 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1700 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1701 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1702 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1705 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1707 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1708 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
1709 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1710 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1711 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1712 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1713 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
1714 ; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1
1715 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1718 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1720 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1721 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,2,3,4>
1722 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2
1723 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
1724 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
1725 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
1726 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1727 ; AVX512F-NEXT: retq
1729 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1730 ; AVX512BW: # %bb.0:
1731 ; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1
1732 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1733 ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1734 ; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1735 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1736 ; AVX512BW-NEXT: retq
1738 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1740 ; XOP-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
1741 ; XOP-NEXT: # xmm1 = mem[0,0]
1742 ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2
1743 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2
1744 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1745 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1746 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
1747 ; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1
1748 ; XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1749 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
1750 ; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1751 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1752 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1754 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
1755 ret <4 x i64> %1
1756 }
1758 define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
1759 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1761 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1762 ; SSE2-NEXT: psrad $31, %xmm4
1763 ; SSE2-NEXT: psrlq $62, %xmm4
1764 ; SSE2-NEXT: paddq %xmm0, %xmm4
1765 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
1766 ; SSE2-NEXT: psrad $2, %xmm5
1767 ; SSE2-NEXT: psrlq $2, %xmm4
1768 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1769 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1770 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
1771 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1772 ; SSE2-NEXT: psrad $31, %xmm4
1773 ; SSE2-NEXT: psrlq $62, %xmm4
1774 ; SSE2-NEXT: paddq %xmm2, %xmm4
1775 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
1776 ; SSE2-NEXT: psrad $2, %xmm5
1777 ; SSE2-NEXT: psrlq $2, %xmm4
1778 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1779 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1780 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
1781 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1782 ; SSE2-NEXT: psrad $31, %xmm4
1783 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1784 ; SSE2-NEXT: psrlq $61, %xmm5
1785 ; SSE2-NEXT: psrlq $60, %xmm4
1786 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
1787 ; SSE2-NEXT: paddq %xmm4, %xmm1
1788 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1789 ; SSE2-NEXT: psrlq $3, %xmm4
1790 ; SSE2-NEXT: psrlq $4, %xmm1
1791 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
1792 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1793 ; SSE2-NEXT: xorpd %xmm4, %xmm1
1794 ; SSE2-NEXT: psubq %xmm4, %xmm1
1795 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1796 ; SSE2-NEXT: psrad $31, %xmm5
1797 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1798 ; SSE2-NEXT: psrlq $61, %xmm6
1799 ; SSE2-NEXT: psrlq $60, %xmm5
1800 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
1801 ; SSE2-NEXT: paddq %xmm5, %xmm3
1802 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1803 ; SSE2-NEXT: psrlq $3, %xmm5
1804 ; SSE2-NEXT: psrlq $4, %xmm3
1805 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1]
1806 ; SSE2-NEXT: xorpd %xmm4, %xmm3
1807 ; SSE2-NEXT: psubq %xmm4, %xmm3
1810 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1812 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1813 ; SSE41-NEXT: psrad $31, %xmm4
1814 ; SSE41-NEXT: psrlq $62, %xmm4
1815 ; SSE41-NEXT: paddq %xmm0, %xmm4
1816 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1817 ; SSE41-NEXT: psrad $2, %xmm5
1818 ; SSE41-NEXT: psrlq $2, %xmm4
1819 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
1820 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
1821 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1822 ; SSE41-NEXT: psrad $31, %xmm4
1823 ; SSE41-NEXT: psrlq $62, %xmm4
1824 ; SSE41-NEXT: paddq %xmm2, %xmm4
1825 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1826 ; SSE41-NEXT: psrad $2, %xmm5
1827 ; SSE41-NEXT: psrlq $2, %xmm4
1828 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
1829 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1830 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1831 ; SSE41-NEXT: psrad $31, %xmm4
1832 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1833 ; SSE41-NEXT: psrlq $60, %xmm5
1834 ; SSE41-NEXT: psrlq $61, %xmm4
1835 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1836 ; SSE41-NEXT: paddq %xmm4, %xmm1
1837 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1838 ; SSE41-NEXT: psrlq $4, %xmm4
1839 ; SSE41-NEXT: psrlq $3, %xmm1
1840 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1841 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1842 ; SSE41-NEXT: pxor %xmm4, %xmm1
1843 ; SSE41-NEXT: psubq %xmm4, %xmm1
1844 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1845 ; SSE41-NEXT: psrad $31, %xmm5
1846 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1847 ; SSE41-NEXT: psrlq $60, %xmm6
1848 ; SSE41-NEXT: psrlq $61, %xmm5
1849 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1850 ; SSE41-NEXT: paddq %xmm5, %xmm3
1851 ; SSE41-NEXT: movdqa %xmm3, %xmm5
1852 ; SSE41-NEXT: psrlq $4, %xmm5
1853 ; SSE41-NEXT: psrlq $3, %xmm3
1854 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1855 ; SSE41-NEXT: pxor %xmm4, %xmm3
1856 ; SSE41-NEXT: psubq %xmm4, %xmm3
1859 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1861 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1862 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1863 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
1864 ; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5
1865 ; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4
1866 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1867 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
1868 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4
1869 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1870 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1871 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1872 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1873 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1874 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
1875 ; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5
1876 ; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5
1877 ; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6
1878 ; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5
1879 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1880 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1881 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1882 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1883 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5
1884 ; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6
1885 ; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5
1886 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1887 ; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
1888 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5
1889 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1890 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1891 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1892 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1893 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
1894 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1895 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2
1896 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1897 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2
1898 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1899 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1900 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1903 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1905 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1906 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1907 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,62,61,60>
1908 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3
1909 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3
1910 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,2,3,4>
1911 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3
1912 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1913 ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3
1914 ; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3
1915 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1916 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
1917 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2
1918 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2
1919 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2
1920 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
1921 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1922 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1925 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1927 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1928 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1929 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1930 ; AVX512F-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1931 ; AVX512F-NEXT: movb $17, %al
1932 ; AVX512F-NEXT: kmovw %eax, %k1
1933 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1934 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1935 ; AVX512F-NEXT: retq
1937 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1938 ; AVX512BW: # %bb.0:
1939 ; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1
1940 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1941 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1942 ; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1943 ; AVX512BW-NEXT: movb $17, %al
1944 ; AVX512BW-NEXT: kmovd %eax, %k1
1945 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1946 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1947 ; AVX512BW-NEXT: retq
1949 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1951 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1952 ; XOP-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
1953 ; XOP-NEXT: # xmm3 = mem[0,0]
1954 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4
1955 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556]
1956 ; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4
1957 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1958 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
1959 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1960 ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6
1961 ; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6
1962 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6
1963 ; XOP-NEXT: vmovddup {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614]
1964 ; XOP-NEXT: # xmm7 = mem[0,0]
1965 ; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6
1966 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
1967 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
1968 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1969 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6
1970 ; XOP-NEXT: vpshlq %xmm5, %xmm6, %xmm5
1971 ; XOP-NEXT: vpaddq %xmm5, %xmm2, %xmm2
1972 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1973 ; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3
1974 ; XOP-NEXT: vpsrlq $62, %xmm3, %xmm3
1975 ; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3
1976 ; XOP-NEXT: vpshaq %xmm7, %xmm3, %xmm3
1977 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1978 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1980 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
1981 ret <8 x i64> %1
1982 }
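; Mixed positive and negative power-of-two divisors: divide by the magnitude first, negate the lanes whose divisor is negative, and blend x back in for the divide-by-one lane.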
1984 define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
1985 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
1987 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1988 ; SSE2-NEXT: psrad $31, %xmm1
1989 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1990 ; SSE2-NEXT: psrld $28, %xmm2
1991 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1992 ; SSE2-NEXT: psrld $29, %xmm3
1993 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1994 ; SSE2-NEXT: psrld $30, %xmm1
1995 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1996 ; SSE2-NEXT: paddd %xmm0, %xmm1
1997 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1998 ; SSE2-NEXT: psrad $4, %xmm2
1999 ; SSE2-NEXT: movdqa %xmm1, %xmm3
2000 ; SSE2-NEXT: psrad $3, %xmm3
2001 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2002 ; SSE2-NEXT: psrad $2, %xmm1
2003 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
2004 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2005 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
2006 ; SSE2-NEXT: pxor %xmm2, %xmm2
2007 ; SSE2-NEXT: psubd %xmm1, %xmm2
2008 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
2009 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2012 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2014 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2015 ; SSE41-NEXT: psrad $31, %xmm1
2016 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2017 ; SSE41-NEXT: psrld $28, %xmm2
2018 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2019 ; SSE41-NEXT: psrld $30, %xmm3
2020 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2021 ; SSE41-NEXT: psrld $29, %xmm1
2022 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2023 ; SSE41-NEXT: paddd %xmm0, %xmm1
2024 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2025 ; SSE41-NEXT: psrad $4, %xmm2
2026 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2027 ; SSE41-NEXT: psrad $2, %xmm3
2028 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2029 ; SSE41-NEXT: pxor %xmm2, %xmm2
2030 ; SSE41-NEXT: psubd %xmm3, %xmm2
2031 ; SSE41-NEXT: psrad $3, %xmm1
2032 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2033 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2036 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2038 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
2039 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2040 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
2041 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2042 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
2043 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2044 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2045 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
2046 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
2047 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2048 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2049 ; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
2050 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
2051 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2052 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2055 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2056 ; AVX2ORLATER: # %bb.0:
2057 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
2058 ; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2059 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2060 ; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2061 ; AVX2ORLATER-NEXT: vpxor %xmm2, %xmm2, %xmm2
2062 ; AVX2ORLATER-NEXT: vpsubd %xmm1, %xmm2, %xmm2
2063 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2064 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
2065 ; AVX2ORLATER-NEXT: retq
2067 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2069 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
2070 ; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2071 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2072 ; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2073 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
2074 ; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm2
2075 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2076 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2078 %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
2079 ret <4 x i32> %1
2080 }
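; Divisors containing undef elements: the sdiv folds away, which is why the three undef tests below need only a single generic CHECK block.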
2082 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
2083 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
2086 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
2087 ret <4 x i32> %1
2088 }
2090 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
2091 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
2094 %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
2095 ret <4 x i32> %1
2096 }
2098 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
2099 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
2102 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
2103 ret <4 x i32> %1
2104 }
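; Divisors that are only +1/-1: the division reduces to negating the -1 lanes, implemented as xor/sub with a constant byte mask (or a masked vpsubb on AVX512BW).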
2107 define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
2108 ; SSE-LABEL: non_splat_minus_one_divisor_0:
2110 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2111 ; SSE-NEXT: pxor %xmm1, %xmm0
2112 ; SSE-NEXT: psubb %xmm1, %xmm0
2115 ; AVX1-LABEL: non_splat_minus_one_divisor_0:
2117 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2118 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2119 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2122 ; AVX2-LABEL: non_splat_minus_one_divisor_0:
2124 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2125 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
2126 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2129 ; AVX512F-LABEL: non_splat_minus_one_divisor_0:
2131 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2132 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
2133 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2134 ; AVX512F-NEXT: retq
2136 ; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
2137 ; AVX512BW: # %bb.0:
2138 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2139 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2140 ; AVX512BW-NEXT: kmovd %eax, %k1
2141 ; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1}
2142 ; AVX512BW-NEXT: retq
2144 ; XOP-LABEL: non_splat_minus_one_divisor_0:
2146 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2147 ; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
2148 ; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2150 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2151 ret <16 x i8> %div
2152 }
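; Divisors mixing -1, 2 and -128: the i8 power-of-two lanes are shifted by widening to i16 (vpshlb/vpshab on XOP), blended back with the original vector, and the lanes with a negative divisor are then negated via xor/sub.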
2154 define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
2155 ; SSE2-LABEL: non_splat_minus_one_divisor_1:
2157 ; SSE2-NEXT: pxor %xmm1, %xmm1
2158 ; SSE2-NEXT: pxor %xmm2, %xmm2
2159 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
2160 ; SSE2-NEXT: movdqa %xmm2, %xmm3
2161 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2162 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2163 ; SSE2-NEXT: psrlw $8, %xmm3
2164 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2165 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2166 ; SSE2-NEXT: psrlw $8, %xmm2
2167 ; SSE2-NEXT: packuswb %xmm3, %xmm2
2168 ; SSE2-NEXT: paddb %xmm0, %xmm2
2169 ; SSE2-NEXT: movdqa %xmm2, %xmm1
2170 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2171 ; SSE2-NEXT: psraw $8, %xmm1
2172 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2173 ; SSE2-NEXT: psrlw $8, %xmm1
2174 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2175 ; SSE2-NEXT: psraw $8, %xmm2
2176 ; SSE2-NEXT: psllw $7, %xmm2
2177 ; SSE2-NEXT: psrlw $8, %xmm2
2178 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2179 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2180 ; SSE2-NEXT: pand %xmm1, %xmm2
2181 ; SSE2-NEXT: pandn %xmm0, %xmm1
2182 ; SSE2-NEXT: por %xmm2, %xmm1
2183 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2184 ; SSE2-NEXT: pxor %xmm0, %xmm1
2185 ; SSE2-NEXT: psubb %xmm0, %xmm1
2186 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2189 ; SSE41-LABEL: non_splat_minus_one_divisor_1:
2191 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2192 ; SSE41-NEXT: pxor %xmm0, %xmm0
2193 ; SSE41-NEXT: pxor %xmm3, %xmm3
2194 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
2195 ; SSE41-NEXT: pxor %xmm4, %xmm4
2196 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2197 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2198 ; SSE41-NEXT: psllw $1, %xmm2
2199 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
2200 ; SSE41-NEXT: psrlw $8, %xmm2
2201 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2202 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2203 ; SSE41-NEXT: psrlw $8, %xmm3
2204 ; SSE41-NEXT: packuswb %xmm3, %xmm2
2205 ; SSE41-NEXT: paddb %xmm1, %xmm2
2206 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2207 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2208 ; SSE41-NEXT: psraw $8, %xmm0
2209 ; SSE41-NEXT: movdqa %xmm0, %xmm3
2210 ; SSE41-NEXT: psllw $1, %xmm3
2211 ; SSE41-NEXT: psllw $7, %xmm0
2212 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
2213 ; SSE41-NEXT: psrlw $8, %xmm0
2214 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2215 ; SSE41-NEXT: psraw $8, %xmm2
2216 ; SSE41-NEXT: psllw $7, %xmm2
2217 ; SSE41-NEXT: psrlw $8, %xmm2
2218 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2219 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2220 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
2221 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2222 ; SSE41-NEXT: pxor %xmm0, %xmm1
2223 ; SSE41-NEXT: psubb %xmm0, %xmm1
2224 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2227 ; AVX1-LABEL: non_splat_minus_one_divisor_1:
2229 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2230 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2231 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2232 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2233 ; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
2234 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2235 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2236 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2237 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2238 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2239 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
2240 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2241 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2242 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
2243 ; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3
2244 ; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
2245 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
2246 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2247 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2248 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
2249 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
2250 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2251 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2252 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2253 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2254 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2255 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2256 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2259 ; AVX2-LABEL: non_splat_minus_one_divisor_1:
2261 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2262 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
2263 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2264 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2265 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2266 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2267 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2268 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2269 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
2270 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2271 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2272 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2273 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2274 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2275 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2276 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2277 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
2278 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2279 ; AVX2-NEXT: vzeroupper
2282 ; AVX512F-LABEL: non_splat_minus_one_divisor_1:
2284 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2285 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
2286 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2287 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2288 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
2289 ; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2290 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
2291 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2292 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
2293 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2294 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2295 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2296 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
2297 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2298 ; AVX512F-NEXT: vzeroupper
2299 ; AVX512F-NEXT: retq
2301 ; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
2302 ; AVX512BW: # %bb.0:
2303 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2304 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2305 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2306 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2307 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2308 ; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2309 ; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2
2310 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2311 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2312 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2313 ; AVX512BW-NEXT: kmovd %eax, %k1
2314 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
2315 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0
2316 ; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44
2317 ; AVX512BW-NEXT: kmovd %eax, %k1
2318 ; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1}
2319 ; AVX512BW-NEXT: vzeroupper
2320 ; AVX512BW-NEXT: retq
2322 ; XOP-LABEL: non_splat_minus_one_divisor_1:
2324 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2325 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
2326 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2327 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
2328 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2329 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2330 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2331 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2332 ; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
2333 ; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2335 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
2336 ret <16 x i8> %div
2337 }
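; Divisors -1, 1, 2 and -2: only the divide-by-two lanes need the shift expansion; the result is blended with x for the +/-1 lanes, and the lanes with a negative divisor are negated.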
2339 define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
2340 ; SSE2-LABEL: non_splat_minus_one_divisor_2:
2342 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2343 ; SSE2-NEXT: psrld $31, %xmm1
2344 ; SSE2-NEXT: paddd %xmm0, %xmm1
2345 ; SSE2-NEXT: psrad $1, %xmm1
2346 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2347 ; SSE2-NEXT: pxor %xmm0, %xmm0
2348 ; SSE2-NEXT: psubd %xmm1, %xmm0
2349 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2350 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2353 ; SSE41-LABEL: non_splat_minus_one_divisor_2:
2355 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2356 ; SSE41-NEXT: psrld $31, %xmm1
2357 ; SSE41-NEXT: paddd %xmm0, %xmm1
2358 ; SSE41-NEXT: psrad $1, %xmm1
2359 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2360 ; SSE41-NEXT: pxor %xmm1, %xmm1
2361 ; SSE41-NEXT: psubd %xmm0, %xmm1
2362 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2365 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
2367 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
2368 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2369 ; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
2370 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2371 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2372 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2373 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2376 ; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
2377 ; AVX2ORLATER: # %bb.0:
2378 ; AVX2ORLATER-NEXT: vpsrld $31, %xmm0, %xmm1
2379 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2380 ; AVX2ORLATER-NEXT: vpsrad $1, %xmm1, %xmm1
2381 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2382 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2383 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2384 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
2385 ; AVX2ORLATER-NEXT: retq
2387 ; XOP-LABEL: non_splat_minus_one_divisor_2:
2389 ; XOP-NEXT: vpsrld $31, %xmm0, %xmm1
2390 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2391 ; XOP-NEXT: vpsrad $1, %xmm1, %xmm1
2392 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2393 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2394 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2395 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2397 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
2398 ret <4 x i32> %div
2399 }
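; Non-uniform, non-power-of-two divisors: the combine_vec_sdiv_nonuniform* tests below use the fixed-point multiply expansion (pmulhw by a per-lane magic constant) followed by per-lane arithmetic-shift and rounding fixups that depend on the divisor magnitudes and signs.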
2401 define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) {
2402 ; SSE-LABEL: combine_vec_sdiv_nonuniform:
2404 ; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2405 ; SSE-NEXT: movdqa %xmm0, %xmm1
2406 ; SSE-NEXT: psrlw $15, %xmm1
2407 ; SSE-NEXT: paddw %xmm1, %xmm0
2410 ; AVX-LABEL: combine_vec_sdiv_nonuniform:
2412 ; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2413 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
2414 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2416 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22>
2417 ret <8 x i16> %1
2418 }
2420 define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
2421 ; SSE2-LABEL: combine_vec_sdiv_nonuniform2:
2423 ; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2424 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2425 ; SSE2-NEXT: psraw $2, %xmm1
2426 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2427 ; SSE2-NEXT: psraw $1, %xmm2
2428 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
2429 ; SSE2-NEXT: psrlw $15, %xmm0
2430 ; SSE2-NEXT: paddw %xmm2, %xmm0
2433 ; SSE41-LABEL: combine_vec_sdiv_nonuniform2:
2435 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2436 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2437 ; SSE41-NEXT: psraw $1, %xmm1
2438 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2439 ; SSE41-NEXT: psraw $2, %xmm2
2440 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2441 ; SSE41-NEXT: psrlw $15, %xmm0
2442 ; SSE41-NEXT: paddw %xmm2, %xmm0
2445 ; AVX1-LABEL: combine_vec_sdiv_nonuniform2:
2447 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2448 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
2449 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
2450 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2451 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2452 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2455 ; AVX2-LABEL: combine_vec_sdiv_nonuniform2:
2457 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2458 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1
2459 ; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2
2460 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2461 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2462 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2465 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform2:
2467 ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2468 ; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1
2469 ; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2
2470 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2471 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2472 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2473 ; AVX512F-NEXT: retq
2475 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2:
2476 ; AVX512BW: # %bb.0:
2477 ; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2478 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2479 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2480 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2481 ; AVX512BW-NEXT: retq
2483 ; XOP-LABEL: combine_vec_sdiv_nonuniform2:
2485 ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2486 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2487 ; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2488 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2490 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25>
2491 ret <8 x i16> %1
2492 }
2494 define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
2495 ; SSE2-LABEL: combine_vec_sdiv_nonuniform3:
2497 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2498 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2499 ; SSE2-NEXT: paddw %xmm1, %xmm0
2500 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2501 ; SSE2-NEXT: psraw $4, %xmm1
2502 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2503 ; SSE2-NEXT: psraw $8, %xmm2
2504 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
2505 ; SSE2-NEXT: psrlw $15, %xmm0
2506 ; SSE2-NEXT: paddw %xmm2, %xmm0
2509 ; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
2511 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2512 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2513 ; SSE41-NEXT: paddw %xmm1, %xmm0
2514 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2515 ; SSE41-NEXT: psraw $8, %xmm1
2516 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2517 ; SSE41-NEXT: psraw $4, %xmm2
2518 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2519 ; SSE41-NEXT: psrlw $15, %xmm0
2520 ; SSE41-NEXT: paddw %xmm2, %xmm0
2523 ; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
2525 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2526 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2527 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2528 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2529 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2530 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2531 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2534 ; AVX2-LABEL: combine_vec_sdiv_nonuniform3:
2536 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2537 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2538 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2539 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2540 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2541 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2542 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2545 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform3:
2547 ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2548 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2549 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2550 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2551 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2552 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2553 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2554 ; AVX512F-NEXT: retq
2556 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3:
2557 ; AVX512BW: # %bb.0:
2558 ; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2559 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2560 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2561 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2562 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2563 ; AVX512BW-NEXT: retq
2565 ; XOP-LABEL: combine_vec_sdiv_nonuniform3:
2567 ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2568 ; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2569 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2570 ; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2571 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2573 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511>
2574 ret <8 x i16> %1
2575 }
2577 define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
2578 ; SSE2-LABEL: combine_vec_sdiv_nonuniform4:
2580 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2581 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2582 ; SSE2-NEXT: psubw %xmm0, %xmm1
2583 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2584 ; SSE2-NEXT: psraw $4, %xmm0
2585 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2586 ; SSE2-NEXT: psraw $8, %xmm2
2587 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2588 ; SSE2-NEXT: psrlw $15, %xmm1
2589 ; SSE2-NEXT: paddw %xmm2, %xmm1
2590 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2593 ; SSE41-LABEL: combine_vec_sdiv_nonuniform4:
2595 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2596 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2597 ; SSE41-NEXT: psubw %xmm0, %xmm1
2598 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2599 ; SSE41-NEXT: psraw $8, %xmm0
2600 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2601 ; SSE41-NEXT: psraw $4, %xmm2
2602 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2603 ; SSE41-NEXT: psrlw $15, %xmm1
2604 ; SSE41-NEXT: paddw %xmm2, %xmm1
2605 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2608 ; AVX1-LABEL: combine_vec_sdiv_nonuniform4:
2610 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2611 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2612 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2613 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2614 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2615 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2616 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2619 ; AVX2-LABEL: combine_vec_sdiv_nonuniform4:
2621 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2622 ; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2623 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2624 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2625 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2626 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2627 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2630 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform4:
2632 ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2633 ; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2634 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2635 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2636 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2637 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2638 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2639 ; AVX512F-NEXT: retq
2641 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4:
2642 ; AVX512BW: # %bb.0:
2643 ; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2644 ; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2645 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2646 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2647 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2648 ; AVX512BW-NEXT: retq
2650 ; XOP-LABEL: combine_vec_sdiv_nonuniform4:
2652 ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2653 ; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2654 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2655 ; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2656 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2658 %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510>
2659 ret <8 x i16> %1
2660 }
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $4, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,u,512,256>
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: paddw %xmm2, %xmm0
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
  %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
  ret <8 x i16> %1
}

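; divisors at and near the i16 boundaries (-32768, -1, 1, 32767): the checks
; below show that the -1/+1 lanes take no rounding adjustment (they are blended
; with zero before the final add).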
define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $6, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psraw $12, %xmm5
; SSE2-NEXT: pandn %xmm5, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psraw $1, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <4,256,256,u,u,512,256,8>
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; SSE41-NEXT: paddw %xmm1, %xmm0
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
  %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
  ret <8 x i16> %1
}

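; fold (sdiv x, <-1,-1,-1,-1,1,1,1,1>) -> blend of (0 - x) and x; no multiplies needed.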
define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT: retq
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
  %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %1
}

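; PR38658: only the final i8 lane has a non-trivial divisor (7); the other
; fifteen lanes divide by 1.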
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE41-LABEL: pr38658:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT: psraw $8, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $6, %xmm2
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: psrlw $7, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: paddb %xmm2, %xmm0
; AVX1-LABEL: pr38658:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-LABEL: pr38658:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX512F-LABEL: pr38658:
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-LABEL: pr38658:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; XOP-LABEL: pr38658:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
  %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
  ret <16 x i8> %1
}

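; sdiv of i1 values folds to the dividend: the only non-zero (well-defined)
; divisor is true (-1), and negation is a no-op on i1.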
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
  %r = sdiv i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
  %r = sdiv <4 x i1> %x, %y
  ret <4 x i1> %r
}

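; fold (sdiv x, 2) -> (sra (add x, (srl x, 31)), 1); the -2 case below adds a final negate.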
define i32 @combine_sdiv_two(i32 %x) {
; CHECK-LABEL: combine_sdiv_two:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
  %1 = sdiv i32 %x, 2
  ret i32 %1
}

define i32 @combine_sdiv_negtwo(i32 %x) {
; CHECK-LABEL: combine_sdiv_negtwo:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: negl %eax
  %1 = sdiv i32 %x, -2
  ret i32 %1
}

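; fold (sdiv i8 x, 16) -> (sra (add x, (srl (sra x, 7), 4)), 4); the negative
; power-of-two divisor below negates the result.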
define i8 @combine_i8_sdiv_pow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_pow2:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $4, %al
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $4, %al
  %1 = sdiv i8 %x, 16
  ret i8 %1
}

define i8 @combine_i8_sdiv_negpow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_negpow2:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $2, %al
; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $6, %al
; CHECK-NEXT: negb %al
  %1 = sdiv i8 %x, -64
  ret i8 %1
}

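; for i16 and wider, the rounding bias (divisor - 1) is selected with
; test+cmovns rather than shifts before the final arithmetic shift.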
define i16 @combine_i16_sdiv_pow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_pow2:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: shrl $4, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
  %1 = sdiv i16 %x, 16
  ret i16 %1
}

define i16 @combine_i16_sdiv_negpow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_negpow2:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
  %1 = sdiv i16 %x, -256
  ret i16 %1
}

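; the i32 versions use the same lea/test/cmovns/sar sequence, with a trailing
; negate for the negative divisor.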
define i32 @combine_i32_sdiv_pow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_pow2:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 15(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $4, %eax
  %1 = sdiv i32 %x, 16
  ret i32 %1
}

define i32 @combine_i32_sdiv_negpow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_negpow2:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal 255(%rdi), %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnsl %edi, %eax
; CHECK-NEXT: sarl $8, %eax
; CHECK-NEXT: negl %eax
  %1 = sdiv i32 %x, -256
  ret i32 %1
}

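; the i64 versions repeat the pattern with 64-bit registers (leaq/testq/cmovnsq/sarq).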
define i64 @combine_i64_sdiv_pow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_pow2:
; CHECK-NEXT: leaq 15(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $4, %rax
  %1 = sdiv i64 %x, 16
  ret i64 %1
}

define i64 @combine_i64_sdiv_negpow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_negpow2:
; CHECK-NEXT: leaq 255(%rdi), %rax
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rax
; CHECK-NEXT: sarq $8, %rax
; CHECK-NEXT: negq %rax
  %1 = sdiv i64 %x, -256
  ret i64 %1
}