; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP
; fold (sdiv x, 1) -> x
define i32 @combine_sdiv_by_one(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_one:
; CHECK-NEXT: movl %edi, %eax
define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_one:
  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
; fold (sdiv x, -1) -> 0 - x
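; Illustrative sketch (not part of the checked tests): division by -1 is
; rewritten as a negation, e.g. %d = sdiv i32 %x, -1 becomes %d = sub i32 0, %x,
; which is what the negl/psubd sequences below verify.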
define i32 @combine_sdiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_negone:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: negl %eax
define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX-LABEL: combine_vec_sdiv_by_negone:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
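; Illustrative sketch (not part of the checked tests): INT_MIN is the only i32
; value whose quotient by INT_MIN is non-zero (it is 1), so the division folds
; to a compare plus zero-extension, e.g.
;   %c = icmp eq i32 %x, -2147483648
;   %d = zext i1 %c to i32
; The cmpl/sete and pcmpeqd/psrld patterns below check that lowering.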
define i32 @combine_sdiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_minsigned:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
; CHECK-NEXT: sete %al
  %1 = sdiv i32 %x, -2147483648
define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_minsigned:
; SSE-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
; SSE-NEXT: psrld $31, %xmm0
; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
; AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512BW-NEXT: vpcmpeqd {{.*}}(%rip){1to4}, %xmm0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; XOP-LABEL: combine_vec_sdiv_by_minsigned:
; XOP-NEXT: vpcomeqd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrld $31, %xmm0, %xmm0
  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
; fold (sdiv 0, x) -> 0
define i32 @combine_sdiv_zero(i32 %x) {
; CHECK-LABEL: combine_sdiv_zero:
; CHECK-NEXT: xorl %eax, %eax
define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_zero:
; SSE-NEXT: xorps %xmm0, %xmm0
; AVX-LABEL: combine_vec_sdiv_zero:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
  %1 = sdiv <4 x i32> zeroinitializer, %x
; fold (sdiv x, x) -> 1
define i32 @combine_sdiv_dupe(i32 %x) {
; CHECK-LABEL: combine_sdiv_dupe:
; CHECK-NEXT: movl $1, %eax
define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_dupe:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; AVX1-LABEL: combine_vec_sdiv_dupe:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; AVX2ORLATER-NEXT: retq
; XOP-LABEL: combine_vec_sdiv_dupe:
; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
  %1 = sdiv <4 x i32> %x, %x
; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
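; Illustrative sketch (not part of the checked tests): once both operands are
; known non-negative (here via the preceding 'and' with 255), the sdiv can be
; treated as a udiv, and a udiv by a power of two is a plain logical shift,
; e.g. sdiv (and x, 255), 4 simplifies to lshr (and x, 255), 2, hence the
; pand/psrld pairs below.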
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrld $2, %xmm0
; AVX-LABEL: combine_vec_sdiv_by_pos0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pos1:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $3, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE41-LABEL: combine_vec_sdiv_by_pos1:
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: psrld $4, %xmm0
; SSE41-NEXT: psrld $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT: retq
; XOP-LABEL: combine_vec_sdiv_by_pos1:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
; fold (sdiv x, (1 << c)) -> x >>s c (with a rounding bias added for negative x)
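; Illustrative sketch (not part of the checked tests): for a power-of-two
; divisor 2^c the quotient is formed without a divide by biasing negative
; inputs before the arithmetic shift, e.g. for i32 and c == 2:
;   %sign = ashr i32 %x, 31        ; -1 if %x is negative, 0 otherwise
;   %bias = lshr i32 %sign, 30     ; 2^c - 1 = 3 only when %x is negative
;   %sum  = add i32 %x, %bias
;   %div  = ashr i32 %sum, 2
; This matches the psrad/psrld/paddd/psrad sequences checked below.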
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
  %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
290 define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
291 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
293 ; SSE2-NEXT: pxor %xmm1, %xmm1
294 ; SSE2-NEXT: pxor %xmm2, %xmm2
295 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
296 ; SSE2-NEXT: movdqa %xmm2, %xmm3
297 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
298 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4,2,16,8,32,64,2]
299 ; SSE2-NEXT: pmullw %xmm4, %xmm3
300 ; SSE2-NEXT: psrlw $8, %xmm3
301 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
302 ; SSE2-NEXT: pmullw %xmm4, %xmm2
303 ; SSE2-NEXT: psrlw $8, %xmm2
304 ; SSE2-NEXT: packuswb %xmm3, %xmm2
305 ; SSE2-NEXT: paddb %xmm0, %xmm2
306 ; SSE2-NEXT: movdqa %xmm2, %xmm1
307 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
308 ; SSE2-NEXT: psraw $8, %xmm1
309 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
310 ; SSE2-NEXT: pmullw %xmm3, %xmm1
311 ; SSE2-NEXT: psrlw $8, %xmm1
312 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
313 ; SSE2-NEXT: psraw $8, %xmm2
314 ; SSE2-NEXT: pmullw %xmm3, %xmm2
315 ; SSE2-NEXT: psrlw $8, %xmm2
316 ; SSE2-NEXT: packuswb %xmm1, %xmm2
317 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
318 ; SSE2-NEXT: pand %xmm1, %xmm2
319 ; SSE2-NEXT: pandn %xmm0, %xmm1
320 ; SSE2-NEXT: por %xmm2, %xmm1
321 ; SSE2-NEXT: movdqa %xmm1, %xmm0
324 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
326 ; SSE41-NEXT: movdqa %xmm0, %xmm1
327 ; SSE41-NEXT: pxor %xmm0, %xmm0
328 ; SSE41-NEXT: pxor %xmm3, %xmm3
329 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
330 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
331 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
332 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,4,2,16,8,32,64,2]
333 ; SSE41-NEXT: pmullw %xmm0, %xmm3
334 ; SSE41-NEXT: psrlw $8, %xmm3
335 ; SSE41-NEXT: pmullw %xmm0, %xmm2
336 ; SSE41-NEXT: psrlw $8, %xmm2
337 ; SSE41-NEXT: packuswb %xmm3, %xmm2
338 ; SSE41-NEXT: paddb %xmm1, %xmm2
339 ; SSE41-NEXT: movdqa %xmm2, %xmm0
340 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
341 ; SSE41-NEXT: psraw $8, %xmm0
342 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
343 ; SSE41-NEXT: pmullw %xmm3, %xmm0
344 ; SSE41-NEXT: psrlw $8, %xmm0
345 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
346 ; SSE41-NEXT: psraw $8, %xmm2
347 ; SSE41-NEXT: pmullw %xmm3, %xmm2
348 ; SSE41-NEXT: psrlw $8, %xmm2
349 ; SSE41-NEXT: packuswb %xmm0, %xmm2
350 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
351 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
352 ; SSE41-NEXT: movdqa %xmm1, %xmm0
355 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
357 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
358 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
359 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
360 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
361 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
362 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
363 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
364 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
365 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
366 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
367 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
368 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
369 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
370 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
371 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
372 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
373 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
374 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
375 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
376 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
377 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
378 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
379 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
382 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
384 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
385 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
386 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
387 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
388 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
389 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
390 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
391 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
392 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
393 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
394 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
395 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
396 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
397 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
398 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
399 ; AVX2-NEXT: vzeroupper
402 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
404 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
405 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
406 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
407 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
408 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
409 ; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
410 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
411 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
412 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
413 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
414 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
415 ; AVX512F-NEXT: vzeroupper
418 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
420 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
421 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
422 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
423 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
424 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
425 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1
426 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
427 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm1, %ymm1
428 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
429 ; AVX512BW-NEXT: movw $257, %ax # imm = 0x101
430 ; AVX512BW-NEXT: kmovd %eax, %k1
431 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
432 ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
433 ; AVX512BW-NEXT: vzeroupper
434 ; AVX512BW-NEXT: retq
436 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
438 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
439 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
440 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
441 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
442 ; XOP-NEXT: vpshab {{.*}}(%rip), %xmm1, %xmm1
443 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
444 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
446 %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
450 define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
451 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
453 ; SSE2-NEXT: movdqa %xmm0, %xmm1
454 ; SSE2-NEXT: psraw $15, %xmm1
455 ; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1
456 ; SSE2-NEXT: paddw %xmm0, %xmm1
457 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
458 ; SSE2-NEXT: movdqa %xmm1, %xmm3
459 ; SSE2-NEXT: pand %xmm2, %xmm3
460 ; SSE2-NEXT: psraw $4, %xmm1
461 ; SSE2-NEXT: pandn %xmm1, %xmm2
462 ; SSE2-NEXT: por %xmm3, %xmm2
463 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
464 ; SSE2-NEXT: movdqa %xmm2, %xmm3
465 ; SSE2-NEXT: pand %xmm1, %xmm3
466 ; SSE2-NEXT: psraw $2, %xmm2
467 ; SSE2-NEXT: pandn %xmm2, %xmm1
468 ; SSE2-NEXT: por %xmm3, %xmm1
469 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
470 ; SSE2-NEXT: movdqa %xmm1, %xmm3
471 ; SSE2-NEXT: pand %xmm2, %xmm3
472 ; SSE2-NEXT: psraw $1, %xmm1
473 ; SSE2-NEXT: pandn %xmm1, %xmm2
474 ; SSE2-NEXT: por %xmm3, %xmm2
475 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
476 ; SSE2-NEXT: pand %xmm1, %xmm2
477 ; SSE2-NEXT: pandn %xmm0, %xmm1
478 ; SSE2-NEXT: por %xmm2, %xmm1
479 ; SSE2-NEXT: movdqa %xmm1, %xmm0
482 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
484 ; SSE41-NEXT: movdqa %xmm0, %xmm1
485 ; SSE41-NEXT: psraw $15, %xmm1
486 ; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1
487 ; SSE41-NEXT: paddw %xmm0, %xmm1
488 ; SSE41-NEXT: movdqa %xmm1, %xmm2
489 ; SSE41-NEXT: psraw $1, %xmm2
490 ; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm1
491 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
492 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
493 ; SSE41-NEXT: movdqa %xmm1, %xmm0
496 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
498 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
499 ; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
500 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
501 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
502 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1
503 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
504 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
507 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
509 ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
510 ; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
511 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
512 ; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2
513 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1
514 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
515 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
518 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
520 ; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1
521 ; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
522 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1
523 ; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
524 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
525 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
526 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
527 ; AVX512F-NEXT: vzeroupper
530 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
532 ; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1
533 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
534 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1
535 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm1, %xmm1
536 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
537 ; AVX512BW-NEXT: retq
539 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
541 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm1
542 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm1
543 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1
544 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm1, %xmm1
545 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
547 %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
551 define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
552 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
554 ; SSE2-NEXT: movdqa %xmm0, %xmm3
555 ; SSE2-NEXT: psraw $15, %xmm0
556 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [1,4,2,16,8,32,64,2]
557 ; SSE2-NEXT: pmulhuw %xmm8, %xmm0
558 ; SSE2-NEXT: paddw %xmm3, %xmm0
559 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
560 ; SSE2-NEXT: movdqa %xmm0, %xmm2
561 ; SSE2-NEXT: pand %xmm4, %xmm2
562 ; SSE2-NEXT: psraw $4, %xmm0
563 ; SSE2-NEXT: movdqa %xmm4, %xmm6
564 ; SSE2-NEXT: pandn %xmm0, %xmm6
565 ; SSE2-NEXT: por %xmm2, %xmm6
566 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
567 ; SSE2-NEXT: movdqa %xmm6, %xmm0
568 ; SSE2-NEXT: pand %xmm5, %xmm0
569 ; SSE2-NEXT: psraw $2, %xmm6
570 ; SSE2-NEXT: movdqa %xmm5, %xmm2
571 ; SSE2-NEXT: pandn %xmm6, %xmm2
572 ; SSE2-NEXT: por %xmm0, %xmm2
573 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0]
574 ; SSE2-NEXT: movdqa %xmm2, %xmm0
575 ; SSE2-NEXT: pand %xmm7, %xmm0
576 ; SSE2-NEXT: psraw $1, %xmm2
577 ; SSE2-NEXT: movdqa %xmm7, %xmm6
578 ; SSE2-NEXT: pandn %xmm2, %xmm6
579 ; SSE2-NEXT: por %xmm0, %xmm6
580 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
581 ; SSE2-NEXT: pand %xmm2, %xmm6
582 ; SSE2-NEXT: movdqa %xmm2, %xmm0
583 ; SSE2-NEXT: pandn %xmm3, %xmm0
584 ; SSE2-NEXT: por %xmm6, %xmm0
585 ; SSE2-NEXT: movdqa %xmm1, %xmm3
586 ; SSE2-NEXT: psraw $15, %xmm3
587 ; SSE2-NEXT: pmulhuw %xmm8, %xmm3
588 ; SSE2-NEXT: paddw %xmm1, %xmm3
589 ; SSE2-NEXT: movdqa %xmm3, %xmm6
590 ; SSE2-NEXT: pand %xmm4, %xmm6
591 ; SSE2-NEXT: psraw $4, %xmm3
592 ; SSE2-NEXT: pandn %xmm3, %xmm4
593 ; SSE2-NEXT: por %xmm6, %xmm4
594 ; SSE2-NEXT: movdqa %xmm4, %xmm3
595 ; SSE2-NEXT: pand %xmm5, %xmm3
596 ; SSE2-NEXT: psraw $2, %xmm4
597 ; SSE2-NEXT: pandn %xmm4, %xmm5
598 ; SSE2-NEXT: por %xmm3, %xmm5
599 ; SSE2-NEXT: movdqa %xmm5, %xmm3
600 ; SSE2-NEXT: pand %xmm7, %xmm3
601 ; SSE2-NEXT: psraw $1, %xmm5
602 ; SSE2-NEXT: pandn %xmm5, %xmm7
603 ; SSE2-NEXT: por %xmm3, %xmm7
604 ; SSE2-NEXT: pand %xmm2, %xmm7
605 ; SSE2-NEXT: pandn %xmm1, %xmm2
606 ; SSE2-NEXT: por %xmm7, %xmm2
607 ; SSE2-NEXT: movdqa %xmm2, %xmm1
610 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
612 ; SSE41-NEXT: movdqa %xmm0, %xmm2
613 ; SSE41-NEXT: psraw $15, %xmm2
614 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,4,2,16,8,32,64,2]
615 ; SSE41-NEXT: pmulhuw %xmm4, %xmm2
616 ; SSE41-NEXT: paddw %xmm0, %xmm2
617 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768>
618 ; SSE41-NEXT: movdqa %xmm2, %xmm3
619 ; SSE41-NEXT: pmulhw %xmm5, %xmm3
620 ; SSE41-NEXT: psraw $1, %xmm2
621 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7]
622 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
623 ; SSE41-NEXT: movdqa %xmm1, %xmm3
624 ; SSE41-NEXT: psraw $15, %xmm3
625 ; SSE41-NEXT: pmulhuw %xmm4, %xmm3
626 ; SSE41-NEXT: paddw %xmm1, %xmm3
627 ; SSE41-NEXT: pmulhw %xmm3, %xmm5
628 ; SSE41-NEXT: psraw $1, %xmm3
629 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7]
630 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
631 ; SSE41-NEXT: movdqa %xmm2, %xmm0
632 ; SSE41-NEXT: movdqa %xmm3, %xmm1
635 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
637 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
638 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
639 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2]
640 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
641 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
642 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
643 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4
644 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
645 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
646 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
647 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4
648 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3
649 ; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3
650 ; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2
651 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
652 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
653 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
654 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
655 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
656 ; AVX1-NEXT: # ymm2 = mem[0,1,0,1]
657 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
658 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
659 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
662 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
664 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
665 ; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
666 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
667 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2
668 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm1, %ymm1
669 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
670 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
673 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
675 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
676 ; AVX512F-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
677 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
678 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
679 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
680 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
681 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
684 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
686 ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1
687 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
688 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1
689 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm1, %ymm1
690 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
691 ; AVX512BW-NEXT: retq
693 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
695 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
696 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2
697 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [65520,65522,65521,65524,65523,65525,65526,65521]
698 ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2
699 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1
700 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,65534,65535,65532,65533,65531,65530,65535]
701 ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1
702 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4
703 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3
704 ; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3
705 ; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2
706 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
707 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
708 ; XOP-NEXT: # ymm2 = mem[0,1,0,1]
709 ; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
711 %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
715 define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
716 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
718 ; SSE2-NEXT: movdqa %xmm1, %xmm8
719 ; SSE2-NEXT: movdqa %xmm0, %xmm1
720 ; SSE2-NEXT: psraw $15, %xmm0
721 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,4,2,16,8,32,64,2]
722 ; SSE2-NEXT: pmulhuw %xmm9, %xmm0
723 ; SSE2-NEXT: paddw %xmm1, %xmm0
724 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535]
725 ; SSE2-NEXT: movdqa %xmm0, %xmm4
726 ; SSE2-NEXT: pand %xmm11, %xmm4
727 ; SSE2-NEXT: psraw $4, %xmm0
728 ; SSE2-NEXT: movdqa %xmm11, %xmm5
729 ; SSE2-NEXT: pandn %xmm0, %xmm5
730 ; SSE2-NEXT: por %xmm4, %xmm5
731 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
732 ; SSE2-NEXT: movdqa %xmm5, %xmm0
733 ; SSE2-NEXT: pand %xmm7, %xmm0
734 ; SSE2-NEXT: psraw $2, %xmm5
735 ; SSE2-NEXT: movdqa %xmm7, %xmm4
736 ; SSE2-NEXT: pandn %xmm5, %xmm4
737 ; SSE2-NEXT: por %xmm0, %xmm4
738 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0]
739 ; SSE2-NEXT: movdqa %xmm4, %xmm0
740 ; SSE2-NEXT: pand %xmm10, %xmm0
741 ; SSE2-NEXT: psraw $1, %xmm4
742 ; SSE2-NEXT: movdqa %xmm10, %xmm5
743 ; SSE2-NEXT: pandn %xmm4, %xmm5
744 ; SSE2-NEXT: por %xmm0, %xmm5
745 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535]
746 ; SSE2-NEXT: pand %xmm12, %xmm5
747 ; SSE2-NEXT: movdqa %xmm12, %xmm0
748 ; SSE2-NEXT: pandn %xmm1, %xmm0
749 ; SSE2-NEXT: por %xmm5, %xmm0
750 ; SSE2-NEXT: movdqa %xmm8, %xmm1
751 ; SSE2-NEXT: psraw $15, %xmm1
752 ; SSE2-NEXT: pmulhuw %xmm9, %xmm1
753 ; SSE2-NEXT: paddw %xmm8, %xmm1
754 ; SSE2-NEXT: movdqa %xmm1, %xmm5
755 ; SSE2-NEXT: pand %xmm11, %xmm5
756 ; SSE2-NEXT: psraw $4, %xmm1
757 ; SSE2-NEXT: movdqa %xmm11, %xmm6
758 ; SSE2-NEXT: pandn %xmm1, %xmm6
759 ; SSE2-NEXT: por %xmm5, %xmm6
760 ; SSE2-NEXT: movdqa %xmm6, %xmm1
761 ; SSE2-NEXT: pand %xmm7, %xmm1
762 ; SSE2-NEXT: psraw $2, %xmm6
763 ; SSE2-NEXT: movdqa %xmm7, %xmm5
764 ; SSE2-NEXT: pandn %xmm6, %xmm5
765 ; SSE2-NEXT: por %xmm1, %xmm5
766 ; SSE2-NEXT: movdqa %xmm5, %xmm1
767 ; SSE2-NEXT: pand %xmm10, %xmm1
768 ; SSE2-NEXT: psraw $1, %xmm5
769 ; SSE2-NEXT: movdqa %xmm10, %xmm6
770 ; SSE2-NEXT: pandn %xmm5, %xmm6
771 ; SSE2-NEXT: por %xmm1, %xmm6
772 ; SSE2-NEXT: pand %xmm12, %xmm6
773 ; SSE2-NEXT: movdqa %xmm12, %xmm1
774 ; SSE2-NEXT: pandn %xmm8, %xmm1
775 ; SSE2-NEXT: por %xmm6, %xmm1
776 ; SSE2-NEXT: movdqa %xmm2, %xmm5
777 ; SSE2-NEXT: psraw $15, %xmm5
778 ; SSE2-NEXT: pmulhuw %xmm9, %xmm5
779 ; SSE2-NEXT: paddw %xmm2, %xmm5
780 ; SSE2-NEXT: movdqa %xmm5, %xmm6
781 ; SSE2-NEXT: pand %xmm11, %xmm6
782 ; SSE2-NEXT: psraw $4, %xmm5
783 ; SSE2-NEXT: movdqa %xmm11, %xmm4
784 ; SSE2-NEXT: pandn %xmm5, %xmm4
785 ; SSE2-NEXT: por %xmm6, %xmm4
786 ; SSE2-NEXT: movdqa %xmm4, %xmm5
787 ; SSE2-NEXT: pand %xmm7, %xmm5
788 ; SSE2-NEXT: psraw $2, %xmm4
789 ; SSE2-NEXT: movdqa %xmm7, %xmm6
790 ; SSE2-NEXT: pandn %xmm4, %xmm6
791 ; SSE2-NEXT: por %xmm5, %xmm6
792 ; SSE2-NEXT: movdqa %xmm6, %xmm4
793 ; SSE2-NEXT: pand %xmm10, %xmm4
794 ; SSE2-NEXT: psraw $1, %xmm6
795 ; SSE2-NEXT: movdqa %xmm10, %xmm5
796 ; SSE2-NEXT: pandn %xmm6, %xmm5
797 ; SSE2-NEXT: por %xmm4, %xmm5
798 ; SSE2-NEXT: pand %xmm12, %xmm5
799 ; SSE2-NEXT: movdqa %xmm12, %xmm8
800 ; SSE2-NEXT: pandn %xmm2, %xmm8
801 ; SSE2-NEXT: por %xmm5, %xmm8
802 ; SSE2-NEXT: movdqa %xmm3, %xmm2
803 ; SSE2-NEXT: psraw $15, %xmm2
804 ; SSE2-NEXT: pmulhuw %xmm9, %xmm2
805 ; SSE2-NEXT: paddw %xmm3, %xmm2
806 ; SSE2-NEXT: movdqa %xmm2, %xmm4
807 ; SSE2-NEXT: pand %xmm11, %xmm4
808 ; SSE2-NEXT: psraw $4, %xmm2
809 ; SSE2-NEXT: pandn %xmm2, %xmm11
810 ; SSE2-NEXT: por %xmm4, %xmm11
811 ; SSE2-NEXT: movdqa %xmm11, %xmm2
812 ; SSE2-NEXT: pand %xmm7, %xmm2
813 ; SSE2-NEXT: psraw $2, %xmm11
814 ; SSE2-NEXT: pandn %xmm11, %xmm7
815 ; SSE2-NEXT: por %xmm2, %xmm7
816 ; SSE2-NEXT: movdqa %xmm7, %xmm2
817 ; SSE2-NEXT: pand %xmm10, %xmm2
818 ; SSE2-NEXT: psraw $1, %xmm7
819 ; SSE2-NEXT: pandn %xmm7, %xmm10
820 ; SSE2-NEXT: por %xmm2, %xmm10
821 ; SSE2-NEXT: pand %xmm12, %xmm10
822 ; SSE2-NEXT: pandn %xmm3, %xmm12
823 ; SSE2-NEXT: por %xmm10, %xmm12
824 ; SSE2-NEXT: movdqa %xmm8, %xmm2
825 ; SSE2-NEXT: movdqa %xmm12, %xmm3
828 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
830 ; SSE41-NEXT: movdqa %xmm1, %xmm4
831 ; SSE41-NEXT: movdqa %xmm0, %xmm1
832 ; SSE41-NEXT: psraw $15, %xmm0
833 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [1,4,2,16,8,32,64,2]
834 ; SSE41-NEXT: pmulhuw %xmm7, %xmm0
835 ; SSE41-NEXT: paddw %xmm1, %xmm0
836 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768>
837 ; SSE41-NEXT: movdqa %xmm0, %xmm5
838 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
839 ; SSE41-NEXT: psraw $1, %xmm0
840 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7]
841 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
842 ; SSE41-NEXT: movdqa %xmm4, %xmm1
843 ; SSE41-NEXT: psraw $15, %xmm1
844 ; SSE41-NEXT: pmulhuw %xmm7, %xmm1
845 ; SSE41-NEXT: paddw %xmm4, %xmm1
846 ; SSE41-NEXT: movdqa %xmm1, %xmm5
847 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
848 ; SSE41-NEXT: psraw $1, %xmm1
849 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
850 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
851 ; SSE41-NEXT: movdqa %xmm2, %xmm4
852 ; SSE41-NEXT: psraw $15, %xmm4
853 ; SSE41-NEXT: pmulhuw %xmm7, %xmm4
854 ; SSE41-NEXT: paddw %xmm2, %xmm4
855 ; SSE41-NEXT: movdqa %xmm4, %xmm5
856 ; SSE41-NEXT: pmulhw %xmm6, %xmm5
857 ; SSE41-NEXT: psraw $1, %xmm4
858 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
859 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
860 ; SSE41-NEXT: movdqa %xmm3, %xmm5
861 ; SSE41-NEXT: psraw $15, %xmm5
862 ; SSE41-NEXT: pmulhuw %xmm7, %xmm5
863 ; SSE41-NEXT: paddw %xmm3, %xmm5
864 ; SSE41-NEXT: pmulhw %xmm5, %xmm6
865 ; SSE41-NEXT: psraw $1, %xmm5
866 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
867 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
868 ; SSE41-NEXT: movdqa %xmm4, %xmm2
869 ; SSE41-NEXT: movdqa %xmm5, %xmm3
872 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
874 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
875 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
876 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,4,2,16,8,32,64,2]
877 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3
878 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
879 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
880 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5
881 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3,4,5,6,7]
882 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
883 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
884 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5
885 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5
886 ; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5
887 ; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6
888 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1,2,3,4,5,6,7]
889 ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5
890 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
891 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
892 ; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
893 ; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
894 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
895 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
896 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
897 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
898 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6
899 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6
900 ; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2
901 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm6
902 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3,4,5,6,7]
903 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
904 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7]
905 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6
906 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4
907 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4
908 ; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
909 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
910 ; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4
911 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
912 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
913 ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
914 ; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1
915 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
918 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
920 ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2
921 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2]
922 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
923 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
924 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2
925 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768]
926 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
927 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5
928 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
929 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15]
930 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
931 ; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2
932 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
933 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2
934 ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3
935 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2
936 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
937 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
940 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
942 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
943 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,4,2,16,8,32,64,2,1,4,2,16,8,32,64,2]
944 ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
945 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
946 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
947 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
948 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
949 ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
950 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
951 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
952 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
953 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2
954 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
955 ; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2
956 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
957 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
958 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
959 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
962 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
964 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1
965 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
966 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
967 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm1, %zmm1
968 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
969 ; AVX512BW-NEXT: kmovd %eax, %k1
970 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
971 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
972 ; AVX512BW-NEXT: retq
974 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
976 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
977 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3
978 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [65520,65522,65521,65524,65523,65525,65526,65521]
979 ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3
980 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2
981 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,65534,65535,65532,65533,65531,65530,65535]
982 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
983 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5
984 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5
985 ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5
986 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5
987 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
988 ; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
989 ; XOP-NEXT: # ymm5 = mem[0,1,0,1]
990 ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0
991 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
992 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6
993 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6
994 ; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2
995 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
996 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm6
997 ; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4
998 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4
999 ; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3
1000 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1001 ; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1
1003 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
1007 define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
1008 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1010 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1011 ; SSE2-NEXT: psrad $31, %xmm1
1012 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1013 ; SSE2-NEXT: psrld $28, %xmm2
1014 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1015 ; SSE2-NEXT: psrld $29, %xmm3
1016 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1017 ; SSE2-NEXT: psrld $30, %xmm1
1018 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1019 ; SSE2-NEXT: paddd %xmm0, %xmm1
1020 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1021 ; SSE2-NEXT: psrad $4, %xmm2
1022 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1023 ; SSE2-NEXT: psrad $3, %xmm3
1024 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1025 ; SSE2-NEXT: psrad $2, %xmm1
1026 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1027 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1028 ; SSE2-NEXT: movaps %xmm1, %xmm0
1031 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1033 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1034 ; SSE41-NEXT: psrad $31, %xmm1
1035 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1036 ; SSE41-NEXT: psrld $28, %xmm2
1037 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1038 ; SSE41-NEXT: psrld $30, %xmm3
1039 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1040 ; SSE41-NEXT: psrld $29, %xmm1
1041 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1042 ; SSE41-NEXT: paddd %xmm0, %xmm1
1043 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1044 ; SSE41-NEXT: psrad $4, %xmm2
1045 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1046 ; SSE41-NEXT: psrad $2, %xmm3
1047 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1048 ; SSE41-NEXT: psrad $3, %xmm1
1049 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1050 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1051 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1054 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1056 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
1057 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1058 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
1059 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1060 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
1061 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1062 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1063 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1064 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1065 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1066 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
1067 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1068 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1071 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1072 ; AVX2ORLATER: # %bb.0:
1073 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
1074 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
1075 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1076 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
1077 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1078 ; AVX2ORLATER-NEXT: retq
1080 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1082 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
1083 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
1084 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1085 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
1086 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1088 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
1092 define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
1093 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1095 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1096 ; SSE2-NEXT: psrad $31, %xmm0
1097 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1098 ; SSE2-NEXT: psrld $28, %xmm3
1099 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1100 ; SSE2-NEXT: psrld $29, %xmm4
1101 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1102 ; SSE2-NEXT: psrld $30, %xmm0
1103 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1104 ; SSE2-NEXT: paddd %xmm2, %xmm0
1105 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1106 ; SSE2-NEXT: psrad $4, %xmm3
1107 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1108 ; SSE2-NEXT: psrad $3, %xmm4
1109 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1110 ; SSE2-NEXT: psrad $2, %xmm0
1111 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1112 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1113 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1114 ; SSE2-NEXT: psrad $31, %xmm2
1115 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1116 ; SSE2-NEXT: psrld $28, %xmm3
1117 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1118 ; SSE2-NEXT: psrld $29, %xmm4
1119 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1120 ; SSE2-NEXT: psrld $30, %xmm2
1121 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1122 ; SSE2-NEXT: paddd %xmm1, %xmm2
1123 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1124 ; SSE2-NEXT: psrad $4, %xmm3
1125 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1126 ; SSE2-NEXT: psrad $3, %xmm4
1127 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1128 ; SSE2-NEXT: psrad $2, %xmm2
1129 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1130 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1131 ; SSE2-NEXT: movaps %xmm2, %xmm1
1134 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1136 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1137 ; SSE41-NEXT: psrad $31, %xmm0
1138 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1139 ; SSE41-NEXT: psrld $28, %xmm3
1140 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1141 ; SSE41-NEXT: psrld $30, %xmm4
1142 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1143 ; SSE41-NEXT: psrld $29, %xmm0
1144 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1145 ; SSE41-NEXT: paddd %xmm2, %xmm0
1146 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1147 ; SSE41-NEXT: psrad $4, %xmm3
1148 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1149 ; SSE41-NEXT: psrad $2, %xmm4
1150 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1151 ; SSE41-NEXT: psrad $3, %xmm0
1152 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1153 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
1154 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1155 ; SSE41-NEXT: psrad $31, %xmm2
1156 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1157 ; SSE41-NEXT: psrld $28, %xmm3
1158 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1159 ; SSE41-NEXT: psrld $30, %xmm4
1160 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1161 ; SSE41-NEXT: psrld $29, %xmm2
1162 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1163 ; SSE41-NEXT: paddd %xmm1, %xmm2
1164 ; SSE41-NEXT: movdqa %xmm2, %xmm3
1165 ; SSE41-NEXT: psrad $4, %xmm3
1166 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1167 ; SSE41-NEXT: psrad $2, %xmm4
1168 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1169 ; SSE41-NEXT: psrad $3, %xmm2
1170 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1171 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
1172 ; SSE41-NEXT: movdqa %xmm2, %xmm1
1175 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1177 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1178 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
1179 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1180 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1181 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1182 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1183 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1184 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1185 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
1186 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
1187 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1188 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
1189 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1190 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1191 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
1192 ; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
1193 ; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
1194 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1195 ; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
1196 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1197 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
1198 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1199 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1200 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1201 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2
1202 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1203 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1204 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1207 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1208 ; AVX2ORLATER: # %bb.0:
1209 ; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1
1210 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
1211 ; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1
1212 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %ymm1, %ymm1
1213 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1214 ; AVX2ORLATER-NEXT: retq
1216 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1218 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
1219 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2
1220 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967264,4294967266,4294967267,4294967268]
1221 ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2
1222 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1223 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292]
1224 ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1
1225 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4
1226 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3
1227 ; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1228 ; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2
1229 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1230 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1232 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1236 define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
1237 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1239 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1240 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1241 ; SSE2-NEXT: psrad $31, %xmm0
1242 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1243 ; SSE2-NEXT: psrld $28, %xmm5
1244 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1245 ; SSE2-NEXT: psrld $29, %xmm6
1246 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1247 ; SSE2-NEXT: psrld $30, %xmm0
1248 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1249 ; SSE2-NEXT: paddd %xmm1, %xmm0
1250 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1251 ; SSE2-NEXT: psrad $4, %xmm5
1252 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1253 ; SSE2-NEXT: psrad $3, %xmm6
1254 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1255 ; SSE2-NEXT: psrad $2, %xmm0
1256 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1257 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1258 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1259 ; SSE2-NEXT: psrad $31, %xmm1
1260 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1261 ; SSE2-NEXT: psrld $28, %xmm5
1262 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1263 ; SSE2-NEXT: psrld $29, %xmm6
1264 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1265 ; SSE2-NEXT: psrld $30, %xmm1
1266 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1267 ; SSE2-NEXT: paddd %xmm4, %xmm1
1268 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1269 ; SSE2-NEXT: psrad $4, %xmm5
1270 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1271 ; SSE2-NEXT: psrad $3, %xmm6
1272 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1273 ; SSE2-NEXT: psrad $2, %xmm1
1274 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1275 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
1276 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1277 ; SSE2-NEXT: psrad $31, %xmm4
1278 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1279 ; SSE2-NEXT: psrld $28, %xmm5
1280 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1281 ; SSE2-NEXT: psrld $29, %xmm6
1282 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1283 ; SSE2-NEXT: psrld $30, %xmm4
1284 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1285 ; SSE2-NEXT: paddd %xmm2, %xmm4
1286 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1287 ; SSE2-NEXT: psrad $4, %xmm5
1288 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1289 ; SSE2-NEXT: psrad $3, %xmm6
1290 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1291 ; SSE2-NEXT: psrad $2, %xmm4
1292 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1293 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1294 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1295 ; SSE2-NEXT: psrad $31, %xmm5
1296 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1297 ; SSE2-NEXT: psrld $28, %xmm2
1298 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1299 ; SSE2-NEXT: psrld $29, %xmm6
1300 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1301 ; SSE2-NEXT: psrld $30, %xmm5
1302 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1303 ; SSE2-NEXT: paddd %xmm3, %xmm5
1304 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1305 ; SSE2-NEXT: psrad $4, %xmm2
1306 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1307 ; SSE2-NEXT: psrad $3, %xmm6
1308 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1309 ; SSE2-NEXT: psrad $2, %xmm5
1310 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1311 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
1312 ; SSE2-NEXT: movaps %xmm4, %xmm2
1313 ; SSE2-NEXT: movaps %xmm5, %xmm3
1316 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1318 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1319 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1320 ; SSE41-NEXT: psrad $31, %xmm0
1321 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1322 ; SSE41-NEXT: psrld $28, %xmm5
1323 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1324 ; SSE41-NEXT: psrld $30, %xmm6
1325 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1326 ; SSE41-NEXT: psrld $29, %xmm0
1327 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1328 ; SSE41-NEXT: paddd %xmm1, %xmm0
1329 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1330 ; SSE41-NEXT: psrad $4, %xmm5
1331 ; SSE41-NEXT: movdqa %xmm0, %xmm6
1332 ; SSE41-NEXT: psrad $2, %xmm6
1333 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1334 ; SSE41-NEXT: psrad $3, %xmm0
1335 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1336 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1337 ; SSE41-NEXT: movdqa %xmm4, %xmm1
1338 ; SSE41-NEXT: psrad $31, %xmm1
1339 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1340 ; SSE41-NEXT: psrld $28, %xmm5
1341 ; SSE41-NEXT: movdqa %xmm1, %xmm6
1342 ; SSE41-NEXT: psrld $30, %xmm6
1343 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1344 ; SSE41-NEXT: psrld $29, %xmm1
1345 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1346 ; SSE41-NEXT: paddd %xmm4, %xmm1
1347 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1348 ; SSE41-NEXT: psrad $4, %xmm5
1349 ; SSE41-NEXT: movdqa %xmm1, %xmm6
1350 ; SSE41-NEXT: psrad $2, %xmm6
1351 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1352 ; SSE41-NEXT: psrad $3, %xmm1
1353 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1354 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
1355 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1356 ; SSE41-NEXT: psrad $31, %xmm4
1357 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1358 ; SSE41-NEXT: psrld $28, %xmm5
1359 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1360 ; SSE41-NEXT: psrld $30, %xmm6
1361 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1362 ; SSE41-NEXT: psrld $29, %xmm4
1363 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1364 ; SSE41-NEXT: paddd %xmm2, %xmm4
1365 ; SSE41-NEXT: movdqa %xmm4, %xmm5
1366 ; SSE41-NEXT: psrad $4, %xmm5
1367 ; SSE41-NEXT: movdqa %xmm4, %xmm6
1368 ; SSE41-NEXT: psrad $2, %xmm6
1369 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1370 ; SSE41-NEXT: psrad $3, %xmm4
1371 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1372 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1373 ; SSE41-NEXT: movdqa %xmm3, %xmm5
1374 ; SSE41-NEXT: psrad $31, %xmm5
1375 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1376 ; SSE41-NEXT: psrld $28, %xmm2
1377 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1378 ; SSE41-NEXT: psrld $30, %xmm6
1379 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1380 ; SSE41-NEXT: psrld $29, %xmm5
1381 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1382 ; SSE41-NEXT: paddd %xmm3, %xmm5
1383 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1384 ; SSE41-NEXT: psrad $4, %xmm2
1385 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1386 ; SSE41-NEXT: psrad $2, %xmm6
1387 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1388 ; SSE41-NEXT: psrad $3, %xmm5
1389 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1390 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
1391 ; SSE41-NEXT: movdqa %xmm4, %xmm2
1392 ; SSE41-NEXT: movdqa %xmm5, %xmm3
1395 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1397 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1398 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1399 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1400 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1401 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1402 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1403 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1404 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1405 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1406 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1407 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1408 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm4
1409 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1410 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1411 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
1412 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1413 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1414 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1415 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1416 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1417 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3
1418 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1419 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1420 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1421 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1422 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1423 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1424 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1425 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1426 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
1427 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1428 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1429 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1430 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1431 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1432 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1433 ; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
1434 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
1435 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1436 ; AVX1-NEXT: vpsrad $3, %xmm2, %xmm4
1437 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1438 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1439 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
1440 ; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
1441 ; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
1442 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1443 ; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
1444 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1445 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
1446 ; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
1447 ; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
1448 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1449 ; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3
1450 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1451 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1452 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1455 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1457 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
1458 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [32,30,29,28,32,30,29,28]
1459 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1460 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1461 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
1462 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
1463 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1464 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1465 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1466 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
1467 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2
1468 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2
1469 ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2
1470 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1473 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1475 ; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1
1476 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1477 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1478 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
1479 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
1480 ; AVX512F-NEXT: kmovw %eax, %k1
1481 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1482 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1483 ; AVX512F-NEXT: retq
1485 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1486 ; AVX512BW: # %bb.0:
1487 ; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1
1488 ; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1489 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1
1490 ; AVX512BW-NEXT: vpsravd {{.*}}(%rip), %zmm1, %zmm1
1491 ; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111
1492 ; AVX512BW-NEXT: kmovd %eax, %k1
1493 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
1494 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1495 ; AVX512BW-NEXT: retq
1497 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1499 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1500 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3
1501 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967264,4294967266,4294967267,4294967268]
1502 ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3
1503 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1504 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4294967294,4294967293,4294967292]
1505 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1506 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5
1507 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1508 ; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5
1509 ; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5
1510 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
1511 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1512 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1513 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm5
1514 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
1515 ; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1516 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
1517 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm5
1518 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4
1519 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4
1520 ; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3
1521 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1522 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1524 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
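; The shift/add/shift sequences checked above are the usual round-toward-zero
; expansion for a signed division by +/-2^k (a sketch, independent of any one RUN line):
;   bias = (x ashr 31) lshr (32 - k)   ; 2^k - 1 if x is negative, 0 otherwise
;   quot = (x + bias) ashr k
; e.g. x = -7, k = 2: bias = 3, (-7 + 3) ashr 2 = -1, which matches -7 sdiv 4.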
1528 define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
1529 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1531 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1532 ; SSE2-NEXT: psrad $31, %xmm1
1533 ; SSE2-NEXT: psrlq $62, %xmm1
1534 ; SSE2-NEXT: paddq %xmm0, %xmm1
1535 ; SSE2-NEXT: psrlq $2, %xmm1
1536 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
1537 ; SSE2-NEXT: pxor %xmm2, %xmm1
1538 ; SSE2-NEXT: psubq %xmm2, %xmm1
1539 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1540 ; SSE2-NEXT: movapd %xmm1, %xmm0
1543 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1545 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1546 ; SSE41-NEXT: psrad $31, %xmm1
1547 ; SSE41-NEXT: psrlq $62, %xmm1
1548 ; SSE41-NEXT: paddq %xmm0, %xmm1
1549 ; SSE41-NEXT: psrlq $2, %xmm1
1550 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
1551 ; SSE41-NEXT: pxor %xmm2, %xmm1
1552 ; SSE41-NEXT: psubq %xmm2, %xmm1
1553 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1554 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1557 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1559 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1560 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1561 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1
1562 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1563 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
1564 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
1565 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
1566 ; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1567 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1570 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1572 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1573 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
1574 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
1575 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1576 ; AVX2-NEXT: movl $2, %eax
1577 ; AVX2-NEXT: vmovq %rax, %xmm2
1578 ; AVX2-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1579 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm1
1580 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
1581 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
1582 ; AVX2-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1583 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1586 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1588 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1589 ; AVX512F-NEXT: movl $2, %eax
1590 ; AVX512F-NEXT: vmovq %rax, %xmm1
1591 ; AVX512F-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1592 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2
1593 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm2, %xmm2
1594 ; AVX512F-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1595 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
1596 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1597 ; AVX512F-NEXT: vzeroupper
1598 ; AVX512F-NEXT: retq
1600 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1601 ; AVX512BW: # %bb.0:
1602 ; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1
1603 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
1604 ; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1605 ; AVX512BW-NEXT: movl $2, %eax
1606 ; AVX512BW-NEXT: vmovq %rax, %xmm2
1607 ; AVX512BW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1608 ; AVX512BW-NEXT: vpsravq %xmm2, %xmm1, %xmm1
1609 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1610 ; AVX512BW-NEXT: retq
1612 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1614 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm1
1615 ; XOP-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
1616 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1
1617 ; XOP-NEXT: movq $-2, %rax
1618 ; XOP-NEXT: vmovq %rax, %xmm2
1619 ; XOP-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1620 ; XOP-NEXT: vpshaq %xmm2, %xmm1, %xmm1
1621 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1623 %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
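; For the i64 tests, targets without a 64-bit arithmetic right shift emulate
; "ashr k" as "lshr k" followed by an xor/sub against 2^(63-k): the
; [9223372036854775808,2305843009213693952] constant above is <2^63, 2^61> for
; shift amounts 0 and 2, and the xor/sub pair sign-extends the shifted value.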
1627 define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
1628 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1630 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1631 ; SSE2-NEXT: psrad $31, %xmm1
1632 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1633 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1634 ; SSE2-NEXT: psrlq $61, %xmm3
1635 ; SSE2-NEXT: psrlq $60, %xmm1
1636 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
1637 ; SSE2-NEXT: paddq %xmm2, %xmm1
1638 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1639 ; SSE2-NEXT: psrlq $3, %xmm2
1640 ; SSE2-NEXT: psrlq $4, %xmm1
1641 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1642 ; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
1643 ; SSE2-NEXT: xorpd %xmm2, %xmm1
1644 ; SSE2-NEXT: psubq %xmm2, %xmm1
1645 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1646 ; SSE2-NEXT: psrad $31, %xmm2
1647 ; SSE2-NEXT: psrlq $62, %xmm2
1648 ; SSE2-NEXT: paddq %xmm0, %xmm2
1649 ; SSE2-NEXT: psrlq $2, %xmm2
1650 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
1651 ; SSE2-NEXT: pxor %xmm3, %xmm2
1652 ; SSE2-NEXT: psubq %xmm3, %xmm2
1653 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
1654 ; SSE2-NEXT: movapd %xmm2, %xmm0
1657 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1659 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1660 ; SSE41-NEXT: psrad $31, %xmm1
1661 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1662 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1663 ; SSE41-NEXT: psrlq $60, %xmm3
1664 ; SSE41-NEXT: psrlq $61, %xmm1
1665 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1666 ; SSE41-NEXT: paddq %xmm2, %xmm1
1667 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1668 ; SSE41-NEXT: psrlq $4, %xmm2
1669 ; SSE41-NEXT: psrlq $3, %xmm1
1670 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1671 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
1672 ; SSE41-NEXT: pxor %xmm2, %xmm1
1673 ; SSE41-NEXT: psubq %xmm2, %xmm1
1674 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1675 ; SSE41-NEXT: psrad $31, %xmm2
1676 ; SSE41-NEXT: psrlq $62, %xmm2
1677 ; SSE41-NEXT: paddq %xmm0, %xmm2
1678 ; SSE41-NEXT: psrlq $2, %xmm2
1679 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
1680 ; SSE41-NEXT: pxor %xmm3, %xmm2
1681 ; SSE41-NEXT: psubq %xmm3, %xmm2
1682 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1683 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1686 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1688 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1689 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1690 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
1691 ; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4
1692 ; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3
1693 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1694 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
1695 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3
1696 ; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1
1697 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1698 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
1699 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
1700 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
1701 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
1702 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1703 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1704 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm3
1705 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1706 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
1707 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
1708 ; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
1709 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1710 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1713 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1715 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1716 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
1717 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1718 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1719 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1720 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [9223372036854775808,2305843009213693952,1152921504606846976,576460752303423488]
1721 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
1722 ; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1
1723 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1726 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1728 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1729 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,4]
1730 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2
1731 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %ymm2, %ymm2
1732 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
1733 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
1734 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1735 ; AVX512F-NEXT: retq
1737 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1738 ; AVX512BW: # %bb.0:
1739 ; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1
1740 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1741 ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1742 ; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %ymm1, %ymm1
1743 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1744 ; AVX512BW-NEXT: retq
1746 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1748 ; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
1749 ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2
1750 ; XOP-NEXT: vpshlq {{.*}}(%rip), %xmm2, %xmm2
1751 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
1752 ; XOP-NEXT: movq $-2, %rax
1753 ; XOP-NEXT: vmovq %rax, %xmm3
1754 ; XOP-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
1755 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm2
1756 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
1757 ; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1
1758 ; XOP-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
1759 ; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
1760 ; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm1, %xmm1
1761 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1762 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1764 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
1768 define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
1769 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1771 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1772 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1773 ; SSE2-NEXT: psrad $31, %xmm1
1774 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1775 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1776 ; SSE2-NEXT: psrlq $61, %xmm5
1777 ; SSE2-NEXT: psrlq $60, %xmm1
1778 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
1779 ; SSE2-NEXT: paddq %xmm3, %xmm1
1780 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1781 ; SSE2-NEXT: psrlq $3, %xmm3
1782 ; SSE2-NEXT: psrlq $4, %xmm1
1783 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
1784 ; SSE2-NEXT: movapd {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
1785 ; SSE2-NEXT: xorpd %xmm5, %xmm1
1786 ; SSE2-NEXT: psubq %xmm5, %xmm1
1787 ; SSE2-NEXT: movdqa %xmm4, %xmm3
1788 ; SSE2-NEXT: psrad $31, %xmm3
1789 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1790 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1791 ; SSE2-NEXT: psrlq $61, %xmm6
1792 ; SSE2-NEXT: psrlq $60, %xmm3
1793 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1]
1794 ; SSE2-NEXT: paddq %xmm4, %xmm3
1795 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1796 ; SSE2-NEXT: psrlq $3, %xmm4
1797 ; SSE2-NEXT: psrlq $4, %xmm3
1798 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
1799 ; SSE2-NEXT: xorpd %xmm5, %xmm3
1800 ; SSE2-NEXT: psubq %xmm5, %xmm3
1801 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1802 ; SSE2-NEXT: psrad $31, %xmm4
1803 ; SSE2-NEXT: psrlq $62, %xmm4
1804 ; SSE2-NEXT: paddq %xmm0, %xmm4
1805 ; SSE2-NEXT: psrlq $2, %xmm4
1806 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
1807 ; SSE2-NEXT: pxor %xmm6, %xmm4
1808 ; SSE2-NEXT: psubq %xmm6, %xmm4
1809 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
1810 ; SSE2-NEXT: movdqa %xmm2, %xmm5
1811 ; SSE2-NEXT: psrad $31, %xmm5
1812 ; SSE2-NEXT: psrlq $62, %xmm5
1813 ; SSE2-NEXT: paddq %xmm2, %xmm5
1814 ; SSE2-NEXT: psrlq $2, %xmm5
1815 ; SSE2-NEXT: pxor %xmm6, %xmm5
1816 ; SSE2-NEXT: psubq %xmm6, %xmm5
1817 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
1818 ; SSE2-NEXT: movapd %xmm4, %xmm0
1819 ; SSE2-NEXT: movapd %xmm5, %xmm2
1822 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1824 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1825 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1826 ; SSE41-NEXT: psrad $31, %xmm1
1827 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1828 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1829 ; SSE41-NEXT: psrlq $60, %xmm5
1830 ; SSE41-NEXT: psrlq $61, %xmm1
1831 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
1832 ; SSE41-NEXT: paddq %xmm3, %xmm1
1833 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1834 ; SSE41-NEXT: psrlq $4, %xmm3
1835 ; SSE41-NEXT: psrlq $3, %xmm1
1836 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1837 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
1838 ; SSE41-NEXT: pxor %xmm5, %xmm1
1839 ; SSE41-NEXT: psubq %xmm5, %xmm1
1840 ; SSE41-NEXT: movdqa %xmm4, %xmm3
1841 ; SSE41-NEXT: psrad $31, %xmm3
1842 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1843 ; SSE41-NEXT: movdqa %xmm3, %xmm6
1844 ; SSE41-NEXT: psrlq $60, %xmm6
1845 ; SSE41-NEXT: psrlq $61, %xmm3
1846 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
1847 ; SSE41-NEXT: paddq %xmm4, %xmm3
1848 ; SSE41-NEXT: movdqa %xmm3, %xmm4
1849 ; SSE41-NEXT: psrlq $4, %xmm4
1850 ; SSE41-NEXT: psrlq $3, %xmm3
1851 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1852 ; SSE41-NEXT: pxor %xmm5, %xmm3
1853 ; SSE41-NEXT: psubq %xmm5, %xmm3
1854 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1855 ; SSE41-NEXT: psrad $31, %xmm4
1856 ; SSE41-NEXT: psrlq $62, %xmm4
1857 ; SSE41-NEXT: paddq %xmm0, %xmm4
1858 ; SSE41-NEXT: psrlq $2, %xmm4
1859 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
1860 ; SSE41-NEXT: pxor %xmm6, %xmm4
1861 ; SSE41-NEXT: psubq %xmm6, %xmm4
1862 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7]
1863 ; SSE41-NEXT: movdqa %xmm2, %xmm5
1864 ; SSE41-NEXT: psrad $31, %xmm5
1865 ; SSE41-NEXT: psrlq $62, %xmm5
1866 ; SSE41-NEXT: paddq %xmm2, %xmm5
1867 ; SSE41-NEXT: psrlq $2, %xmm5
1868 ; SSE41-NEXT: pxor %xmm6, %xmm5
1869 ; SSE41-NEXT: psubq %xmm6, %xmm5
1870 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
1871 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1872 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1875 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1877 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1878 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1879 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
1880 ; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5
1881 ; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4
1882 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1883 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
1884 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4
1885 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1886 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1887 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1888 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1889 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1890 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
1891 ; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5
1892 ; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5
1893 ; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm6
1894 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1895 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952]
1896 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
1897 ; AVX1-NEXT: vpsubq %xmm6, %xmm5, %xmm5
1898 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1899 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1900 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1901 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5
1902 ; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm7
1903 ; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5
1904 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
1905 ; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
1906 ; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5
1907 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3
1908 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1909 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1910 ; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
1911 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
1912 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
1913 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2
1914 ; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm4
1915 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1916 ; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
1917 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
1918 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1919 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1922 ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1924 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1925 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
1926 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [64,62,61,60]
1927 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3
1928 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3
1929 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,2,3,4]
1930 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3
1931 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [9223372036854775808,2305843009213693952,1152921504606846976,576460752303423488]
1932 ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3
1933 ; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3
1934 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1935 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
1936 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2
1937 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2
1938 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2
1939 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
1940 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1941 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1944 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1946 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
1947 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1948 ; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1949 ; AVX512F-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1
1950 ; AVX512F-NEXT: movb $17, %al
1951 ; AVX512F-NEXT: kmovw %eax, %k1
1952 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1953 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
1954 ; AVX512F-NEXT: retq
1956 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1957 ; AVX512BW: # %bb.0:
1958 ; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1
1959 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1960 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1
1961 ; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %zmm1, %zmm1
1962 ; AVX512BW-NEXT: movb $17, %al
1963 ; AVX512BW-NEXT: kmovd %eax, %k1
1964 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
1965 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1966 ; AVX512BW-NEXT: retq
1968 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1970 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
1971 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
1972 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4
1973 ; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [18446744073709551555,18446744073709551556]
1974 ; XOP-NEXT: vpshlq %xmm8, %xmm4, %xmm4
1975 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1976 ; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
1977 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1978 ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6
1979 ; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551552,18446744073709551554]
1980 ; XOP-NEXT: vpshlq %xmm7, %xmm6, %xmm6
1981 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6
1982 ; XOP-NEXT: movq $-2, %rax
1983 ; XOP-NEXT: vmovq %rax, %xmm5
1984 ; XOP-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7]
1985 ; XOP-NEXT: vpshaq %xmm5, %xmm6, %xmm6
1986 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
1987 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
1988 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
1989 ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6
1990 ; XOP-NEXT: vpshlq %xmm8, %xmm6, %xmm6
1991 ; XOP-NEXT: vpaddq %xmm6, %xmm2, %xmm2
1992 ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2
1993 ; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3
1994 ; XOP-NEXT: vpshlq %xmm7, %xmm3, %xmm3
1995 ; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3
1996 ; XOP-NEXT: vpshaq %xmm5, %xmm3, %xmm3
1997 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1998 ; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
2000 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
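; On AVX512 the divide-by-1 lanes are restored with a masked move: the
; "movb $17" mask above is 0b00010001, i.e. elements 0 and 4 of the <8 x i64>.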
2004 define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
2005 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2007 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2008 ; SSE2-NEXT: psrad $31, %xmm0
2009 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2010 ; SSE2-NEXT: psrld $28, %xmm2
2011 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2012 ; SSE2-NEXT: psrld $29, %xmm3
2013 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2014 ; SSE2-NEXT: psrld $30, %xmm0
2015 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
2016 ; SSE2-NEXT: paddd %xmm1, %xmm0
2017 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2018 ; SSE2-NEXT: psrad $4, %xmm2
2019 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2020 ; SSE2-NEXT: psrad $3, %xmm3
2021 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2022 ; SSE2-NEXT: psrad $2, %xmm0
2023 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
2024 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2025 ; SSE2-NEXT: pxor %xmm1, %xmm1
2026 ; SSE2-NEXT: psubd %xmm0, %xmm1
2027 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
2028 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
2029 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2032 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2034 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2035 ; SSE41-NEXT: psrad $31, %xmm1
2036 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2037 ; SSE41-NEXT: psrld $28, %xmm2
2038 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2039 ; SSE41-NEXT: psrld $30, %xmm3
2040 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2041 ; SSE41-NEXT: psrld $29, %xmm1
2042 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2043 ; SSE41-NEXT: paddd %xmm0, %xmm1
2044 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2045 ; SSE41-NEXT: psrad $4, %xmm2
2046 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2047 ; SSE41-NEXT: psrad $2, %xmm3
2048 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2049 ; SSE41-NEXT: psrad $3, %xmm1
2050 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2051 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2052 ; SSE41-NEXT: pxor %xmm0, %xmm0
2053 ; SSE41-NEXT: psubd %xmm1, %xmm0
2054 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
2055 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2058 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2060 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
2061 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2062 ; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
2063 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2064 ; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
2065 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2066 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2067 ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
2068 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
2069 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2070 ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
2071 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2072 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2073 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2074 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2075 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
2078 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2079 ; AVX2ORLATER: # %bb.0:
2080 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
2081 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2082 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2083 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
2084 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2085 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2086 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2087 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
2088 ; AVX2ORLATER-NEXT: retq
2090 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2092 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
2093 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2094 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2095 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
2096 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2097 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2098 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2099 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
2101 %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
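; Negative power-of-two divisors reuse the same expansion: the value is divided
; by |d|, and the lanes with a negative divisor are then negated (psubd/vpsubd
; from zero) and blended back in, as the checks above show.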
2105 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
2106 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
2109 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
2113 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
2114 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
2117 %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
2121 define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
2122 ; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
2125 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
2130 define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
2131 ; SSE2-LABEL: non_splat_minus_one_divisor_0:
2133 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2134 ; SSE2-NEXT: pxor %xmm2, %xmm2
2135 ; SSE2-NEXT: psubb %xmm0, %xmm2
2136 ; SSE2-NEXT: pand %xmm1, %xmm0
2137 ; SSE2-NEXT: pandn %xmm2, %xmm1
2138 ; SSE2-NEXT: por %xmm1, %xmm0
2141 ; SSE41-LABEL: non_splat_minus_one_divisor_0:
2143 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2144 ; SSE41-NEXT: pxor %xmm2, %xmm2
2145 ; SSE41-NEXT: psubb %xmm0, %xmm2
2146 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2147 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
2148 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2151 ; AVX1-LABEL: non_splat_minus_one_divisor_0:
2153 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2154 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2155 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2156 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2159 ; AVX2-LABEL: non_splat_minus_one_divisor_0:
2161 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2162 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2163 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2164 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2167 ; AVX512F-LABEL: non_splat_minus_one_divisor_0:
2169 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2170 ; AVX512F-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2171 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2172 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2173 ; AVX512F-NEXT: retq
2175 ; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
2176 ; AVX512BW: # %bb.0:
2177 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2178 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2179 ; AVX512BW-NEXT: kmovd %eax, %k1
2180 ; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1}
2181 ; AVX512BW-NEXT: retq
2183 ; XOP-LABEL: non_splat_minus_one_divisor_0:
2185 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2186 ; XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2187 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2188 ; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2190 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
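; With only +1 and -1 divisors, no real division is needed: the expansion checked
; above computes 0 - A once and selects per lane between A and -A.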
2194 define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
2195 ; SSE2-LABEL: non_splat_minus_one_divisor_1:
2197 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2198 ; SSE2-NEXT: pxor %xmm0, %xmm0
2199 ; SSE2-NEXT: pxor %xmm2, %xmm2
2200 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
2201 ; SSE2-NEXT: movdqa %xmm2, %xmm3
2202 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2203 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
2204 ; SSE2-NEXT: psrlw $8, %xmm3
2205 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2206 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
2207 ; SSE2-NEXT: psrlw $8, %xmm2
2208 ; SSE2-NEXT: packuswb %xmm3, %xmm2
2209 ; SSE2-NEXT: paddb %xmm1, %xmm2
2210 ; SSE2-NEXT: movdqa %xmm2, %xmm3
2211 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2212 ; SSE2-NEXT: psraw $8, %xmm3
2213 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
2214 ; SSE2-NEXT: psrlw $8, %xmm3
2215 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2216 ; SSE2-NEXT: psraw $8, %xmm2
2217 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
2218 ; SSE2-NEXT: psrlw $8, %xmm2
2219 ; SSE2-NEXT: packuswb %xmm3, %xmm2
2220 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2221 ; SSE2-NEXT: pand %xmm3, %xmm2
2222 ; SSE2-NEXT: pandn %xmm1, %xmm3
2223 ; SSE2-NEXT: por %xmm2, %xmm3
2224 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2225 ; SSE2-NEXT: psubb %xmm3, %xmm0
2226 ; SSE2-NEXT: pand %xmm1, %xmm0
2227 ; SSE2-NEXT: pandn %xmm3, %xmm1
2228 ; SSE2-NEXT: por %xmm1, %xmm0
2231 ; SSE41-LABEL: non_splat_minus_one_divisor_1:
2233 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2234 ; SSE41-NEXT: pxor %xmm2, %xmm2
2235 ; SSE41-NEXT: pxor %xmm3, %xmm3
2236 ; SSE41-NEXT: pcmpgtb %xmm0, %xmm3
2237 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2238 ; SSE41-NEXT: movdqa %xmm4, %xmm0
2239 ; SSE41-NEXT: psllw $1, %xmm0
2240 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4,5],xmm0[6],xmm4[7]
2241 ; SSE41-NEXT: psrlw $8, %xmm0
2242 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2243 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm3
2244 ; SSE41-NEXT: psrlw $8, %xmm3
2245 ; SSE41-NEXT: packuswb %xmm3, %xmm0
2246 ; SSE41-NEXT: paddb %xmm1, %xmm0
2247 ; SSE41-NEXT: movdqa %xmm0, %xmm3
2248 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2249 ; SSE41-NEXT: psraw $8, %xmm3
2250 ; SSE41-NEXT: movdqa %xmm3, %xmm4
2251 ; SSE41-NEXT: psllw $7, %xmm4
2252 ; SSE41-NEXT: psllw $8, %xmm3
2253 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2254 ; SSE41-NEXT: psrlw $8, %xmm3
2255 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2256 ; SSE41-NEXT: psraw $8, %xmm0
2257 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
2258 ; SSE41-NEXT: psrlw $8, %xmm0
2259 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2260 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2261 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
2262 ; SSE41-NEXT: psubb %xmm1, %xmm2
2263 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2264 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
2265 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2268 ; AVX1-LABEL: non_splat_minus_one_divisor_1:
2270 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2271 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2272 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2273 ; AVX1-NEXT: vpsllw $1, %xmm3, %xmm4
2274 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2275 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2276 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2277 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2278 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2279 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2280 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2281 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2282 ; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
2283 ; AVX1-NEXT: vpsllw $7, %xmm3, %xmm4
2284 ; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
2285 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2286 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2287 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2288 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
2289 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2290 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2291 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2292 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2293 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2294 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2295 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2296 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2299 ; AVX2-LABEL: non_splat_minus_one_divisor_1:
2301 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2302 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2303 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2304 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2305 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2306 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2307 ; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2308 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2309 ; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
2310 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2311 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2312 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2313 ; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2314 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2315 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2316 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2317 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2318 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2319 ; AVX2-NEXT: vzeroupper
2322 ; AVX512F-LABEL: non_splat_minus_one_divisor_1:
2324 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2325 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2326 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2327 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2328 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
2329 ; AVX512F-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2330 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
2331 ; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm2, %zmm2
2332 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
2333 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2334 ; AVX512F-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2335 ; AVX512F-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2336 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2337 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2338 ; AVX512F-NEXT: vzeroupper
2339 ; AVX512F-NEXT: retq
2341 ; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
2342 ; AVX512BW: # %bb.0:
2343 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2344 ; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2345 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2346 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2347 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2348 ; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2349 ; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2
2350 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm2, %ymm2
2351 ; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2
2352 ; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB
2353 ; AVX512BW-NEXT: kmovd %eax, %k1
2354 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
2355 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0
2356 ; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44
2357 ; AVX512BW-NEXT: kmovd %eax, %k1
2358 ; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1}
2359 ; AVX512BW-NEXT: vzeroupper
2360 ; AVX512BW-NEXT: retq
2362 ; XOP-LABEL: non_splat_minus_one_divisor_1:
2364 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2365 ; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
2366 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm2, %xmm2
2367 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm2
2368 ; XOP-NEXT: vpshab {{.*}}(%rip), %xmm2, %xmm2
2369 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2370 ; XOP-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
2371 ; XOP-NEXT: vpsubb %xmm0, %xmm1, %xmm1
2372 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2373 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2375 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
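; The i8 case has no native byte shifts, so, roughly speaking, the power-of-two
; lanes are widened to i16 (punpck/pmovzx/pmovsx), shifted there (the
; pmullw-by-constant forms above are per-lane shifts expressed as multiplies),
; narrowed back, and the -1/-128 lanes are fixed up with the negate-and-blend trick.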
2379 define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
2380 ; SSE2-LABEL: non_splat_minus_one_divisor_2:
2382 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2383 ; SSE2-NEXT: psrld $31, %xmm1
2384 ; SSE2-NEXT: paddd %xmm0, %xmm1
2385 ; SSE2-NEXT: psrad $1, %xmm1
2386 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2387 ; SSE2-NEXT: pxor %xmm0, %xmm0
2388 ; SSE2-NEXT: psubd %xmm1, %xmm0
2389 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2390 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2393 ; SSE41-LABEL: non_splat_minus_one_divisor_2:
2395 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2396 ; SSE41-NEXT: psrld $31, %xmm1
2397 ; SSE41-NEXT: paddd %xmm0, %xmm1
2398 ; SSE41-NEXT: psrad $1, %xmm1
2399 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2400 ; SSE41-NEXT: pxor %xmm0, %xmm0
2401 ; SSE41-NEXT: psubd %xmm1, %xmm0
2402 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
2403 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2406 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
2408 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
2409 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2410 ; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
2411 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2412 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2413 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2414 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2417 ; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
2418 ; AVX2ORLATER: # %bb.0:
2419 ; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1
2420 ; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2421 ; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2422 ; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
2423 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2424 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2425 ; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2426 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
2427 ; AVX2ORLATER-NEXT: retq
2429 ; XOP-LABEL: non_splat_minus_one_divisor_2:
2431 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm1
2432 ; XOP-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2433 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
2434 ; XOP-NEXT: vpshad {{.*}}(%rip), %xmm1, %xmm1
2435 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2436 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2437 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1
2438 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2440 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
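; <-1, 1, 2, -2> mixes the cases above: only the divide-by-2 lanes need the shift
; expansion, and the -1/-2 lanes are negated and blended in afterwards.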
2444 define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) {
2445 ; SSE-LABEL: combine_vec_sdiv_nonuniform:
2447 ; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
2448 ; SSE-NEXT: movdqa %xmm0, %xmm1
2449 ; SSE-NEXT: psrlw $15, %xmm1
2450 ; SSE-NEXT: paddw %xmm0, %xmm1
2451 ; SSE-NEXT: movdqa %xmm1, %xmm0
2454 ; AVX-LABEL: combine_vec_sdiv_nonuniform:
2456 ; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2457 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
2458 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2460 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22>
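; Non-power-of-two constant divisors are expanded with a fixed-point "magic"
; multiply: roughly q = mulhs(x, M) >> s, followed by adding the sign bit
; (the psrlw $15 / paddw pair) so the quotient rounds toward zero.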
2464 define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
2465 ; SSE2-LABEL: combine_vec_sdiv_nonuniform2:
2467 ; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
2468 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2469 ; SSE2-NEXT: psraw $2, %xmm1
2470 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2471 ; SSE2-NEXT: psraw $1, %xmm2
2472 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
2473 ; SSE2-NEXT: psrlw $15, %xmm0
2474 ; SSE2-NEXT: paddw %xmm2, %xmm0
2477 ; SSE41-LABEL: combine_vec_sdiv_nonuniform2:
2479 ; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
2480 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2481 ; SSE41-NEXT: psraw $1, %xmm1
2482 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2483 ; SSE41-NEXT: psraw $2, %xmm2
2484 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2485 ; SSE41-NEXT: psrlw $15, %xmm0
2486 ; SSE41-NEXT: paddw %xmm2, %xmm0
2489 ; AVX1-LABEL: combine_vec_sdiv_nonuniform2:
2491 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2492 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
2493 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
2494 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2495 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2496 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2499 ; AVX2-LABEL: combine_vec_sdiv_nonuniform2:
2501 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2502 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1
2503 ; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2
2504 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2505 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2506 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2509 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform2:
2511 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2512 ; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1
2513 ; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2
2514 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2515 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2516 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2517 ; AVX512F-NEXT: retq
2519 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2:
2520 ; AVX512BW: # %bb.0:
2521 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2522 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2523 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2524 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2525 ; AVX512BW-NEXT: retq
2527 ; XOP-LABEL: combine_vec_sdiv_nonuniform2:
2529 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
2530 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2531 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2532 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2534 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25>
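; When lanes need different post-shifts, pre-AVX512BW targets build them from two
; immediate psraw/vpsraw values and a blend; AVX512BW can use vpsravw directly,
; as the checks above show.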
2538 define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
2539 ; SSE2-LABEL: combine_vec_sdiv_nonuniform3:
2541 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2542 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2543 ; SSE2-NEXT: paddw %xmm0, %xmm1
2544 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2545 ; SSE2-NEXT: psraw $4, %xmm0
2546 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2547 ; SSE2-NEXT: psraw $8, %xmm2
2548 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2549 ; SSE2-NEXT: psrlw $15, %xmm1
2550 ; SSE2-NEXT: paddw %xmm2, %xmm1
2551 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2554 ; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
2556 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
2557 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2558 ; SSE41-NEXT: paddw %xmm0, %xmm1
2559 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2560 ; SSE41-NEXT: psraw $8, %xmm0
2561 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2562 ; SSE41-NEXT: psraw $4, %xmm2
2563 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2564 ; SSE41-NEXT: psrlw $15, %xmm1
2565 ; SSE41-NEXT: paddw %xmm2, %xmm1
2566 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2569 ; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
2571 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2572 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2573 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2574 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2575 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2576 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2577 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2580 ; AVX2-LABEL: combine_vec_sdiv_nonuniform3:
2582 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2583 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2584 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2585 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2586 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2587 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2588 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2591 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform3:
2593 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2594 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2595 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2596 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2597 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2598 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2599 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2600 ; AVX512F-NEXT: retq
2602 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3:
2603 ; AVX512BW: # %bb.0:
2604 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2605 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2606 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2607 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2608 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2609 ; AVX512BW-NEXT: retq
2611 ; XOP-LABEL: combine_vec_sdiv_nonuniform3:
2613 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2614 ; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2615 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2616 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2617 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2619 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511>
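; If the magic constant does not fit in a signed i16, the expansion adds the
; dividend back after the multiply-high (the paddw %xmm0 above); the next test
; covers the negative-divisor variant, which subtracts it instead.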
2623 define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
2624 ; SSE2-LABEL: combine_vec_sdiv_nonuniform4:
2626 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2627 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
2628 ; SSE2-NEXT: psubw %xmm0, %xmm1
2629 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2630 ; SSE2-NEXT: psraw $4, %xmm0
2631 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2632 ; SSE2-NEXT: psraw $8, %xmm2
2633 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2634 ; SSE2-NEXT: psrlw $15, %xmm1
2635 ; SSE2-NEXT: paddw %xmm2, %xmm1
2636 ; SSE2-NEXT: movdqa %xmm1, %xmm0
2639 ; SSE41-LABEL: combine_vec_sdiv_nonuniform4:
2641 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
2642 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
2643 ; SSE41-NEXT: psubw %xmm0, %xmm1
2644 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2645 ; SSE41-NEXT: psraw $8, %xmm0
2646 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2647 ; SSE41-NEXT: psraw $4, %xmm2
2648 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
2649 ; SSE41-NEXT: psrlw $15, %xmm1
2650 ; SSE41-NEXT: paddw %xmm2, %xmm1
2651 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2654 ; AVX1-LABEL: combine_vec_sdiv_nonuniform4:
2656 ; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2657 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2658 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
2659 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
2660 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
2661 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
2662 ; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2665 ; AVX2-LABEL: combine_vec_sdiv_nonuniform4:
2667 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2668 ; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2669 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
2670 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2671 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2672 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
2673 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2676 ; AVX512F-LABEL: combine_vec_sdiv_nonuniform4:
2678 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2679 ; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2680 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
2681 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2682 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2683 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
2684 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
2685 ; AVX512F-NEXT: retq
2687 ; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4:
2688 ; AVX512BW: # %bb.0:
2689 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2690 ; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2691 ; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
2692 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
2693 ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2694 ; AVX512BW-NEXT: retq
2696 ; XOP-LABEL: combine_vec_sdiv_nonuniform4:
2698 ; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
2699 ; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0
2700 ; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
2701 ; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
2702 ; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
2704 %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510>
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256>
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: paddw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
  ret <8 x i16> %1
}

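; Here the divisors include -32768, -1, 1 and 32767; the lanes divided by +/-1 need no rounding, so
; the psrlw $15 sign-bit term is masked (pand) or blended with zero for those lanes.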
define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8>
; SSE41-NEXT: pmulhw %xmm0, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; SSE41-NEXT: psrlw $15, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; SSE41-NEXT: paddw %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP: # %bb.0:
; XOP-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
  ret <8 x i16> %1
}

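; With divisors of only -1 and 1 there is no multiply at all: the negated input is blended with the
; original value.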
define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %1
}

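; PR38658: <16 x i8> sdiv where only the last lane has a non-unit divisor (7); the expansion
; sign-extends to 16-bit lanes for the multiply and shifts, then packs back down to bytes.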
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38658:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $6, %xmm2
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: psrlw $7, %xmm1
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: pr38658:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pr38658:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: pr38658:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pr38658:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; XOP-LABEL: pr38658:
; XOP: # %bb.0:
; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; XOP-NEXT: vpmovsxbw %xmm1, %xmm1
; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
  ret <16 x i8> %1
}

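; sdiv on i1 values folds to the numerator: the only well-defined divisor is true (-1 when
; sign-extended), and negation is a no-op on i1.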
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %r = sdiv i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %r = sdiv <4 x i1> %x, %y