; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW
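
; Editor's note (not autogenerated): x86 has no vector integer division
; instruction, so the sequences checked below lower udiv/urem by a constant
; into multiply-high "magic number" sequences (Hacker's Delight magicu). For
; the splat divide-by-7 tests, the multiplier M for element width N satisfies
; 2^N + M = ceil(2^(N+3)/7), giving q = (n + umulhi(n, M)) >> 3; the
; subtract/shift/add dance below evaluates that (N+1)-bit sum without
; overflow. The per-test notes that follow are likewise editorial commentary.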

;
; udiv by 7
;

define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    shrq %rcx
; SSE2-NEXT:    addq %rdx, %rcx
; SSE2-NEXT:    movq %rcx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    subq %rdx, %rcx
; SSE2-NEXT:    shrq %rcx
; SSE2-NEXT:    addq %rdx, %rcx
; SSE2-NEXT:    movq %rcx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    psrlq $2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    shrq %rcx
; SSE41-NEXT:    addq %rdx, %rcx
; SSE41-NEXT:    movq %rcx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    subq %rdx, %rcx
; SSE41-NEXT:    shrq %rcx
; SSE41-NEXT:    addq %rdx, %rcx
; SSE41-NEXT:    movq %rcx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    psrlq $2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    shrq %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    subq %rdx, %rcx
; AVX-NEXT:    shrq %rcx
; AVX-NEXT:    addq %rdx, %rcx
; AVX-NEXT:    vmovq %rcx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vpsrlq $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = udiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
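
; Editor's note (not autogenerated): 2635249153387078803 = 0x2492492492492493
; is the magic multiplier M with 2^64 + M = ceil(2^67/7), so
; n/7 = (n + umulhi(n, M)) >> 3. Because n + umulhi(n, M) can need 65 bits,
; llc emits it as hi = umulhi(n, M); q = (((n - hi) >> 1) + hi) >> 2.
; Worked example, taking n = 100: hi = floor(100 * M / 2^64) = 14,
; (100 - 14) >> 1 = 43, (43 + 14) >> 2 = 14 = 100/7.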

define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    psubd %xmm2, %xmm0
; SSE2-NEXT:    psrld $1, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    psrld $2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pmuludq %xmm2, %xmm1
; SSE41-NEXT:    pmuludq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    psubd %xmm2, %xmm0
; SSE41-NEXT:    psrld $1, %xmm0
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    psrld $2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}
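
; Editor's note (not autogenerated): 613566757 = 0x24924925 is the 32-bit
; magic with 2^32 + M = ceil(2^35/7). SSE has no 32-bit multiply-high
; instruction, so pmuludq forms 32x32->64-bit products of the even lanes, a
; pshufd/pmuludq pair handles the odd lanes, and the high halves are merged
; (punpckldq on SSE2, pblendw on SSE4.1, vpblendd on AVX2) before the same
; subtract/shift/add fixup as the i64 case.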

define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
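
; Editor's note (not autogenerated): for i16 the magic is 9363 = 0x2493, with
; 2^16 + 9363 = 74899 = ceil(2^19/7). Unlike the i32 case, pmulhuw yields the
; high 16 bits of the product directly, so no lane shuffling is needed.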

define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; SSE2-NEXT:    pmullw %xmm3, %xmm4
; SSE2-NEXT:    psrlw $8, %xmm4
; SSE2-NEXT:    packuswb %xmm2, %xmm4
; SSE2-NEXT:    psubb %xmm4, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddb %xmm4, %xmm0
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
; SSE41-NEXT:    pmullw %xmm1, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmullw %xmm1, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm2, %xmm3
; SSE41-NEXT:    psubb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $1, %xmm0
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
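
; Editor's note (not autogenerated): SSE/AVX have no byte-wide multiply or
; byte shifts, so the i8 path widens to i16 (punpck*/pmovzxbw), does pmullw
; by the magic 37 (2^8 + 37 = 293 = ceil(2^11/7)), and takes the high byte
; with psrlw $8. The later psrlw $1/$2 steps operate on words, so the pand
; masks clear the bits that leak across byte boundaries.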

;
; udiv by non-splat constant
;

define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_divconstant_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    psubb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,128,0,0,0]
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64]
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_divconstant_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE41-NEXT:    psllw $7, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    psllw $7, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    packuswb %xmm3, %xmm4
; SSE41-NEXT:    psubb %xmm4, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,0,0,128]
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    paddb %xmm4, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64]
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64]
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_divconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpsllw $7, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,128,0,0,0,128]
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,64,32,32,32,128,128,64]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,256,128,32,32,32,64,64]
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,64,64,32,32,32,128,128,64]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = udiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 9, i8 7>
  ret <16 x i8> %res
}
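
; Editor's note (not autogenerated): with non-splat divisors each lane gets
; its own magic (37, 32, 57, 205, ... for divisors 7..14). The first pmullw
; by 128 or 256 (then psrlw $8) applies an optional one-bit pre-shift, the
; 0-or-128 multiplier selects per lane whether the round-up correction
; (n - hi) >> 1 participates, and the final power-of-two multipliers
; implement the per-lane post-shifts. AVX512BW expresses both shifts
; directly with vpsrlvw per-lane variable shifts.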

;
; urem by 7
;

define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    shrq %rax
; SSE2-NEXT:    addq %rdx, %rax
; SSE2-NEXT:    shrq $2, %rax
; SSE2-NEXT:    leaq (,%rax,8), %rdx
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    addq %rcx, %rax
; SSE2-NEXT:    movq %rax, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    mulq %rsi
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    shrq %rax
; SSE2-NEXT:    addq %rdx, %rax
; SSE2-NEXT:    shrq $2, %rax
; SSE2-NEXT:    leaq (,%rax,8), %rdx
; SSE2-NEXT:    subq %rdx, %rax
; SSE2-NEXT:    addq %rcx, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    shrq %rax
; SSE41-NEXT:    addq %rdx, %rax
; SSE41-NEXT:    shrq $2, %rax
; SSE41-NEXT:    leaq (,%rax,8), %rdx
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    addq %rcx, %rax
; SSE41-NEXT:    movq %rax, %xmm1
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    mulq %rsi
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    shrq %rax
; SSE41-NEXT:    addq %rdx, %rax
; SSE41-NEXT:    shrq $2, %rax
; SSE41-NEXT:    leaq (,%rax,8), %rdx
; SSE41-NEXT:    subq %rdx, %rax
; SSE41-NEXT:    addq %rcx, %rax
; SSE41-NEXT:    movq %rax, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    addq %rdx, %rax
; AVX-NEXT:    shrq $2, %rax
; AVX-NEXT:    leaq (,%rax,8), %rdx
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    addq %rcx, %rax
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    mulq %rsi
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    addq %rdx, %rax
; AVX-NEXT:    shrq $2, %rax
; AVX-NEXT:    leaq (,%rax,8), %rdx
; AVX-NEXT:    subq %rdx, %rax
; AVX-NEXT:    addq %rcx, %rax
; AVX-NEXT:    vmovq %rax, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = urem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
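
; Editor's note (not autogenerated): the remainder reuses the division
; sequence and folds the multiply by 7 into an lea: with quotient q in %rax,
; leaq (,%rax,8), %rdx forms 8q, subq gives q - 8q = -7q, and adding the
; original n yields n - 7q = n % 7.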

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pslld $3, %xmm2
; SSE2-NEXT:    psubd %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pmuludq %xmm2, %xmm1
; SSE41-NEXT:    pmuludq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    psrld $1, %xmm1
; SSE41-NEXT:    paddd %xmm2, %xmm1
; SSE41-NEXT:    psrld $2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pslld $3, %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $3, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrld $1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $3, %xmm1, %xmm2
; AVX2-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}
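
; Editor's note (not autogenerated): the vector form of the same fold:
; pslld $3 forms 8q, psubd gives q - 8q = -7q, and paddd with the original
; operand produces n - 7q.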

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psubw %xmm1, %xmm2
; SSE-NEXT:    psrlw $1, %xmm2
; SSE-NEXT:    paddw %xmm1, %xmm2
; SSE-NEXT:    psrlw $2, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    psllw $3, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm2
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
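
; Editor's note (not autogenerated): same shift-and-subtract fold with
; psllw $3; word shifts exist natively, so no masking is needed here, in
; contrast to the i8 version below.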

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; SSE2-NEXT:    pmullw %xmm3, %xmm4
; SSE2-NEXT:    psrlw $8, %xmm4
; SSE2-NEXT:    packuswb %xmm2, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psubb %xmm4, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm4, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psllw $3, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psubb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
; SSE41-NEXT:    pmullw %xmm1, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmullw %xmm1, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psubb %xmm3, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psllw $3, %xmm2
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm1
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT:    vpsrlw $1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
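
; Editor's note (not autogenerated): for bytes the multiply by 7 becomes
; psllw $3 plus a pand with 0xF8 bytes, which clears the bits a word-wide
; shift spills into the neighbouring byte lane, followed by psubb and paddb.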

;
; urem by non-splat constant
;

define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_remconstant_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubb %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,128,0,0,0,128]
; SSE2-NEXT:    psrlw $8, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm4, %xmm2
; SSE2-NEXT:    paddb %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,13,12,11,10,9,9,7]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    psubb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE41-NEXT:    psllw $7, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    psllw $7, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    packuswb %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubb %xmm4, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,128,0,0,0]
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm2, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT:    psllw $8, %xmm1
; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    por %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_remconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpsllw $7, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [128,0,0,0,128,0,0,0]
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,64,32,32,32,128,128,64]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,256,128,32,32,32,64,64]
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,64,64,32,32,32,128,128,64]
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = urem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 9, i8 7>
  ret <16 x i8> %res
}
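
; Editor's note (not autogenerated): to multiply the per-lane quotients back
; by the divisors without a byte multiply, the SSE4.1/AVX1 paths use
; pmaddubsw with the divisors interleaved with zeros ([7,0,9,0,...] and
; [0,8,0,10,...]): each pmaddubsw yields the even- or odd-lane byte products
; as words, which psllw $8/pand/por knit back into bytes before the final
; psubb. The SSE2 and 256-bit paths instead multiply by [7,8,...,14] in the
; already-widened word domain and re-pack with pand/packuswb.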