1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX2NOBW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
; Test case for sdiv of a <4 x i64> vector by a splat of 7 (see the %res
; instruction at the end of this function).
; Both the AVX1 and the AVX2 expectations below handle each 64-bit lane in
; general-purpose registers. The lane is moved to %rax, multiplied with imulq
; by the immediate 0x4924924924924925, and the %rdx result is adjusted with
; shrq $63 / sarq / addq before the lanes are re-packed with vpunpcklqdq and a
; 128-bit insert.
; The assertion lines are autogenerated (see the NOTE at the top of the file)
; and should be regenerated with the script rather than edited by hand.
10 define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
11 ; AVX1-LABEL: test_div7_4i64:
13 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
14 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
15 ; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
16 ; AVX1-NEXT: imulq %rcx
17 ; AVX1-NEXT: movq %rdx, %rax
18 ; AVX1-NEXT: shrq $63, %rax
19 ; AVX1-NEXT: sarq %rdx
20 ; AVX1-NEXT: addq %rax, %rdx
21 ; AVX1-NEXT: vmovq %rdx, %xmm2
22 ; AVX1-NEXT: vmovq %xmm1, %rax
23 ; AVX1-NEXT: imulq %rcx
24 ; AVX1-NEXT: movq %rdx, %rax
25 ; AVX1-NEXT: shrq $63, %rax
26 ; AVX1-NEXT: sarq %rdx
27 ; AVX1-NEXT: addq %rax, %rdx
28 ; AVX1-NEXT: vmovq %rdx, %xmm1
29 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
30 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
31 ; AVX1-NEXT: imulq %rcx
32 ; AVX1-NEXT: movq %rdx, %rax
33 ; AVX1-NEXT: shrq $63, %rax
34 ; AVX1-NEXT: sarq %rdx
35 ; AVX1-NEXT: addq %rax, %rdx
36 ; AVX1-NEXT: vmovq %rdx, %xmm2
37 ; AVX1-NEXT: vmovq %xmm0, %rax
38 ; AVX1-NEXT: imulq %rcx
39 ; AVX1-NEXT: movq %rdx, %rax
40 ; AVX1-NEXT: shrq $63, %rax
41 ; AVX1-NEXT: sarq %rdx
42 ; AVX1-NEXT: addq %rax, %rdx
43 ; AVX1-NEXT: vmovq %rdx, %xmm0
44 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
45 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
48 ; AVX2-LABEL: test_div7_4i64:
50 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
51 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
52 ; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
53 ; AVX2-NEXT: imulq %rcx
54 ; AVX2-NEXT: movq %rdx, %rax
55 ; AVX2-NEXT: shrq $63, %rax
56 ; AVX2-NEXT: sarq %rdx
57 ; AVX2-NEXT: addq %rax, %rdx
58 ; AVX2-NEXT: vmovq %rdx, %xmm2
59 ; AVX2-NEXT: vmovq %xmm1, %rax
60 ; AVX2-NEXT: imulq %rcx
61 ; AVX2-NEXT: movq %rdx, %rax
62 ; AVX2-NEXT: shrq $63, %rax
63 ; AVX2-NEXT: sarq %rdx
64 ; AVX2-NEXT: addq %rax, %rdx
65 ; AVX2-NEXT: vmovq %rdx, %xmm1
66 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
67 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
68 ; AVX2-NEXT: imulq %rcx
69 ; AVX2-NEXT: movq %rdx, %rax
70 ; AVX2-NEXT: shrq $63, %rax
71 ; AVX2-NEXT: sarq %rdx
72 ; AVX2-NEXT: addq %rax, %rdx
73 ; AVX2-NEXT: vmovq %rdx, %xmm0
74 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
75 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
84 %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
; Test case for sdiv of a <8 x i32> vector by a splat of 7.
; The AVX1 expectations split the input into two 128-bit halves
; (vextractf128 ... vinsertf128) and run the same vpmuldq / vpshufd / vpblendw
; sequence on each half, followed by vpaddd, vpsrld $31, vpsrad $2, vpaddd.
; The AVX2 expectations perform the equivalent sequence once on full ymm
; registers, using vpbroadcastd for the multiplier and vpblendd for the merge.
; Both use the 32-bit multiplier 2454267027.
88 define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
89 ; AVX1-LABEL: test_div7_8i32:
91 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
92 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
93 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
94 ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
95 ; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
96 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
97 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
98 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
99 ; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
100 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
101 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
102 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
103 ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
104 ; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
105 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
106 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
107 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
108 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm2
109 ; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
110 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
111 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
114 ; AVX2-LABEL: test_div7_8i32:
116 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
117 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
118 ; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
119 ; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
120 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
121 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
122 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
123 ; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
124 ; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
125 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
127 %res = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; Test case for sdiv of a <16 x i16> vector by a splat of 7.
; The AVX1 expectations process the two 128-bit halves separately, each with
; vpmulhw against a register holding 18725 in every lane, then
; vpsrlw $15 / vpsraw $1 / vpaddw.
; The AVX2 expectations do the same on a full ymm register, with the vpmulhw
; multiplier read from a constant-pool operand.
131 define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
132 ; AVX1-LABEL: test_div7_16i16:
134 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
135 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
136 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1
137 ; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3
138 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
139 ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
140 ; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0
141 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2
142 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
143 ; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
144 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
147 ; AVX2-LABEL: test_div7_16i16:
149 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
150 ; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm1
151 ; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
152 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
154 %res = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; Test case for sdiv of a <32 x i8> vector by a splat of 7.
; This function has three separate sets of expectations, because the two RUN
; lines that share the AVX2 prefix (+avx2 and +avx512bw) produce different code
; here and are checked under AVX2NOBW and AVX512BW respectively.
;  - AVX1 unpacks bytes into 16-bit lanes per 128-bit half, multiplies with
;    vpmulhw by 37632, and repacks with vpackuswb.
;  - AVX2NOBW does the same unpack / vpmulhw / vpackuswb on ymm registers.
;  - AVX512BW sign-extends to a zmm register with vpmovsxbw, multiplies with
;    vpmullw, and truncates back with vpmovwb.
; All three then finish with byte-wise shift/mask/xor/add/sub fix-up steps.
158 define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
159 ; AVX1-LABEL: test_div7_32i8:
161 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
162 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
163 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
164 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
165 ; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
166 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
167 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
168 ; AVX1-NEXT: vpmulhw %xmm4, %xmm5, %xmm5
169 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
170 ; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
171 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
172 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
173 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
174 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
175 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
176 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
177 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
178 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
179 ; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
180 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
181 ; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
182 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
183 ; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
184 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
185 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
186 ; AVX1-NEXT: vpmulhw %xmm4, %xmm2, %xmm2
187 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
188 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
189 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
190 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
191 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
192 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
193 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
194 ; AVX1-NEXT: vpxor %xmm7, %xmm0, %xmm0
195 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
196 ; AVX1-NEXT: vpsubb %xmm7, %xmm0, %xmm0
197 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
200 ; AVX2NOBW-LABEL: test_div7_32i8:
202 ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
203 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
204 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
205 ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
206 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
207 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
208 ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
209 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
210 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
211 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
212 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1
213 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
214 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
215 ; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1
216 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0
217 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
218 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
219 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
220 ; AVX2NOBW-NEXT: retq
222 ; AVX512BW-LABEL: test_div7_32i8:
224 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
225 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
226 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
227 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
228 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
229 ; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1
230 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
231 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
232 ; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1
233 ; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0
234 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
235 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
236 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
237 ; AVX512BW-NEXT: retq
238 %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
243 ; sdiv by non-splat constant
; Test case for sdiv of a <32 x i8> vector by a non-splat constant vector.
; The divisors run 7, 8, ..., 22 across the first 16 lanes and 22, 21, ..., 7
; across the last 16 lanes (see the %res instruction at the end).
; Because the divisors differ per lane, the multipliers, masks and shift
; amounts in the expectations are mostly read from constant-pool operands
; (matched by the LCPI regex) rather than printed as inline splat constants.
; As with the splat i8 test, the output is checked separately for AVX1,
; AVX2NOBW and AVX512BW; the AVX512BW sequence uses vpsravw for the per-lane
; arithmetic shift.
246 define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
247 ; AVX1-LABEL: test_divconstant_32i8:
249 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
250 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
251 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
252 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
253 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
254 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
255 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
256 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
257 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
258 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
259 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
260 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
261 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
262 ; AVX1-NEXT: vpsraw $8, %xmm4, %xmm4
263 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
264 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
265 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
266 ; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5
267 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
268 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
269 ; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
270 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
271 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
272 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
273 ; AVX1-NEXT: vpaddb %xmm1, %xmm4, %xmm1
274 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
275 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
276 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
277 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
278 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
279 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
280 ; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
281 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
282 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
283 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
284 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
285 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
286 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
287 ; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
288 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
289 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
290 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
291 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
292 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
293 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
294 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
297 ; AVX2NOBW-LABEL: test_divconstant_32i8:
299 ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
300 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
301 ; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
302 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
303 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
304 ; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
305 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
306 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
307 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
308 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
309 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
310 ; AVX2NOBW-NEXT: vpsraw $8, %ymm1, %ymm1
311 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
312 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
313 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
314 ; AVX2NOBW-NEXT: vpsraw $8, %ymm2, %ymm2
315 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
316 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
317 ; AVX2NOBW-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
318 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0
319 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
320 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
321 ; AVX2NOBW-NEXT: retq
323 ; AVX512BW-LABEL: test_divconstant_32i8:
325 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
326 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
327 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
328 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
329 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
330 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
331 ; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm1
332 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
333 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
334 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
335 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
336 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
337 ; AVX512BW-NEXT: retq
338 %res = sdiv <32 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
; Test case for srem of a <4 x i64> vector by a splat of 7 (see the %res
; instruction at the end of this function).
; Both the AVX1 and the AVX2 expectations handle each 64-bit lane in
; general-purpose registers. The lane value is kept in %rcx, a quotient is
; formed in %rdx via imulq by 0x4924924924924925 plus shrq $63 / sarq / addq,
; and the remainder is then built with leaq (,%rdx,8) / subq / addq %rcx,
; i.e. q - 8*q + x.
; The lanes are re-packed with vpunpcklqdq and a 128-bit insert.
346 define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
347 ; AVX1-LABEL: test_rem7_4i64:
349 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
350 ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
351 ; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
352 ; AVX1-NEXT: movq %rcx, %rax
353 ; AVX1-NEXT: imulq %rsi
354 ; AVX1-NEXT: movq %rdx, %rax
355 ; AVX1-NEXT: shrq $63, %rax
356 ; AVX1-NEXT: sarq %rdx
357 ; AVX1-NEXT: addq %rax, %rdx
358 ; AVX1-NEXT: leaq (,%rdx,8), %rax
359 ; AVX1-NEXT: subq %rax, %rdx
360 ; AVX1-NEXT: addq %rcx, %rdx
361 ; AVX1-NEXT: vmovq %rdx, %xmm2
362 ; AVX1-NEXT: vmovq %xmm1, %rcx
363 ; AVX1-NEXT: movq %rcx, %rax
364 ; AVX1-NEXT: imulq %rsi
365 ; AVX1-NEXT: movq %rdx, %rax
366 ; AVX1-NEXT: shrq $63, %rax
367 ; AVX1-NEXT: sarq %rdx
368 ; AVX1-NEXT: addq %rax, %rdx
369 ; AVX1-NEXT: leaq (,%rdx,8), %rax
370 ; AVX1-NEXT: subq %rax, %rdx
371 ; AVX1-NEXT: addq %rcx, %rdx
372 ; AVX1-NEXT: vmovq %rdx, %xmm1
373 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
374 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
375 ; AVX1-NEXT: movq %rcx, %rax
376 ; AVX1-NEXT: imulq %rsi
377 ; AVX1-NEXT: movq %rdx, %rax
378 ; AVX1-NEXT: shrq $63, %rax
379 ; AVX1-NEXT: sarq %rdx
380 ; AVX1-NEXT: addq %rax, %rdx
381 ; AVX1-NEXT: leaq (,%rdx,8), %rax
382 ; AVX1-NEXT: subq %rax, %rdx
383 ; AVX1-NEXT: addq %rcx, %rdx
384 ; AVX1-NEXT: vmovq %rdx, %xmm2
385 ; AVX1-NEXT: vmovq %xmm0, %rcx
386 ; AVX1-NEXT: movq %rcx, %rax
387 ; AVX1-NEXT: imulq %rsi
388 ; AVX1-NEXT: movq %rdx, %rax
389 ; AVX1-NEXT: shrq $63, %rax
390 ; AVX1-NEXT: sarq %rdx
391 ; AVX1-NEXT: addq %rax, %rdx
392 ; AVX1-NEXT: leaq (,%rdx,8), %rax
393 ; AVX1-NEXT: subq %rax, %rdx
394 ; AVX1-NEXT: addq %rcx, %rdx
395 ; AVX1-NEXT: vmovq %rdx, %xmm0
396 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
397 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
400 ; AVX2-LABEL: test_rem7_4i64:
402 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
404 ; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
405 ; AVX2-NEXT: movq %rcx, %rax
406 ; AVX2-NEXT: imulq %rsi
407 ; AVX2-NEXT: movq %rdx, %rax
408 ; AVX2-NEXT: shrq $63, %rax
409 ; AVX2-NEXT: sarq %rdx
410 ; AVX2-NEXT: addq %rax, %rdx
411 ; AVX2-NEXT: leaq (,%rdx,8), %rax
412 ; AVX2-NEXT: subq %rax, %rdx
413 ; AVX2-NEXT: addq %rcx, %rdx
414 ; AVX2-NEXT: vmovq %rdx, %xmm2
415 ; AVX2-NEXT: vmovq %xmm1, %rcx
416 ; AVX2-NEXT: movq %rcx, %rax
417 ; AVX2-NEXT: imulq %rsi
418 ; AVX2-NEXT: movq %rdx, %rax
419 ; AVX2-NEXT: shrq $63, %rax
420 ; AVX2-NEXT: sarq %rdx
421 ; AVX2-NEXT: addq %rax, %rdx
422 ; AVX2-NEXT: leaq (,%rdx,8), %rax
423 ; AVX2-NEXT: subq %rax, %rdx
424 ; AVX2-NEXT: addq %rcx, %rdx
425 ; AVX2-NEXT: vmovq %rdx, %xmm1
426 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
427 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
428 ; AVX2-NEXT: movq %rcx, %rax
429 ; AVX2-NEXT: imulq %rsi
430 ; AVX2-NEXT: movq %rdx, %rax
431 ; AVX2-NEXT: shrq $63, %rax
432 ; AVX2-NEXT: sarq %rdx
433 ; AVX2-NEXT: addq %rax, %rdx
434 ; AVX2-NEXT: leaq (,%rdx,8), %rax
435 ; AVX2-NEXT: subq %rax, %rdx
436 ; AVX2-NEXT: addq %rcx, %rdx
437 ; AVX2-NEXT: vmovq %rdx, %xmm2
438 ; AVX2-NEXT: vmovq %xmm0, %rcx
439 ; AVX2-NEXT: movq %rcx, %rax
440 ; AVX2-NEXT: imulq %rsi
441 ; AVX2-NEXT: movq %rdx, %rax
442 ; AVX2-NEXT: shrq $63, %rax
443 ; AVX2-NEXT: sarq %rdx
444 ; AVX2-NEXT: addq %rax, %rdx
445 ; AVX2-NEXT: leaq (,%rdx,8), %rax
446 ; AVX2-NEXT: subq %rax, %rdx
447 ; AVX2-NEXT: addq %rcx, %rdx
448 ; AVX2-NEXT: vmovq %rdx, %xmm0
449 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
450 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
452 %res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
; Test case for srem of a <8 x i32> vector by a splat of 7.
; Both sets of expectations first form a quotient with the same vpmuldq-based
; sequence used for the i32 sdiv test (multiplier 2454267027), then derive the
; remainder from it in different ways.
;  - AVX1 works per 128-bit half and uses vpslld $3 / vpsubd / vpaddd,
;    i.e. q - 8*q + x.
;  - AVX2 multiplies the quotient by a broadcast 7 with vpmulld and subtracts
;    the product from the input with vpsubd.
456 define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
457 ; AVX1-LABEL: test_rem7_8i32:
459 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
460 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
461 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
462 ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
463 ; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
464 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
465 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
466 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
467 ; AVX1-NEXT: vpsrld $31, %xmm2, %xmm4
468 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
469 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
470 ; AVX1-NEXT: vpslld $3, %xmm2, %xmm4
471 ; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
472 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
473 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
474 ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
475 ; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
476 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
477 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
478 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
479 ; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
480 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
481 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
482 ; AVX1-NEXT: vpslld $3, %xmm2, %xmm3
483 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
484 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
485 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
488 ; AVX2-LABEL: test_rem7_8i32:
490 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
491 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
492 ; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
493 ; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
494 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
495 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
496 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
497 ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
498 ; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
499 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
500 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
501 ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
502 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
504 %res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; Test case for srem of a <16 x i16> vector by a splat of 7.
; Both sets of expectations form a quotient with vpmulhw followed by
; vpsrlw $15 / vpsraw $1 / vpaddw, then derive the remainder from it.
;  - AVX1 works per 128-bit half with the multiplier 18725 and uses
;    vpsllw $3 / vpsubw / vpaddw, i.e. q - 8*q + x.
;  - AVX2 multiplies the quotient with vpmullw by a constant-pool operand and
;    subtracts the product from the input with vpsubw.
508 define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
509 ; AVX1-LABEL: test_rem7_16i16:
511 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
512 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
513 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3
514 ; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4
515 ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
516 ; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
517 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm4
518 ; AVX1-NEXT: vpsubw %xmm4, %xmm3, %xmm3
519 ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
520 ; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm2
521 ; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm3
522 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
523 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
524 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
525 ; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
526 ; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
527 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
530 ; AVX2-LABEL: test_rem7_16i16:
532 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
533 ; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2
534 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1
535 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
536 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
537 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
539 %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; Test case for srem of a <32 x i8> vector by a splat of 7.
; The output is checked separately for AVX1, AVX2NOBW and AVX512BW, because
; the +avx2 and +avx512bw RUN lines produce different code for i8 elements.
; Each variant first forms a quotient with the same instruction sequence that
; is expected in the i8 sdiv-by-7 test.
; Each then derives the remainder with vpsllw $3, a vpand, vpsubb and vpaddb
; against the original input. The vpand mask is 248 per byte in the AVX1
; output and a constant-pool operand in the other two.
543 define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
544 ; AVX1-LABEL: test_rem7_32i8:
546 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
547 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
548 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
549 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
550 ; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
551 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
552 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
553 ; AVX1-NEXT: vpmulhw %xmm4, %xmm5, %xmm5
554 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
555 ; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
556 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm3
557 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm5
558 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
559 ; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
560 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
561 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
562 ; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
563 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
564 ; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
565 ; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
566 ; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
567 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5
568 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
569 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
570 ; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
571 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
572 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
573 ; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
574 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
575 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
576 ; AVX1-NEXT: vpmulhw %xmm4, %xmm2, %xmm2
577 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
578 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
579 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
580 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
581 ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
582 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
583 ; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2
584 ; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
585 ; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
586 ; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
587 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
588 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
589 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
590 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
591 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
594 ; AVX2NOBW-LABEL: test_rem7_32i8:
596 ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
597 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
598 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
599 ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
600 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
601 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
602 ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
603 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
604 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
605 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
606 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2
607 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
608 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
609 ; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2
610 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
611 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
612 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
613 ; AVX2NOBW-NEXT: vpsubb %ymm3, %ymm1, %ymm1
614 ; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2
615 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
616 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
617 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
618 ; AVX2NOBW-NEXT: retq
620 ; AVX512BW-LABEL: test_rem7_32i8:
622 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
623 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
624 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
625 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
626 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
627 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2
628 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
629 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
630 ; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2
631 ; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1
632 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
633 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
634 ; AVX512BW-NEXT: vpsubb %ymm3, %ymm1, %ymm1
635 ; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2
636 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
637 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
638 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
639 ; AVX512BW-NEXT: retq
640 %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
645 ; srem by non-splat constant
648 define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
649 ; AVX1-LABEL: test_remconstant_32i8:
651 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
652 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
653 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
654 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
655 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
656 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
657 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
658 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
659 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm3
660 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
661 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
662 ; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
663 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
664 ; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5
665 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
666 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
667 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
668 ; AVX1-NEXT: vpsraw $8, %xmm6, %xmm6
669 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
670 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
671 ; AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
672 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
673 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
674 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
675 ; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm5
676 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
677 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
678 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
679 ; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7
680 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
681 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
682 ; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
683 ; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5
684 ; AVX1-NEXT: vpsubb %xmm5, %xmm4, %xmm4
685 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
686 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
687 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
688 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
689 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
690 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
691 ; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
692 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
693 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
694 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
695 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
696 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
697 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
698 ; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5
699 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
700 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
701 ; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
702 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
703 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
704 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
705 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
706 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
707 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
708 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
709 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
710 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
711 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
712 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
713 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
716 ; AVX2NOBW-LABEL: test_remconstant_32i8:
718 ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
719 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
720 ; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
721 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
722 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
723 ; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
724 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
725 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
726 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
727 ; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
728 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
729 ; AVX2NOBW-NEXT: vpsraw $8, %ymm2, %ymm2
730 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
731 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
732 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
733 ; AVX2NOBW-NEXT: vpsraw $8, %ymm3, %ymm3
734 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
735 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
736 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
737 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
738 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
739 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
740 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
741 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
742 ; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
743 ; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2
744 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
745 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
746 ; AVX2NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
747 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
748 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
749 ; AVX2NOBW-NEXT: retq
751 ; AVX512BW-LABEL: test_remconstant_32i8:
753 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
754 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2
755 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
756 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
757 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
758 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
759 ; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm2
760 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
761 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
762 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
763 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
764 ; AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
765 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
766 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
767 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
768 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
769 ; AVX512BW-NEXT: retq
770 %res = srem <32 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>