1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; sdiv <8 x i64> by splat 7: no vector i64 multiply-high exists here, so each
; lane is extracted, multiplied via scalar imulq with the signed-division magic
; constant 0x4924924924924925, the quotient adjusted from %rdx, and the lanes
; repacked with vpunpcklqdq/vinserti128/vinserti64x4.
; NOTE(review): the embedded line numbering is non-contiguous (e.g. 17->19),
; so some autogenerated lines (the 'sarq %rdx' steps, 'retq', and the IR 'ret')
; are missing from this excerpt. Regenerate the CHECK lines with
; utils/update_llc_test_checks.py rather than editing them by hand.
9 define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
10 ; AVX-LABEL: test_div7_8i64:
12 ; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
13 ; AVX-NEXT: vpextrq $1, %xmm1, %rax
14 ; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
15 ; AVX-NEXT: imulq %rcx
16 ; AVX-NEXT: movq %rdx, %rax
17 ; AVX-NEXT: shrq $63, %rax
19 ; AVX-NEXT: addq %rax, %rdx
20 ; AVX-NEXT: vmovq %rdx, %xmm2
21 ; AVX-NEXT: vmovq %xmm1, %rax
22 ; AVX-NEXT: imulq %rcx
23 ; AVX-NEXT: movq %rdx, %rax
24 ; AVX-NEXT: shrq $63, %rax
26 ; AVX-NEXT: addq %rax, %rdx
27 ; AVX-NEXT: vmovq %rdx, %xmm1
28 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
29 ; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
30 ; AVX-NEXT: vpextrq $1, %xmm2, %rax
31 ; AVX-NEXT: imulq %rcx
32 ; AVX-NEXT: movq %rdx, %rax
33 ; AVX-NEXT: shrq $63, %rax
35 ; AVX-NEXT: addq %rax, %rdx
36 ; AVX-NEXT: vmovq %rdx, %xmm3
37 ; AVX-NEXT: vmovq %xmm2, %rax
38 ; AVX-NEXT: imulq %rcx
39 ; AVX-NEXT: movq %rdx, %rax
40 ; AVX-NEXT: shrq $63, %rax
42 ; AVX-NEXT: addq %rax, %rdx
43 ; AVX-NEXT: vmovq %rdx, %xmm2
44 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
45 ; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
46 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
47 ; AVX-NEXT: vpextrq $1, %xmm2, %rax
48 ; AVX-NEXT: imulq %rcx
49 ; AVX-NEXT: movq %rdx, %rax
50 ; AVX-NEXT: shrq $63, %rax
52 ; AVX-NEXT: addq %rax, %rdx
53 ; AVX-NEXT: vmovq %rdx, %xmm3
54 ; AVX-NEXT: vmovq %xmm2, %rax
55 ; AVX-NEXT: imulq %rcx
56 ; AVX-NEXT: movq %rdx, %rax
57 ; AVX-NEXT: shrq $63, %rax
59 ; AVX-NEXT: addq %rax, %rdx
60 ; AVX-NEXT: vmovq %rdx, %xmm2
61 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
62 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
63 ; AVX-NEXT: imulq %rcx
64 ; AVX-NEXT: movq %rdx, %rax
65 ; AVX-NEXT: shrq $63, %rax
67 ; AVX-NEXT: addq %rax, %rdx
68 ; AVX-NEXT: vmovq %rdx, %xmm3
69 ; AVX-NEXT: vmovq %xmm0, %rax
70 ; AVX-NEXT: imulq %rcx
71 ; AVX-NEXT: movq %rdx, %rax
72 ; AVX-NEXT: shrq $63, %rax
74 ; AVX-NEXT: addq %rax, %rdx
75 ; AVX-NEXT: vmovq %rdx, %xmm0
76 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
77 ; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
78 ; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
80 %res = sdiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
; sdiv <16 x i32> by splat 7: stays fully vectorized on zmm. vpmuldq computes
; the signed multiply-high with magic constant 2454267027 (0x92492493) for the
; even lanes, vpshufd+vpmuldq for the odd lanes, vpermi2d interleaves the high
; halves back together, then add/srl(31)/sra(2)/add finish the quotient fixup.
; NOTE(review): embedded numbering has gaps (e.g. 96->98) — the 'retq' and IR
; 'ret' lines are missing from this excerpt; regenerate the assertions with
; utils/update_llc_test_checks.py.
84 define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
85 ; AVX-LABEL: test_div7_16i32:
87 ; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
88 ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
89 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
90 ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
91 ; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
92 ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
93 ; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm0
94 ; AVX-NEXT: vpsrld $31, %zmm0, %zmm1
95 ; AVX-NEXT: vpsrad $2, %zmm0, %zmm0
96 ; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
98 %res = sdiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; sdiv <32 x i16> by splat 7, magic multiply-high 18725 (0x4925) + srl(15)/sra(1)
; fixup. AVX512F has no 512-bit vpmulhw, so it splits into two ymm halves;
; AVX512BW runs a single zmm vpmulhw — hence the separate check prefixes.
; NOTE(review): embedded numbering has gaps (115->118) — the AVX512F 'retq' and
; the IR 'ret' lines are missing from this excerpt; regenerate the assertions
; with utils/update_llc_test_checks.py.
102 define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
103 ; AVX512F-LABEL: test_div7_32i16:
105 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
106 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
107 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1
108 ; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm3
109 ; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1
110 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
111 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
112 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm2
113 ; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0
114 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
115 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
118 ; AVX512BW-LABEL: test_div7_32i16:
120 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm0
121 ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm1
122 ; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm0
123 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
124 ; AVX512BW-NEXT: retq
125 %res = sdiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; sdiv <64 x i8> by splat 7: no i8 multiply exists, so lanes are sign-extended
; to i16 (vpmovsxbw), multiplied by 65427 (-109 as i16, the i8 magic), the high
; byte taken via vpsrlw $8 + vpackuswb, then the result is adjusted. The i8
; arithmetic-shift-right-by-2 is emulated as srl+and(63)+xor(32)+sub(32).
; AVX512F works on ymm halves; AVX512BW uses zmm with vpmovwb to narrow.
; NOTE(review): embedded numbering has gaps (171->174) — the AVX512F 'retq' and
; the IR 'ret' lines are missing from this excerpt; regenerate the assertions
; with utils/update_llc_test_checks.py.
129 define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
130 ; AVX512F-LABEL: test_div7_64i8:
132 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
133 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
134 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
135 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
136 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
137 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
138 ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
139 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
140 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
141 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
142 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
143 ; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
144 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
145 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
146 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
147 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
148 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
149 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
150 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
151 ; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1
152 ; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
153 ; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
154 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
155 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
156 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
157 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
158 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm7
159 ; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm3
160 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
161 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
162 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
163 ; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
164 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
165 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
166 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
167 ; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
168 ; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0
169 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
170 ; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
171 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
174 ; AVX512BW-LABEL: test_div7_64i8:
176 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
177 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
178 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
179 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
180 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
181 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
182 ; AVX512BW-NEXT: vpmovsxbw %ymm3, %zmm3
183 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
184 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
185 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
186 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
187 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
188 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
189 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
190 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
191 ; AVX512BW-NEXT: vpxorq %zmm2, %zmm1, %zmm1
192 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
193 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
194 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
195 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
196 ; AVX512BW-NEXT: retq
197 %res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
202 ; sdiv by non-splat constant
; sdiv <64 x i8> by a NON-splat constant vector (divisors 7..38 then 38..7):
; per-lane magic constants come from constant-pool operands ({{.*}}(%rip)), so
; the multiply/shift amounts differ per lane. The pattern is the same shape as
; the splat case: unpack to i16, vpmullw by per-lane magics, take the high
; byte, repack, then a per-lane shift (AVX512BW uses vpsllvw for the variable
; shift) plus the +1-if-negative srl(7) fixup.
; NOTE(review): embedded numbering has gaps (269->272) — the AVX512F 'retq' and
; the IR 'ret' lines are missing from this excerpt; regenerate the assertions
; with utils/update_llc_test_checks.py.
205 define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
206 ; AVX512F-LABEL: test_divconstant_64i8:
208 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
209 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
210 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm3
211 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
212 ; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3
213 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
214 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
215 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
216 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
217 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4
218 ; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
219 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
220 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
221 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
222 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
223 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
224 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
225 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
226 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
227 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
228 ; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
229 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
230 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
231 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
232 ; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4
233 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
234 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
235 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
236 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm2
237 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
238 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
239 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
240 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
241 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
242 ; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3
243 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
244 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
245 ; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm1
246 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
247 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
248 ; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
249 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
250 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
251 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
252 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
253 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
254 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
255 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
256 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
257 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
258 ; AVX512F-NEXT: vpsraw $8, %ymm1, %ymm1
259 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
260 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
261 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
262 ; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
263 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
264 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
265 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
266 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0
267 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
268 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
269 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
272 ; AVX512BW-LABEL: test_divconstant_64i8:
274 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
275 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
276 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
277 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
278 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
279 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3
280 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
281 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1
282 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2
283 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm2, %zmm2
284 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
285 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
286 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
287 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
288 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0
289 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
290 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
291 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
292 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
293 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
294 ; AVX512BW-NEXT: vpsraw $8, %zmm1, %zmm1
295 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
296 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
297 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
298 ; AVX512BW-NEXT: vpsraw $8, %zmm2, %zmm2
299 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
300 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
301 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1
302 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
303 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
304 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
305 ; AVX512BW-NEXT: retq
306 %res = sdiv <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
; srem <8 x i64> by splat 7: same scalarized magic-multiply quotient as
; test_div7_8i64, then per lane the remainder is reconstructed as
; a - q*7, expressed here as q - (q*8 via leaq) + a.
; NOTE(review): unlike the sdiv test, this version keeps each lane's original
; value in %rcx so it can be added back at the end. The embedded numbering has
; gaps (415->417) — 'retq' and the IR 'ret' lines are missing from this
; excerpt; regenerate the CHECK lines with utils/update_llc_test_checks.py.
314 define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
315 ; AVX-LABEL: test_rem7_8i64:
317 ; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
318 ; AVX-NEXT: vpextrq $1, %xmm1, %rcx
319 ; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
320 ; AVX-NEXT: movq %rcx, %rax
321 ; AVX-NEXT: imulq %rsi
322 ; AVX-NEXT: movq %rdx, %rax
323 ; AVX-NEXT: shrq $63, %rax
324 ; AVX-NEXT: sarq %rdx
325 ; AVX-NEXT: addq %rax, %rdx
326 ; AVX-NEXT: leaq (,%rdx,8), %rax
327 ; AVX-NEXT: subq %rax, %rdx
328 ; AVX-NEXT: addq %rcx, %rdx
329 ; AVX-NEXT: vmovq %rdx, %xmm2
330 ; AVX-NEXT: vmovq %xmm1, %rcx
331 ; AVX-NEXT: movq %rcx, %rax
332 ; AVX-NEXT: imulq %rsi
333 ; AVX-NEXT: movq %rdx, %rax
334 ; AVX-NEXT: shrq $63, %rax
335 ; AVX-NEXT: sarq %rdx
336 ; AVX-NEXT: addq %rax, %rdx
337 ; AVX-NEXT: leaq (,%rdx,8), %rax
338 ; AVX-NEXT: subq %rax, %rdx
339 ; AVX-NEXT: addq %rcx, %rdx
340 ; AVX-NEXT: vmovq %rdx, %xmm1
341 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
342 ; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
343 ; AVX-NEXT: vpextrq $1, %xmm2, %rcx
344 ; AVX-NEXT: movq %rcx, %rax
345 ; AVX-NEXT: imulq %rsi
346 ; AVX-NEXT: movq %rdx, %rax
347 ; AVX-NEXT: shrq $63, %rax
348 ; AVX-NEXT: sarq %rdx
349 ; AVX-NEXT: addq %rax, %rdx
350 ; AVX-NEXT: leaq (,%rdx,8), %rax
351 ; AVX-NEXT: subq %rax, %rdx
352 ; AVX-NEXT: addq %rcx, %rdx
353 ; AVX-NEXT: vmovq %rdx, %xmm3
354 ; AVX-NEXT: vmovq %xmm2, %rcx
355 ; AVX-NEXT: movq %rcx, %rax
356 ; AVX-NEXT: imulq %rsi
357 ; AVX-NEXT: movq %rdx, %rax
358 ; AVX-NEXT: shrq $63, %rax
359 ; AVX-NEXT: sarq %rdx
360 ; AVX-NEXT: addq %rax, %rdx
361 ; AVX-NEXT: leaq (,%rdx,8), %rax
362 ; AVX-NEXT: subq %rax, %rdx
363 ; AVX-NEXT: addq %rcx, %rdx
364 ; AVX-NEXT: vmovq %rdx, %xmm2
365 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
366 ; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
367 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
368 ; AVX-NEXT: vpextrq $1, %xmm2, %rcx
369 ; AVX-NEXT: movq %rcx, %rax
370 ; AVX-NEXT: imulq %rsi
371 ; AVX-NEXT: movq %rdx, %rax
372 ; AVX-NEXT: shrq $63, %rax
373 ; AVX-NEXT: sarq %rdx
374 ; AVX-NEXT: addq %rax, %rdx
375 ; AVX-NEXT: leaq (,%rdx,8), %rax
376 ; AVX-NEXT: subq %rax, %rdx
377 ; AVX-NEXT: addq %rcx, %rdx
378 ; AVX-NEXT: vmovq %rdx, %xmm3
379 ; AVX-NEXT: vmovq %xmm2, %rcx
380 ; AVX-NEXT: movq %rcx, %rax
381 ; AVX-NEXT: imulq %rsi
382 ; AVX-NEXT: movq %rdx, %rax
383 ; AVX-NEXT: shrq $63, %rax
384 ; AVX-NEXT: sarq %rdx
385 ; AVX-NEXT: addq %rax, %rdx
386 ; AVX-NEXT: leaq (,%rdx,8), %rax
387 ; AVX-NEXT: subq %rax, %rdx
388 ; AVX-NEXT: addq %rcx, %rdx
389 ; AVX-NEXT: vmovq %rdx, %xmm2
390 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
391 ; AVX-NEXT: vpextrq $1, %xmm0, %rcx
392 ; AVX-NEXT: movq %rcx, %rax
393 ; AVX-NEXT: imulq %rsi
394 ; AVX-NEXT: movq %rdx, %rax
395 ; AVX-NEXT: shrq $63, %rax
396 ; AVX-NEXT: sarq %rdx
397 ; AVX-NEXT: addq %rax, %rdx
398 ; AVX-NEXT: leaq (,%rdx,8), %rax
399 ; AVX-NEXT: subq %rax, %rdx
400 ; AVX-NEXT: addq %rcx, %rdx
401 ; AVX-NEXT: vmovq %rdx, %xmm3
402 ; AVX-NEXT: vmovq %xmm0, %rcx
403 ; AVX-NEXT: movq %rcx, %rax
404 ; AVX-NEXT: imulq %rsi
405 ; AVX-NEXT: movq %rdx, %rax
406 ; AVX-NEXT: shrq $63, %rax
407 ; AVX-NEXT: sarq %rdx
408 ; AVX-NEXT: addq %rax, %rdx
409 ; AVX-NEXT: leaq (,%rdx,8), %rax
410 ; AVX-NEXT: subq %rax, %rdx
411 ; AVX-NEXT: addq %rcx, %rdx
412 ; AVX-NEXT: vmovq %rdx, %xmm0
413 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
414 ; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
415 ; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
417 %res = srem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
; srem <16 x i32> by splat 7: computes the quotient exactly as in
; test_div7_16i32 (vpmuldq magic 2454267027 + shift fixup), then forms
; a - q*7 with a broadcast vpmulld {1to16} and vpsubd.
; NOTE(review): embedded numbering has gaps (435->437) — 'retq' and the IR
; 'ret' lines are missing from this excerpt; regenerate the CHECK lines with
; utils/update_llc_test_checks.py.
421 define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
422 ; AVX-LABEL: test_rem7_16i32:
424 ; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
425 ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
426 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
427 ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
428 ; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
429 ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
430 ; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm1
431 ; AVX-NEXT: vpsrld $31, %zmm1, %zmm2
432 ; AVX-NEXT: vpsrad $2, %zmm1, %zmm1
433 ; AVX-NEXT: vpaddd %zmm2, %zmm1, %zmm1
434 ; AVX-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1
435 ; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
437 %res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; srem <32 x i16> by splat 7: quotient as in test_div7_32i16 (vpmulhw 18725 +
; srl(15)/sra(1) fixup), then a - q*7 via vpmullw by 7 and vpsubw.
; AVX512F operates on two ymm halves; AVX512BW on a single zmm.
; NOTE(review): embedded numbering has gaps (459->462) — the AVX512F 'retq' and
; the IR 'ret' lines are missing from this excerpt; regenerate the CHECK lines
; with utils/update_llc_test_checks.py.
441 define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
442 ; AVX512F-LABEL: test_rem7_32i16:
444 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
445 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
446 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3
447 ; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
448 ; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
449 ; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3
450 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
451 ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
452 ; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
453 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
454 ; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3
455 ; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2
456 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
457 ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
458 ; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
459 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
462 ; AVX512BW-LABEL: test_rem7_32i16:
464 ; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm1
465 ; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2
466 ; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1
467 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
468 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
469 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
470 ; AVX512BW-NEXT: retq
471 %res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; srem <64 x i8> by splat 7: quotient as in test_div7_64i8 (widen to i16,
; vpmullw 65427, high byte, emulated i8 sra-by-2 via srl/and(63)/xor(32)/
; sub(32) plus the srl(7) sign fixup), then a - q*7 computed as
; a - (q*8 - q) using vpsllw $3 masked with 248 (0xF8, clears the bits
; shifted across byte lanes) and two vpsubb/vpaddb.
; NOTE(review): embedded numbering has gaps (526->529) — the AVX512F 'retq' and
; the IR 'ret' lines are missing from this excerpt; regenerate the CHECK lines
; with utils/update_llc_test_checks.py.
475 define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
476 ; AVX512F-LABEL: test_rem7_64i8:
478 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
479 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
480 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
481 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
482 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
483 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
484 ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
485 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
486 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
487 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
488 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
489 ; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
490 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
491 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
492 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
493 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
494 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
495 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
496 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
497 ; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2
498 ; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2
499 ; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
500 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm4
501 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
502 ; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
503 ; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
504 ; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
505 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
506 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
507 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
508 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
509 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
510 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
511 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
512 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
513 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
514 ; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm2
515 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
516 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
517 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
518 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
519 ; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2
520 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
521 ; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
522 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
523 ; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3
524 ; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
525 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
526 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
529 ; AVX512BW-LABEL: test_rem7_64i8:
531 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
532 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
533 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
534 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
535 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
536 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
537 ; AVX512BW-NEXT: vpmovsxbw %ymm3, %zmm3
538 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
539 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
540 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
541 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
542 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1
543 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2
544 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
545 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
546 ; AVX512BW-NEXT: vpxorq %zmm3, %zmm2, %zmm2
547 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
548 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
549 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
550 ; AVX512BW-NEXT: vpsubb %zmm3, %zmm1, %zmm1
551 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
552 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
553 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
554 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
555 ; AVX512BW-NEXT: retq
556 %res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
561 ; srem by non-splat constant
564 define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
565 ; AVX512F-LABEL: test_remconstant_64i8:
567 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
568 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
569 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm3
570 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
571 ; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3
572 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
573 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
574 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
575 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
576 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4
577 ; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
578 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
579 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
580 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm5
581 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
582 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
583 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
584 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
585 ; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
586 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
587 ; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4
588 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
589 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
590 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
591 ; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5
592 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
593 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
594 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
595 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5
596 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
597 ; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
598 ; AVX512F-NEXT: vpaddb %ymm5, %ymm4, %ymm4
599 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
600 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
601 ; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm5
602 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
603 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
604 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
605 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
606 ; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
607 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
608 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
609 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
610 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
611 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
612 ; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm5
613 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
614 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
615 ; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
616 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
617 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
618 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm6
619 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm6, %ymm6
620 ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
621 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5
622 ; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3]
623 ; AVX512F-NEXT: vpaddb %ymm4, %ymm5, %ymm4
624 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
625 ; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5
626 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
627 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
628 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
629 ; AVX512F-NEXT: vpsraw $8, %ymm6, %ymm6
630 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm6, %ymm6
631 ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
632 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5
633 ; AVX512F-NEXT: vpsrlw $7, %ymm4, %ymm4
634 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
635 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
636 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
637 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
638 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
639 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
640 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
641 ; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
642 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
643 ; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0
644 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
647 ; AVX512BW-LABEL: test_remconstant_64i8:
649 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
650 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
651 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
652 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
653 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
654 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3
655 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
656 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm3, %zmm1
657 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3
658 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3
659 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
660 ; AVX512BW-NEXT: vpmovwb %zmm3, %ymm3
661 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm4
662 ; AVX512BW-NEXT: vpmovsxbw %ymm4, %zmm4
663 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm4, %zmm4
664 ; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
665 ; AVX512BW-NEXT: vpmovwb %zmm4, %ymm4
666 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
667 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
668 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
669 ; AVX512BW-NEXT: vpsraw $8, %zmm3, %zmm3
670 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
671 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
672 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
673 ; AVX512BW-NEXT: vpsraw $8, %zmm4, %zmm4
674 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm4, %zmm4
675 ; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
676 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm4, %zmm3
677 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
678 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
679 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
680 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
681 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm3, %zmm3
682 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
683 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
684 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
685 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
686 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
687 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
688 ; AVX512BW-NEXT: retq
689 %res = srem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>