; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
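
; Expansion of VP (vector-predicated) integer intrinsics on x86, which has no
; native predicated vector instructions. Every call below passes an all-ones
; mask, so the mask and the %vp (EVL) operand can simply be dropped whenever
; the underlying operation cannot trap.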
define void @vp_add_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_add_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_add_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_add_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_sub_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_sub_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_sub_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_sub_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_mul_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_mul_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_mul_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_mul_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

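; Division cannot ignore the EVL: a divisor lane at or beyond %vp may be zero
; and the scalarized idivl would trap on it. The expansion therefore builds a
; lane-index-versus-EVL mask (vpmaxud/vpcmpeqd against [0,1,2,3] with AVX, a
; biased signed compare on plain SSE) and forces the divisor to 1 in all
; lanes past the EVL before dividing element by element.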
define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_sdiv_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT:    vextractps $1, %xmm1, %ecx
; X86-NEXT:    vpextrd $1, %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ecx
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    vmovd %xmm1, %edi
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %edi
; X86-NEXT:    vmovd %eax, %xmm2
; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; X86-NEXT:    vpextrd $2, %xmm1, %ecx
; X86-NEXT:    vpextrd $2, %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ecx
; X86-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
; X86-NEXT:    vpextrd $3, %xmm1, %ecx
; X86-NEXT:    vpextrd $3, %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ecx
; X86-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_sdiv_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %esi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %eax, %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_sdiv_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vextractps $1, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %ecx
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %esi
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %ecx
; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %ecx
; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_sdiv_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vextractps $1, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %ecx
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %esi
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %ecx
; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %ecx
; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_sdiv_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %esi, %xmm2
; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vpextrd $1, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %ecx
; AVX512-NEXT:    movl %eax, %ecx
; AVX512-NEXT:    vmovd %xmm2, %esi
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %esi
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $2, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %ecx
; AVX512-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $3, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $3, %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %ecx
; AVX512-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

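; Unsigned division uses the same divisor guard; the only difference is that
; each 32-bit divide zero-extends with xorl %edx, %edx and divl instead of
; sign-extending with cltd and idivl.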
define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_udiv_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT:    vextractps $1, %xmm1, %ecx
; X86-NEXT:    vpextrd $1, %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    vmovd %xmm1, %edi
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    vmovd %eax, %xmm2
; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; X86-NEXT:    vpextrd $2, %xmm1, %ecx
; X86-NEXT:    vpextrd $2, %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
; X86-NEXT:    vpextrd $3, %xmm1, %ecx
; X86-NEXT:    vpextrd $3, %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_udiv_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %esi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %eax, %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_udiv_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vextractps $1, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %ecx
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %esi
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %ecx
; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %ecx
; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_udiv_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vextractps $1, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %ecx
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %esi
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %ecx
; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %ecx
; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_udiv_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %esi, %xmm2
; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vpextrd $1, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %ecx
; AVX512-NEXT:    movl %eax, %ecx
; AVX512-NEXT:    vmovd %xmm2, %esi
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %esi
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $2, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %ecx
; AVX512-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $3, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $3, %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %ecx
; AVX512-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

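; The remainder tests share the sdiv/udiv expansion but keep the remainder
; from %edx rather than the quotient from %eax.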
define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_srem_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT:    vextractps $1, %xmm1, %ecx
; X86-NEXT:    vpextrd $1, %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ecx
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    vmovd %xmm1, %edi
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %edi
; X86-NEXT:    vmovd %edx, %xmm2
; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; X86-NEXT:    vpextrd $2, %xmm1, %ecx
; X86-NEXT:    vpextrd $2, %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ecx
; X86-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
; X86-NEXT:    vpextrd $3, %xmm1, %ecx
; X86-NEXT:    vpextrd $3, %xmm0, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ecx
; X86-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_srem_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %esi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %edx, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %edx, %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %edx, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    cltd
; SSE-NEXT:    idivl %ecx
; SSE-NEXT:    movd %edx, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_srem_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vextractps $1, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %ecx
; AVX1-NEXT:    movl %edx, %ecx
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %esi
; AVX1-NEXT:    vmovd %edx, %xmm2
; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %ecx
; AVX1-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
; AVX1-NEXT:    cltd
; AVX1-NEXT:    idivl %ecx
; AVX1-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_srem_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vextractps $1, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %ecx
; AVX2-NEXT:    movl %edx, %ecx
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %esi
; AVX2-NEXT:    vmovd %edx, %xmm2
; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %ecx
; AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
; AVX2-NEXT:    cltd
; AVX2-NEXT:    idivl %ecx
; AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_srem_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %esi, %xmm2
; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vpextrd $1, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %ecx
; AVX512-NEXT:    movl %edx, %ecx
; AVX512-NEXT:    vmovd %xmm2, %esi
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %esi
; AVX512-NEXT:    vmovd %edx, %xmm1
; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $2, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %ecx
; AVX512-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $3, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $3, %xmm0, %eax
; AVX512-NEXT:    cltd
; AVX512-NEXT:    idivl %ecx
; AVX512-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_urem_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT:    vextractps $1, %xmm1, %ecx
; X86-NEXT:    vpextrd $1, %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    vmovd %xmm1, %edi
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    vmovd %edx, %xmm2
; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; X86-NEXT:    vpextrd $2, %xmm1, %ecx
; X86-NEXT:    vpextrd $2, %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
; X86-NEXT:    vpextrd $3, %xmm1, %ecx
; X86-NEXT:    vpextrd $3, %xmm0, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_urem_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %esi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm2, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %edx, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm3, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %edx, %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %edx, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm1, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    movd %edx, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_urem_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vextractps $1, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %ecx
; AVX1-NEXT:    movl %edx, %ecx
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %esi
; AVX1-NEXT:    vmovd %edx, %xmm2
; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %ecx
; AVX1-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    divl %ecx
; AVX1-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_urem_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT:    vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vextractps $1, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %ecx
; AVX2-NEXT:    movl %edx, %ecx
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %esi
; AVX2-NEXT:    vmovd %edx, %xmm2
; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %ecx
; AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    divl %ecx
; AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_urem_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %esi, %xmm2
; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vpextrd $1, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %ecx
; AVX512-NEXT:    movl %edx, %ecx
; AVX512-NEXT:    vmovd %xmm2, %esi
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %esi
; AVX512-NEXT:    vmovd %edx, %xmm1
; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $2, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %ecx
; AVX512-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; AVX512-NEXT:    vpextrd $3, %xmm2, %ecx
; AVX512-NEXT:    vpextrd $3, %xmm0, %eax
; AVX512-NEXT:    xorl %edx, %edx
; AVX512-NEXT:    divl %ecx
; AVX512-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

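; Shifts cannot trap, so mask and EVL are dropped. AVX2 and AVX512 have
; native per-element shifts (vpsravd/vpsrlvd/vpsllvd); SSE and AVX1 expand by
; shifting the whole vector by each lane's count and blending the results.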
define void @vp_ashr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_ashr_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; X86-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; X86-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_ashr_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad %xmm2, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrad %xmm2, %xmm4
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad %xmm2, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrad %xmm1, %xmm0
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
; SSE-NEXT:    movaps %xmm4, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_ashr_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_ashr_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_ashr_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_lshr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_lshr_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X86-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; X86-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; X86-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_lshr_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrld %xmm2, %xmm4
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm0
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
; SSE-NEXT:    movaps %xmm4, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_lshr_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_lshr_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_lshr_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

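; Without vpsllvd, a variable left shift is turned into a multiply by
; 2^amount: the counts are shifted into the float exponent field (pslld $23
; plus the exponent bias from the constant pool), converted back with
; cvttps2dq, and multiplied in.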
define void @vp_shl_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_shl_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpslld $23, %xmm1, %xmm1
; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT:    vcvttps2dq %xmm1, %xmm1
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_shl_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $23, %xmm1
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_shl_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_shl_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_shl_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

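; The bitwise ops are emitted in the floating-point domain (orps/andps/xorps
; and movaps): bit-identical to the integer forms and, in legacy SSE
; encoding, a byte shorter.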
define void @vp_or_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_or_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vorps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_or_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_or_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.or.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_and_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_and_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vandps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_and_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_and_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.and.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_xor_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_xor_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_xor_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_xor_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.xor.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_abs_v4i32(<4 x i32> %a0, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_abs_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpabsd %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_abs_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_abs_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpabsd %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.abs.v4i32(<4 x i32> %a0, i1 false, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.abs.v4i32(<4 x i32>, i1 immarg, <4 x i1>, i32)

define void @vp_smax_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_smax_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_smax_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_smax_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.smax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_smin_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_smin_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_smin_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_smin_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.smin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

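; SSE2 has no unsigned dword min/max, so umax/umin bias both operands by
; 0x80000000 (pxor with the 2147483648 splat) and use the signed pcmpgtd
; result to select between the originals.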
define void @vp_umax_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_umax_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_umax_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    pxor %xmm0, %xmm2
; SSE-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_umax_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.umax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

define void @vp_umin_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_umin_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_umin_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    pxor %xmm1, %xmm2
; SSE-NEXT:    pcmpgtd %xmm3, %xmm2
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_umin_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x i32> @llvm.vp.umin.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.umin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

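; bitreverse is a bswap of each dword followed by a bit reversal within each
; byte: two pshufb nibble-table lookups with AVX, and shift-and-mask swaps of
; 4, 2 and 1 bits on plain SSE (which also lacks pshufb for the byte swap and
; uses punpck/pshuflw/pshufhw/packuswb instead).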
define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vp_bitreverse_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm2
; X86-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; X86-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsrlw $4, %xmm0, %xmm0
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; X86-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpor %xmm0, %xmm2, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vp_bitreverse_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $4, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    psllw $4, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    psllw $2, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $1, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    paddb %xmm0, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_bitreverse_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_bitreverse_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_bitreverse_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.bitreverse.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}
declare <4 x i32> @llvm.vp.bitreverse.v4i32(<4 x i32>, <4 x i1>, i32)

define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vp_bswap_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    retl
;
; SSE-LABEL: vp_bswap_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_bswap_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}
declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32)