; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
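; Lowering of llvm.vp.* integer intrinsics with an all-true mask on SSE, AVX1,
; AVX2 and AVX512VL targets. For the non-trapping operations the explicit
; vector length %vp can safely be ignored, so they lower like their plain
; vector counterparts.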
define void @vp_add_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_add_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_add_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_sub_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_sub_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_sub_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_mul_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_mul_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_mul_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
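; The division and remainder tests below differ from the operations above:
; since the operation may trap, the expansion compares the step vector
; [0,1,2,3] against a splat of %vp and forces the divisor to 1 in every lane
; at or beyond %vp before scalarizing into idivl/divl.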
define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_sdiv_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT: movd %xmm2, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT: movd %xmm3, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_sdiv_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractps $1, %xmm1, %ecx
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %ecx
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %esi
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %ecx
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %ecx
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_sdiv_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm2
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vextractps $1, %xmm1, %ecx
; AVX2-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %ecx
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %esi
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; AVX2-NEXT: vpextrd $2, %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %ecx
; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; AVX2-NEXT: vpextrd $3, %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %ecx
; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_sdiv_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %esi, %xmm2
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vpextrd $1, %xmm2, %ecx
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %ecx
; AVX512-NEXT: movl %eax, %ecx
; AVX512-NEXT: vmovd %xmm2, %esi
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %esi
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %ecx
; AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
; AVX512-NEXT: vpextrd $3, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %ecx
; AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_udiv_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT: movd %xmm2, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT: movd %xmm3, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_udiv_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractps $1, %xmm1, %ecx
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %ecx
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %esi
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %ecx
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %ecx
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_udiv_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm2
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vextractps $1, %xmm1, %ecx
; AVX2-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %ecx
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %esi
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; AVX2-NEXT: vpextrd $2, %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %ecx
; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; AVX2-NEXT: vpextrd $3, %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %ecx
; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_udiv_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %esi, %xmm2
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vpextrd $1, %xmm2, %ecx
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %ecx
; AVX512-NEXT: movl %eax, %ecx
; AVX512-NEXT: vmovd %xmm2, %esi
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %esi
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %ecx
; AVX512-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
; AVX512-NEXT: vpextrd $3, %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %ecx
; AVX512-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_srem_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT: movd %xmm2, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %edx, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT: movd %xmm3, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %edx, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %edx, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: cltd
; SSE-NEXT: idivl %ecx
; SSE-NEXT: movd %edx, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_srem_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractps $1, %xmm1, %ecx
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %ecx
; AVX1-NEXT: movl %edx, %ecx
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %esi
; AVX1-NEXT: vmovd %edx, %xmm2
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %ecx
; AVX1-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %ecx
; AVX1-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_srem_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm2
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vextractps $1, %xmm1, %ecx
; AVX2-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %ecx
; AVX2-NEXT: movl %edx, %ecx
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %esi
; AVX2-NEXT: vmovd %edx, %xmm2
; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; AVX2-NEXT: vpextrd $2, %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %ecx
; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; AVX2-NEXT: vpextrd $3, %xmm0, %eax
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %ecx
; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_srem_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %esi, %xmm2
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vpextrd $1, %xmm2, %ecx
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %ecx
; AVX512-NEXT: movl %edx, %ecx
; AVX512-NEXT: vmovd %xmm2, %esi
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %esi
; AVX512-NEXT: vmovd %edx, %xmm1
; AVX512-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %ecx
; AVX512-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
; AVX512-NEXT: vpextrd $3, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %ecx
; AVX512-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_urem_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE-NEXT: movd %xmm2, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %edx, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT: movd %xmm3, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE-NEXT: movd %xmm3, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %edx, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %edx, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: movd %xmm1, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: divl %ecx
; SSE-NEXT: movd %edx, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_urem_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractps $1, %xmm1, %ecx
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %ecx
; AVX1-NEXT: movl %edx, %ecx
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %esi
; AVX1-NEXT: vmovd %edx, %xmm2
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %ecx
; AVX1-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %ecx
; AVX1-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_urem_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm2
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vextractps $1, %xmm1, %ecx
; AVX2-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %ecx
; AVX2-NEXT: movl %edx, %ecx
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %esi
; AVX2-NEXT: vmovd %edx, %xmm2
; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; AVX2-NEXT: vpextrd $2, %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %ecx
; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; AVX2-NEXT: vpextrd $3, %xmm0, %eax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %ecx
; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_urem_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %esi, %xmm2
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vpextrd $1, %xmm2, %ecx
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %ecx
; AVX512-NEXT: movl %edx, %ecx
; AVX512-NEXT: vmovd %xmm2, %esi
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %esi
; AVX512-NEXT: vmovd %edx, %xmm1
; AVX512-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %ecx
; AVX512-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
; AVX512-NEXT: vpextrd $3, %xmm0, %eax
; AVX512-NEXT: xorl %edx, %edx
; AVX512-NEXT: divl %ecx
; AVX512-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_ashr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_ashr_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrad %xmm2, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrad %xmm2, %xmm4
; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrad %xmm2, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT: psrad %xmm1, %xmm0
; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
; SSE-NEXT: movaps %xmm4, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_ashr_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_ashr_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_ashr_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_lshr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_lshr_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld %xmm2, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrld %xmm2, %xmm4
; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld %xmm2, %xmm3
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT: psrld %xmm1, %xmm0
; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
; SSE-NEXT: movaps %xmm4, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_lshr_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_lshr_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_lshr_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_shl_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_shl_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pslld $23, %xmm1
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_shl_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_shl_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_shl_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_or_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_or_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_or_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.or.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_and_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_and_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_and_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.and.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_xor_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_xor_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_xor_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.xor.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_abs_v4i32(<4 x i32> %a0, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_abs_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_abs_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpabsd %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.abs.v4i32(<4 x i32> %a0, i1 false, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.abs.v4i32(<4 x i32>, i1 immarg, <4 x i1>, i32)
define void @vp_smax_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_smax_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pcmpgtd %xmm1, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_smax_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.smax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_smin_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_smin_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_smin_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.smin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_umax_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_umax_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm3
; SSE-NEXT: pxor %xmm0, %xmm2
; SSE-NEXT: pcmpgtd %xmm3, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_umax_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.umax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_umin_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_umin_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm3
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm3, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_umin_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x i32> @llvm.vp.umin.v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x i32> %res, ptr %out
  ret void
}
declare <4 x i32> @llvm.vp.umin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)