1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2
3 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE41
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX1
5 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
6 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL
9 define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
10 ; CHECK-SSE2-LABEL: test_urem_odd_25:
11 ; CHECK-SSE2: # %bb.0:
12 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
13 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
14 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
15 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
16 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
17 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
18 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
19 ; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
20 ; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
21 ; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
22 ; CHECK-SSE2-NEXT: retq
24 ; CHECK-SSE41-LABEL: test_urem_odd_25:
25 ; CHECK-SSE41: # %bb.0:
26 ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
27 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691]
28 ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
29 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
30 ; CHECK-SSE41-NEXT: psrld $31, %xmm0
31 ; CHECK-SSE41-NEXT: retq
33 ; CHECK-AVX1-LABEL: test_urem_odd_25:
34 ; CHECK-AVX1: # %bb.0:
35 ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
36 ; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
37 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
38 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
39 ; CHECK-AVX1-NEXT: retq
41 ; CHECK-AVX2-LABEL: test_urem_odd_25:
42 ; CHECK-AVX2: # %bb.0:
43 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
44 ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
45 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691]
46 ; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
47 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
48 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
49 ; CHECK-AVX2-NEXT: retq
51 ; CHECK-AVX512VL-LABEL: test_urem_odd_25:
52 ; CHECK-AVX512VL: # %bb.0:
53 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
54 ; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1
55 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
56 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
57 ; CHECK-AVX512VL-NEXT: retq
58 %urem = urem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
59 %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
60 %ret = zext <4 x i1> %cmp to <4 x i32>
61 ret <4 x i32> %ret
62 }
65 define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
66 ; CHECK-SSE2-LABEL: test_urem_even_100:
67 ; CHECK-SSE2: # %bb.0:
68 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
69 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
70 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
71 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
72 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
73 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
74 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
75 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
76 ; CHECK-SSE2-NEXT: psrld $5, %xmm2
77 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
78 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
79 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
80 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
81 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
82 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
83 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
84 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
85 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
86 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
87 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
88 ; CHECK-SSE2-NEXT: retq
90 ; CHECK-SSE41-LABEL: test_urem_even_100:
91 ; CHECK-SSE41: # %bb.0:
92 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
93 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
94 ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
95 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
96 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
97 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
98 ; CHECK-SSE41-NEXT: psrld $5, %xmm2
99 ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
100 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
101 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
102 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
103 ; CHECK-SSE41-NEXT: psrld $31, %xmm0
104 ; CHECK-SSE41-NEXT: retq
106 ; CHECK-AVX1-LABEL: test_urem_even_100:
107 ; CHECK-AVX1: # %bb.0:
108 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
109 ; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
110 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
111 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
112 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
113 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
114 ; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
115 ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
116 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
117 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
118 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
119 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
120 ; CHECK-AVX1-NEXT: retq
122 ; CHECK-AVX2-LABEL: test_urem_even_100:
123 ; CHECK-AVX2: # %bb.0:
124 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
125 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
126 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
127 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
128 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
129 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
130 ; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1
131 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
132 ; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
133 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
134 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
135 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
136 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
137 ; CHECK-AVX2-NEXT: retq
139 ; CHECK-AVX512VL-LABEL: test_urem_even_100:
140 ; CHECK-AVX512VL: # %bb.0:
141 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0
142 ; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0
143 ; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1
144 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
145 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
146 ; CHECK-AVX512VL-NEXT: retq
147 %urem = urem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
148 %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
149 %ret = zext <4 x i1> %cmp to <4 x i32>
150 ret <4 x i32> %ret
151 }
153 ;------------------------------------------------------------------------------;
154 ; Comparison constant has undef elements.
155 ;------------------------------------------------------------------------------;
157 define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
158 ; CHECK-SSE2-LABEL: test_urem_odd_undef1:
159 ; CHECK-SSE2: # %bb.0:
160 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
161 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
162 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
163 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
164 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
165 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
166 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
167 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
168 ; CHECK-SSE2-NEXT: psrld $3, %xmm2
169 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25]
170 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
171 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
172 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
173 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
174 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
175 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
176 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
177 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
178 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
179 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
180 ; CHECK-SSE2-NEXT: retq
182 ; CHECK-SSE41-LABEL: test_urem_odd_undef1:
183 ; CHECK-SSE41: # %bb.0:
184 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
185 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
186 ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
187 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
188 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
189 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
190 ; CHECK-SSE41-NEXT: psrld $3, %xmm2
191 ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
192 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
193 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
194 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
195 ; CHECK-SSE41-NEXT: psrld $31, %xmm0
196 ; CHECK-SSE41-NEXT: retq
198 ; CHECK-AVX1-LABEL: test_urem_odd_undef1:
199 ; CHECK-AVX1: # %bb.0:
200 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
201 ; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
202 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
203 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
204 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
205 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
206 ; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
207 ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
208 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
209 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
210 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
211 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
212 ; CHECK-AVX1-NEXT: retq
214 ; CHECK-AVX2-LABEL: test_urem_odd_undef1:
215 ; CHECK-AVX2: # %bb.0:
216 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
217 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
218 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
219 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
220 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
221 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
222 ; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1
223 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25]
224 ; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
225 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
226 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
227 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
228 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
229 ; CHECK-AVX2-NEXT: retq
231 ; CHECK-AVX512VL-LABEL: test_urem_odd_undef1:
232 ; CHECK-AVX512VL: # %bb.0:
233 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
234 ; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
235 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
236 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
237 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
238 ; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
239 ; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1
240 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
241 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
242 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
243 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
244 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
245 ; CHECK-AVX512VL-NEXT: retq
246 %urem = urem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
247 %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 undef, i32 0>
248 %ret = zext <4 x i1> %cmp to <4 x i32>
249 ret <4 x i32> %ret
250 }
252 define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
253 ; CHECK-SSE2-LABEL: test_urem_even_undef1:
254 ; CHECK-SSE2: # %bb.0:
255 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
256 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
257 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
258 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
259 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
260 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
261 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
262 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
263 ; CHECK-SSE2-NEXT: psrld $5, %xmm2
264 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
265 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
266 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
267 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
268 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
269 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
270 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
271 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
272 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
273 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
274 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
275 ; CHECK-SSE2-NEXT: retq
277 ; CHECK-SSE41-LABEL: test_urem_even_undef1:
278 ; CHECK-SSE41: # %bb.0:
279 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
280 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
281 ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
282 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
283 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
284 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
285 ; CHECK-SSE41-NEXT: psrld $5, %xmm2
286 ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
287 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
288 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
289 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
290 ; CHECK-SSE41-NEXT: psrld $31, %xmm0
291 ; CHECK-SSE41-NEXT: retq
293 ; CHECK-AVX1-LABEL: test_urem_even_undef1:
294 ; CHECK-AVX1: # %bb.0:
295 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
296 ; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
297 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
298 ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
299 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
300 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
301 ; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
302 ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
303 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
304 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
305 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
306 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
307 ; CHECK-AVX1-NEXT: retq
309 ; CHECK-AVX2-LABEL: test_urem_even_undef1:
310 ; CHECK-AVX2: # %bb.0:
311 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
312 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
313 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
314 ; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
315 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
316 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
317 ; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1
318 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
319 ; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
320 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
321 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
322 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
323 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
324 ; CHECK-AVX2-NEXT: retq
326 ; CHECK-AVX512VL-LABEL: test_urem_even_undef1:
327 ; CHECK-AVX512VL: # %bb.0:
328 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
329 ; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
330 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
331 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
332 ; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
333 ; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
334 ; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1
335 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
336 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
337 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
338 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
339 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
340 ; CHECK-AVX512VL-NEXT: retq
341 %urem = urem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
342 %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 undef, i32 0>
343 %ret = zext <4 x i1> %cmp to <4 x i32>
344 ret <4 x i32> %ret
345 }
347 ;------------------------------------------------------------------------------;
348 ; Special case divisors (powers of two, all-ones, one).
349 ;------------------------------------------------------------------------------;
351 ; We can lower remainder of division by powers of two much better elsewhere.
352 define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
353 ; CHECK-SSE-LABEL: test_urem_pow2:
354 ; CHECK-SSE: # %bb.0:
355 ; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm0
356 ; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
357 ; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0
358 ; CHECK-SSE-NEXT: psrld $31, %xmm0
359 ; CHECK-SSE-NEXT: retq
361 ; CHECK-AVX1-LABEL: test_urem_pow2:
362 ; CHECK-AVX1: # %bb.0:
363 ; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
364 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
365 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
366 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
367 ; CHECK-AVX1-NEXT: retq
369 ; CHECK-AVX2-LABEL: test_urem_pow2:
370 ; CHECK-AVX2: # %bb.0:
371 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
372 ; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
373 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
374 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
375 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
376 ; CHECK-AVX2-NEXT: retq
378 ; CHECK-AVX512VL-LABEL: test_urem_pow2:
379 ; CHECK-AVX512VL: # %bb.0:
380 ; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
381 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
382 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
383 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
384 ; CHECK-AVX512VL-NEXT: retq
385 %urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
386 %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
387 %ret = zext <4 x i1> %cmp to <4 x i32>
388 ret <4 x i32> %ret
389 }
391 ; We could lower remainder of division by all-ones much better elsewhere.
392 define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind {
393 ; CHECK-SSE2-LABEL: test_urem_allones:
394 ; CHECK-SSE2: # %bb.0:
395 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
396 ; CHECK-SSE2-NEXT: psubd %xmm0, %xmm1
397 ; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm1
398 ; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm1
399 ; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
400 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
401 ; CHECK-SSE2-NEXT: retq
403 ; CHECK-SSE41-LABEL: test_urem_allones:
404 ; CHECK-SSE41: # %bb.0:
405 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
406 ; CHECK-SSE41-NEXT: psubd %xmm0, %xmm1
407 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
408 ; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
409 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
410 ; CHECK-SSE41-NEXT: psrld $31, %xmm0
411 ; CHECK-SSE41-NEXT: retq
413 ; CHECK-AVX1-LABEL: test_urem_allones:
414 ; CHECK-AVX1: # %bb.0:
415 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
416 ; CHECK-AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
417 ; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
418 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
419 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
420 ; CHECK-AVX1-NEXT: retq
422 ; CHECK-AVX2-LABEL: test_urem_allones:
423 ; CHECK-AVX2: # %bb.0:
424 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
425 ; CHECK-AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0
426 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
427 ; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
428 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
429 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
430 ; CHECK-AVX2-NEXT: retq
432 ; CHECK-AVX512VL-LABEL: test_urem_allones:
433 ; CHECK-AVX512VL: # %bb.0:
434 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
435 ; CHECK-AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
436 ; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm1
437 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
438 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
439 ; CHECK-AVX512VL-NEXT: retq
440 %urem = urem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295>
441 %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
442 %ret = zext <4 x i1> %cmp to <4 x i32>
443 ret <4 x i32> %ret
444 }
446 ; If all divisors are ones, this is constant-folded.
447 define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind {
448 ; CHECK-SSE-LABEL: test_urem_one_eq:
449 ; CHECK-SSE: # %bb.0:
450 ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
451 ; CHECK-SSE-NEXT: retq
453 ; CHECK-AVX1-LABEL: test_urem_one_eq:
454 ; CHECK-AVX1: # %bb.0:
455 ; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
456 ; CHECK-AVX1-NEXT: retq
458 ; CHECK-AVX2-LABEL: test_urem_one_eq:
459 ; CHECK-AVX2: # %bb.0:
460 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
461 ; CHECK-AVX2-NEXT: retq
463 ; CHECK-AVX512VL-LABEL: test_urem_one_eq:
464 ; CHECK-AVX512VL: # %bb.0:
465 ; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
466 ; CHECK-AVX512VL-NEXT: retq
467 %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
468 %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
469 %ret = zext <4 x i1> %cmp to <4 x i32>
470 ret <4 x i32> %ret
471 }
472 define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind {
473 ; CHECK-SSE-LABEL: test_urem_one_ne:
474 ; CHECK-SSE: # %bb.0:
475 ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
476 ; CHECK-SSE-NEXT: retq
478 ; CHECK-AVX-LABEL: test_urem_one_ne:
479 ; CHECK-AVX: # %bb.0:
480 ; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
481 ; CHECK-AVX-NEXT: retq
482 %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
483 %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
484 %ret = zext <4 x i1> %cmp to <4 x i32>
485 ret <4 x i32> %ret
486 }