1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,GISEL %s
3 ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,CGP %s
5 ; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
; Unsigned division of two variable i32 operands.
; The GISEL and CGP expectations below are the same instruction sequence except
; for the reciprocal: GISEL expects v_rcp_iflag_f32, CGP expects v_rcp_f32.
7 define i32 @v_udiv_i32(i32 %num, i32 %den) {
8 ; GISEL-LABEL: v_udiv_i32:
10 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
12 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
13 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
14 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
16 ; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2
17 ; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3
18 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
19 ; GISEL-NEXT: v_mul_hi_u32 v2, v0, v2
20 ; GISEL-NEXT: v_mul_lo_u32 v3, v2, v1
21 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
22 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
23 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
24 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
25 ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
26 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
27 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v2
28 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
29 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
30 ; GISEL-NEXT: s_setpc_b64 s[30:31]
32 ; CGP-LABEL: v_udiv_i32:
34 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
36 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
37 ; CGP-NEXT: v_rcp_f32_e32 v2, v2
38 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
39 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
40 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
41 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
42 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
43 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
44 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1
45 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
46 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
47 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
48 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
49 ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
50 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
51 ; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v2
52 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
53 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
54 ; CGP-NEXT: s_setpc_b64 s[30:31]
55 %result = udiv i32 %num, %den
59 ; FIXME: This is a workaround for not handling uniform VGPR case.
60 declare i32 @llvm.amdgcn.readfirstlane(i32)
; Unsigned i32 division in an amdgpu_ps function whose operands are both
; marked inreg. The quotient is passed through llvm.amdgcn.readfirstlane, and
; the expected code ends with v_readfirstlane_b32 writing s0.
; GISEL expects v_rcp_iflag_f32 where CGP expects v_rcp_f32; the remaining
; expected instructions are identical.
62 define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
63 ; GISEL-LABEL: s_udiv_i32:
65 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, s1
66 ; GISEL-NEXT: s_sub_i32 s2, 0, s1
67 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
68 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
69 ; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
70 ; GISEL-NEXT: v_mul_lo_u32 v1, s2, v0
71 ; GISEL-NEXT: v_mul_hi_u32 v1, v0, v1
72 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
73 ; GISEL-NEXT: v_mul_hi_u32 v0, s0, v0
74 ; GISEL-NEXT: v_mul_lo_u32 v1, v0, s1
75 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0
76 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
77 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s1, v1
78 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
79 ; GISEL-NEXT: v_subrev_i32_e64 v2, s[2:3], s1, v1
80 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
81 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0
82 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s1, v1
83 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
84 ; GISEL-NEXT: v_readfirstlane_b32 s0, v0
85 ; GISEL-NEXT: ; return to shader part epilog
87 ; CGP-LABEL: s_udiv_i32:
89 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, s1
90 ; CGP-NEXT: s_sub_i32 s2, 0, s1
91 ; CGP-NEXT: v_rcp_f32_e32 v0, v0
92 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
93 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
94 ; CGP-NEXT: v_mul_lo_u32 v1, s2, v0
95 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
96 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
97 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0
98 ; CGP-NEXT: v_mul_lo_u32 v1, v0, s1
99 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
100 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
101 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s1, v1
102 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
103 ; CGP-NEXT: v_subrev_i32_e64 v2, s[2:3], s1, v1
104 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
105 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
106 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s1, v1
107 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
108 ; CGP-NEXT: v_readfirstlane_b32 s0, v0
109 ; CGP-NEXT: ; return to shader part epilog
110 %result = udiv i32 %num, %den
111 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %result)
; Unsigned division of two variable <2 x i32> operands.
; The expected code performs the i32 sequence once per element, with the two
; elements' instructions interleaved. GISEL expects v_rcp_iflag_f32 where CGP
; expects v_rcp_f32; the remaining expected instructions are identical.
115 define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
116 ; GISEL-LABEL: v_udiv_v2i32:
118 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
120 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
121 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
122 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
123 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
124 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
125 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
126 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
127 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
128 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
129 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
130 ; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
131 ; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
132 ; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
133 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
134 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
135 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
136 ; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
137 ; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
138 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
139 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
140 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
141 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
142 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
143 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
144 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
145 ; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
146 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
147 ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
148 ; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
149 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
150 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
151 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
152 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
153 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
154 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
155 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
156 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
157 ; GISEL-NEXT: s_setpc_b64 s[30:31]
159 ; CGP-LABEL: v_udiv_v2i32:
161 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
163 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
164 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
165 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
166 ; CGP-NEXT: v_rcp_f32_e32 v4, v4
167 ; CGP-NEXT: v_rcp_f32_e32 v6, v6
168 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
169 ; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
170 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
171 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
172 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
173 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
174 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
175 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
176 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
177 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
178 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
179 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
180 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
181 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
182 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
183 ; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
184 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
185 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
186 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
187 ; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
188 ; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
189 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
190 ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
191 ; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
192 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
193 ; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
194 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
195 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
196 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
197 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
198 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
199 ; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
200 ; CGP-NEXT: s_setpc_b64 s[30:31]
201 %result = udiv <2 x i32> %num, %den
202 ret <2 x i32> %result
; Unsigned i32 division by the constant 4096 (2^12).
; Both RUN lines share the CHECK prefix here: the expected code is a single
; logical right shift by 12.
205 define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
206 ; CHECK-LABEL: v_udiv_i32_pow2k_denom:
208 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 12, v0
210 ; CHECK-NEXT: s_setpc_b64 s[30:31]
211 %result = udiv i32 %num, 4096
; Unsigned <2 x i32> division by a splat of the constant 4096 (2^12).
; Both RUN lines share the CHECK prefix here: the expected code is one
; logical right shift by 12 per element.
215 define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
216 ; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
218 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 12, v0
220 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 12, v1
221 ; CHECK-NEXT: s_setpc_b64 s[30:31]
222 %result = udiv <2 x i32> %num, <i32 4096, i32 4096>
223 ret <2 x i32> %result
; Unsigned i32 division by the odd constant 1235195.
; Both RUN lines share the CHECK prefix here: the expected code is a
; v_mul_hi_u32 by the constant 0xb2a50881 followed by a sub / shift-by-1 / add /
; shift-by-20 fixup, with no float reciprocal sequence.
226 define i32 @v_udiv_i32_oddk_denom(i32 %num) {
227 ; CHECK-LABEL: v_udiv_i32_oddk_denom:
229 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230 ; CHECK-NEXT: v_mov_b32_e32 v1, 0xb2a50881
231 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
232 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
233 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
234 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
235 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
236 ; CHECK-NEXT: s_setpc_b64 s[30:31]
237 %result = udiv i32 %num, 1235195
; Unsigned <2 x i32> division by a splat of the odd constant 1235195.
; Both RUN lines share the CHECK prefix here: the constant 0xb2a50881 is
; materialized once in s4 and used by a v_mul_hi_u32 per element, followed by
; the same sub / shift-by-1 / add / shift-by-20 fixup for each element.
241 define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
242 ; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
244 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245 ; CHECK-NEXT: s_mov_b32 s4, 0xb2a50881
246 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, s4
247 ; CHECK-NEXT: v_mul_hi_u32 v3, v1, s4
248 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
249 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
250 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
251 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 1, v1
252 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
253 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
254 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
255 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1
256 ; CHECK-NEXT: s_setpc_b64 s[30:31]
257 %result = udiv <2 x i32> %num, <i32 1235195, i32 1235195>
258 ret <2 x i32> %result
; Unsigned i32 division where the denominator is 4096 shifted left by a
; variable amount (%y).
; Both RUN lines share the CHECK prefix here: after the v_lshl_b32 the expected
; code is the full reciprocal-based division sequence (using v_rcp_iflag_f32),
; not a shift of the numerator.
261 define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
262 ; CHECK-LABEL: v_udiv_i32_pow2_shl_denom:
264 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265 ; CHECK-NEXT: v_lshl_b32_e32 v1, 0x1000, v1
266 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1
267 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
268 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
269 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
270 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
271 ; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2
272 ; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
273 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
274 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
275 ; CHECK-NEXT: v_mul_lo_u32 v3, v2, v1
276 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2
277 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
278 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
279 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
280 ; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
281 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
282 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2
283 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
284 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
285 ; CHECK-NEXT: s_setpc_b64 s[30:31]
286 %shl.y = shl i32 4096, %y
287 %r = udiv i32 %x, %shl.y
; Unsigned <2 x i32> division where each denominator element is 4096 shifted
; left by the matching element of %y.
; After the two v_lshl_b32 instructions the expected code is the full
; reciprocal-based division sequence per element, interleaved. GISEL expects
; v_rcp_iflag_f32 where CGP expects v_rcp_f32; the remaining expected
; instructions are identical.
291 define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
292 ; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom:
294 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295 ; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
296 ; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
297 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
298 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
299 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
300 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
301 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
302 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
303 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
304 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
305 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
306 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
307 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
308 ; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
309 ; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
310 ; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
311 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
312 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
313 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
314 ; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
315 ; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
316 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
317 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
318 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
319 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
320 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
321 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
322 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
323 ; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
324 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
325 ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
326 ; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
327 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
328 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
329 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
330 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
331 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
332 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
333 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
334 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
335 ; GISEL-NEXT: s_setpc_b64 s[30:31]
337 ; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom:
339 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340 ; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
341 ; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
342 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
343 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
344 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
345 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
346 ; CGP-NEXT: v_rcp_f32_e32 v4, v4
347 ; CGP-NEXT: v_rcp_f32_e32 v6, v6
348 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
349 ; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
350 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
351 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
352 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
353 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
354 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
355 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
356 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
357 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
358 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
359 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
360 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
361 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
362 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
363 ; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
364 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
365 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
366 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
367 ; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
368 ; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
369 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
370 ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
371 ; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
372 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
373 ; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
374 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
375 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
376 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
377 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
378 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
379 ; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
380 ; CGP-NEXT: s_setpc_b64 s[30:31]
381 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
382 %r = udiv <2 x i32> %x, %shl.y
; Unsigned i32 division where both operands are first masked to their low
; 24 bits (16777215 == 0xffffff).
; After the two v_and_b32 masks the expected code is still the same
; reciprocal-based sequence as the unmasked i32 case. GISEL expects
; v_rcp_iflag_f32 where CGP expects v_rcp_f32; the remaining expected
; instructions are identical.
386 define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
387 ; GISEL-LABEL: v_udiv_i32_24bit:
389 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
391 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
392 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
393 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
394 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
395 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
396 ; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
397 ; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2
398 ; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3
399 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
400 ; GISEL-NEXT: v_mul_hi_u32 v2, v0, v2
401 ; GISEL-NEXT: v_mul_lo_u32 v3, v2, v1
402 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
403 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
404 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
405 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
406 ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
407 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
408 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v2
409 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
410 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
411 ; GISEL-NEXT: s_setpc_b64 s[30:31]
413 ; CGP-LABEL: v_udiv_i32_24bit:
415 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416 ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
417 ; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
418 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
419 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
420 ; CGP-NEXT: v_rcp_f32_e32 v2, v2
421 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
422 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
423 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
424 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
425 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
426 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
427 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1
428 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
429 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
430 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
431 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
432 ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
433 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
434 ; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v2
435 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
436 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
437 ; CGP-NEXT: s_setpc_b64 s[30:31]
438 %num.mask = and i32 %num, 16777215
439 %den.mask = and i32 %den, 16777215
440 %result = udiv i32 %num.mask, %den.mask
; Unsigned <2 x i32> division where every element of both operands is first
; masked to its low 24 bits (16777215 == 0xffffff).
; After the four v_and_b32 masks the expected code is the same per-element
; reciprocal-based sequence as the unmasked <2 x i32> case. GISEL expects
; v_rcp_iflag_f32 where CGP expects v_rcp_f32; the remaining expected
; instructions are identical.
444 define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
445 ; GISEL-LABEL: v_udiv_v2i32_24bit:
447 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
449 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
450 ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
451 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
452 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
453 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
454 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
455 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
456 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
457 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
458 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
459 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
460 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
461 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
462 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
463 ; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
464 ; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
465 ; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
466 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
467 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
468 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
469 ; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
470 ; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
471 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
472 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
473 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
474 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
475 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
476 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
477 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
478 ; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
479 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
480 ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
481 ; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
482 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
483 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
484 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
485 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
486 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
487 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
488 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
489 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
490 ; GISEL-NEXT: s_setpc_b64 s[30:31]
492 ; CGP-LABEL: v_udiv_v2i32_24bit:
494 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495 ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
496 ; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
497 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
498 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
499 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
500 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
501 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
502 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
503 ; CGP-NEXT: v_rcp_f32_e32 v4, v4
504 ; CGP-NEXT: v_rcp_f32_e32 v6, v6
505 ; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
506 ; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
507 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
508 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
509 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
510 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
511 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
512 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
513 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
514 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
515 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
516 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
517 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
518 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
519 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
520 ; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
521 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
522 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
523 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
524 ; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
525 ; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
526 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
527 ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
528 ; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
529 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
530 ; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
531 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
532 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
533 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
534 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
535 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
536 ; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
537 ; CGP-NEXT: s_setpc_b64 s[30:31]
538 %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
539 %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
540 %result = udiv <2 x i32> %num.mask, %den.mask
541 ret <2 x i32> %result