1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
; udiv32_invariant_denom: unsigned 32-bit division by a kernel argument inside
; a counted loop. The loop stores (%tmp udiv %arg1) to element %tmp of %arg,
; with %tmp starting at 0 and the loop exiting once %tmp + 1 equals 1024.
; %arg1 is never redefined, so the divisor is the same on every iteration.
; In the expected output for all three targets the reciprocal setup
; (v_cvt_f32_u32 / v_rcp_iflag_f32 / v_mul_f32 / v_cvt_u32_f32) appears before
; the .LBB0_1 loop label, and the loop body itself contains only scalar
; multiply / subtract / compare / select steps plus the store.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script rather than editing by hand.
6 define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
7 ; GFX9-LABEL: udiv32_invariant_denom:
9 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
10 ; GFX9-NEXT: s_mov_b32 s7, 0
11 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
12 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
15 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
17 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
18 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
19 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
20 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1
21 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
22 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
23 ; GFX9-NEXT: s_add_i32 s8, s5, s4
24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
25 ; GFX9-NEXT: .LBB0_1: ; %bb3
26 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
27 ; GFX9-NEXT: s_not_b32 s10, s5
28 ; GFX9-NEXT: s_mul_i32 s9, s6, s5
29 ; GFX9-NEXT: s_mul_i32 s10, s6, s10
30 ; GFX9-NEXT: s_add_i32 s11, s5, 1
31 ; GFX9-NEXT: s_sub_i32 s9, s7, s9
32 ; GFX9-NEXT: s_add_i32 s10, s7, s10
33 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
34 ; GFX9-NEXT: s_cselect_b32 s11, s11, s5
35 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
36 ; GFX9-NEXT: s_add_i32 s10, s11, 1
37 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
38 ; GFX9-NEXT: s_cselect_b32 s9, s10, s11
39 ; GFX9-NEXT: s_add_u32 s10, s0, s2
40 ; GFX9-NEXT: s_addc_u32 s11, s1, s3
41 ; GFX9-NEXT: s_add_i32 s7, s7, 1
42 ; GFX9-NEXT: s_add_u32 s4, s4, s8
43 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
44 ; GFX9-NEXT: s_add_u32 s2, s2, 4
45 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
46 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
47 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
48 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
49 ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
50 ; GFX9-NEXT: ; %bb.2: ; %bb2
53 ; GFX10-LABEL: udiv32_invariant_denom:
54 ; GFX10: ; %bb.0: ; %bb
55 ; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
56 ; GFX10-NEXT: s_mov_b32 s7, 0
57 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
58 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
59 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
60 ; GFX10-NEXT: s_sub_i32 s2, 0, s6
61 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
62 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
63 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
64 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
65 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
66 ; GFX10-NEXT: s_mul_i32 s2, s2, s4
67 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
68 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
69 ; GFX10-NEXT: s_add_i32 s8, s4, s5
70 ; GFX10-NEXT: s_mov_b64 s[4:5], 0
71 ; GFX10-NEXT: .LBB0_1: ; %bb3
72 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
73 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
74 ; GFX10-NEXT: s_not_b32 s10, s5
75 ; GFX10-NEXT: s_mul_i32 s9, s6, s5
76 ; GFX10-NEXT: s_mul_i32 s10, s6, s10
77 ; GFX10-NEXT: s_sub_i32 s9, s7, s9
78 ; GFX10-NEXT: s_add_i32 s11, s5, 1
79 ; GFX10-NEXT: s_add_i32 s10, s7, s10
80 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6
81 ; GFX10-NEXT: s_cselect_b32 s11, s11, s5
82 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9
83 ; GFX10-NEXT: s_add_i32 s10, s11, 1
84 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6
85 ; GFX10-NEXT: s_cselect_b32 s9, s10, s11
86 ; GFX10-NEXT: s_add_u32 s10, s0, s2
87 ; GFX10-NEXT: s_addc_u32 s11, s1, s3
88 ; GFX10-NEXT: s_add_i32 s7, s7, 1
89 ; GFX10-NEXT: s_add_u32 s4, s4, s8
90 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
91 ; GFX10-NEXT: s_addc_u32 s5, s5, 0
92 ; GFX10-NEXT: s_add_u32 s2, s2, 4
93 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
94 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
95 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
96 ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
97 ; GFX10-NEXT: ; %bb.2: ; %bb2
98 ; GFX10-NEXT: s_endpgm
100 ; GFX11-LABEL: udiv32_invariant_denom:
101 ; GFX11: ; %bb.0: ; %bb
102 ; GFX11-NEXT: s_clause 0x1
103 ; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
104 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
105 ; GFX11-NEXT: s_mov_b32 s7, 0
106 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
108 ; GFX11-NEXT: s_sub_i32 s2, 0, s6
109 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
110 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
111 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
112 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
113 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
114 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
115 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
116 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
117 ; GFX11-NEXT: s_mul_i32 s2, s2, s4
118 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
119 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
120 ; GFX11-NEXT: s_mov_b64 s[2:3], 0
121 ; GFX11-NEXT: s_add_i32 s8, s4, s5
122 ; GFX11-NEXT: s_mov_b64 s[4:5], 0
123 ; GFX11-NEXT: .p2align 6
124 ; GFX11-NEXT: .LBB0_1: ; %bb3
125 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
126 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
127 ; GFX11-NEXT: s_not_b32 s10, s5
128 ; GFX11-NEXT: s_mul_i32 s9, s6, s5
129 ; GFX11-NEXT: s_mul_i32 s10, s6, s10
130 ; GFX11-NEXT: s_sub_i32 s9, s7, s9
131 ; GFX11-NEXT: s_add_i32 s11, s5, 1
132 ; GFX11-NEXT: s_add_i32 s10, s7, s10
133 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6
134 ; GFX11-NEXT: s_cselect_b32 s11, s11, s5
135 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9
136 ; GFX11-NEXT: s_add_i32 s10, s11, 1
137 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6
138 ; GFX11-NEXT: s_cselect_b32 s9, s10, s11
139 ; GFX11-NEXT: s_add_u32 s10, s0, s2
140 ; GFX11-NEXT: s_addc_u32 s11, s1, s3
141 ; GFX11-NEXT: s_add_i32 s7, s7, 1
142 ; GFX11-NEXT: s_add_u32 s4, s4, s8
143 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
144 ; GFX11-NEXT: s_addc_u32 s5, s5, 0
145 ; GFX11-NEXT: s_add_u32 s2, s2, 4
146 ; GFX11-NEXT: s_addc_u32 s3, s3, 0
147 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
148 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
149 ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
150 ; GFX11-NEXT: ; %bb.2: ; %bb2
151 ; GFX11-NEXT: s_nop 0
152 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
153 ; GFX11-NEXT: s_endpgm
; Loop block: %tmp is the induction variable (also the dividend and the store
; index); %arg1 is the loop-invariant divisor.
160 bb3: ; preds = %bb3, %bb
161 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
162 %tmp4 = udiv i32 %tmp, %arg1
163 %tmp5 = zext i32 %tmp to i64
164 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
165 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
166 %tmp7 = add nuw nsw i32 %tmp, 1
167 %tmp8 = icmp eq i32 %tmp7, 1024
168 br i1 %tmp8, label %bb2, label %bb3
; urem32_invariant_denom: unsigned 32-bit remainder by a kernel argument inside
; a counted loop. The loop stores (%tmp urem %arg1) to element %tmp of %arg,
; with %tmp starting at 0 and the loop exiting once %tmp + 1 equals 1024.
; %arg1 is never redefined, so the divisor is the same on every iteration.
; In the expected output for all three targets the reciprocal setup
; (v_cvt_f32_u32 / v_rcp_iflag_f32 / v_mul_f32 / v_cvt_u32_f32) appears before
; the .LBB1_1 loop label; the loop body contains only scalar multiply /
; subtract / compare / select steps plus the store.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script rather than editing by hand.
171 define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
172 ; GFX9-LABEL: urem32_invariant_denom:
173 ; GFX9: ; %bb.0: ; %bb
174 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
175 ; GFX9-NEXT: s_mov_b32 s7, 0
176 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
177 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
178 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
179 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
180 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
181 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
182 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
183 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
184 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
185 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1
186 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
187 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
188 ; GFX9-NEXT: s_add_i32 s8, s5, s4
189 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
190 ; GFX9-NEXT: .LBB1_1: ; %bb3
191 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
192 ; GFX9-NEXT: s_not_b32 s10, s5
193 ; GFX9-NEXT: s_mul_i32 s9, s6, s5
194 ; GFX9-NEXT: s_mul_i32 s10, s6, s10
195 ; GFX9-NEXT: s_sub_i32 s9, s7, s9
196 ; GFX9-NEXT: s_add_i32 s10, s7, s10
197 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
198 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
199 ; GFX9-NEXT: s_sub_i32 s10, s9, s6
200 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
201 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
202 ; GFX9-NEXT: s_add_u32 s10, s0, s2
203 ; GFX9-NEXT: s_addc_u32 s11, s1, s3
204 ; GFX9-NEXT: s_add_i32 s7, s7, 1
205 ; GFX9-NEXT: s_add_u32 s4, s4, s8
206 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
207 ; GFX9-NEXT: s_add_u32 s2, s2, 4
208 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
209 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
210 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
211 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
212 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
213 ; GFX9-NEXT: ; %bb.2: ; %bb2
214 ; GFX9-NEXT: s_endpgm
216 ; GFX10-LABEL: urem32_invariant_denom:
217 ; GFX10: ; %bb.0: ; %bb
218 ; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
219 ; GFX10-NEXT: s_mov_b32 s7, 0
220 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
221 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
223 ; GFX10-NEXT: s_sub_i32 s2, 0, s6
224 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
225 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
226 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
227 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
228 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
229 ; GFX10-NEXT: s_mul_i32 s2, s2, s4
230 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
231 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
232 ; GFX10-NEXT: s_add_i32 s8, s4, s5
233 ; GFX10-NEXT: s_mov_b64 s[4:5], 0
234 ; GFX10-NEXT: .LBB1_1: ; %bb3
235 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
236 ; GFX10-NEXT: s_not_b32 s9, s5
237 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
238 ; GFX10-NEXT: s_mul_i32 s10, s6, s5
239 ; GFX10-NEXT: s_mul_i32 s9, s6, s9
240 ; GFX10-NEXT: s_sub_i32 s10, s7, s10
241 ; GFX10-NEXT: s_add_i32 s9, s7, s9
242 ; GFX10-NEXT: s_cmp_ge_u32 s10, s6
243 ; GFX10-NEXT: s_cselect_b32 s9, s9, s10
244 ; GFX10-NEXT: s_sub_i32 s10, s9, s6
245 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6
246 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9
247 ; GFX10-NEXT: s_add_u32 s10, s0, s2
248 ; GFX10-NEXT: s_addc_u32 s11, s1, s3
249 ; GFX10-NEXT: s_add_i32 s7, s7, 1
250 ; GFX10-NEXT: s_add_u32 s4, s4, s8
251 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
252 ; GFX10-NEXT: s_addc_u32 s5, s5, 0
253 ; GFX10-NEXT: s_add_u32 s2, s2, 4
254 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
255 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
256 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
257 ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
258 ; GFX10-NEXT: ; %bb.2: ; %bb2
259 ; GFX10-NEXT: s_endpgm
261 ; GFX11-LABEL: urem32_invariant_denom:
262 ; GFX11: ; %bb.0: ; %bb
263 ; GFX11-NEXT: s_clause 0x1
264 ; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
265 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
266 ; GFX11-NEXT: s_mov_b32 s7, 0
267 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
269 ; GFX11-NEXT: s_sub_i32 s2, 0, s6
270 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
271 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
272 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
273 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
274 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
275 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
276 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
277 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
278 ; GFX11-NEXT: s_mul_i32 s2, s2, s4
279 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
281 ; GFX11-NEXT: s_mov_b64 s[2:3], 0
282 ; GFX11-NEXT: s_add_i32 s8, s4, s5
283 ; GFX11-NEXT: s_mov_b64 s[4:5], 0
284 ; GFX11-NEXT: .p2align 6
285 ; GFX11-NEXT: .LBB1_1: ; %bb3
286 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
287 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
288 ; GFX11-NEXT: s_not_b32 s9, s5
289 ; GFX11-NEXT: s_mul_i32 s10, s6, s5
290 ; GFX11-NEXT: s_mul_i32 s9, s6, s9
291 ; GFX11-NEXT: s_sub_i32 s10, s7, s10
292 ; GFX11-NEXT: s_add_i32 s9, s7, s9
293 ; GFX11-NEXT: s_cmp_ge_u32 s10, s6
294 ; GFX11-NEXT: s_cselect_b32 s9, s9, s10
295 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
296 ; GFX11-NEXT: s_sub_i32 s10, s9, s6
297 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6
298 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9
299 ; GFX11-NEXT: s_add_u32 s10, s0, s2
300 ; GFX11-NEXT: s_addc_u32 s11, s1, s3
301 ; GFX11-NEXT: s_add_i32 s7, s7, 1
302 ; GFX11-NEXT: s_add_u32 s4, s4, s8
303 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
304 ; GFX11-NEXT: s_addc_u32 s5, s5, 0
305 ; GFX11-NEXT: s_add_u32 s2, s2, 4
306 ; GFX11-NEXT: s_addc_u32 s3, s3, 0
307 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
308 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
309 ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
310 ; GFX11-NEXT: ; %bb.2: ; %bb2
311 ; GFX11-NEXT: s_nop 0
312 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
313 ; GFX11-NEXT: s_endpgm
; Loop block: %tmp is the induction variable (also the dividend and the store
; index); %arg1 is the loop-invariant divisor.
320 bb3: ; preds = %bb3, %bb
321 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
322 %tmp4 = urem i32 %tmp, %arg1
323 %tmp5 = zext i32 %tmp to i64
324 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
325 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
326 %tmp7 = add nuw nsw i32 %tmp, 1
327 %tmp8 = icmp eq i32 %tmp7, 1024
328 br i1 %tmp8, label %bb2, label %bb3
; sdiv32_invariant_denom: signed 32-bit division by a kernel argument inside a
; counted loop. The loop stores (%tmp sdiv %arg1) to element %tmp of %arg,
; with %tmp starting at 0 and the loop exiting once %tmp + 1 equals 1024.
; %arg1 is never redefined, so the divisor is the same on every iteration.
; In the expected output for all three targets, the divisor's sign mask
; (s_ashr_i32 by 31), its add/xor with that mask, and the reciprocal setup
; (v_cvt_f32_u32 / v_rcp_iflag_f32 / v_mul_f32 / v_cvt_u32_f32) all appear
; before the .LBB2_1 loop label. Inside the loop the quotient is built with
; scalar mul_hi / mul / sub / compare / select steps and then combined with
; the sign mask via s_xor_b32 followed by s_sub_i32.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script rather than editing by hand.
331 define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
332 ; GFX9-LABEL: sdiv32_invariant_denom:
333 ; GFX9: ; %bb.0: ; %bb
334 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
335 ; GFX9-NEXT: s_mov_b32 s4, 0
336 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
337 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
338 ; GFX9-NEXT: s_ashr_i32 s2, s3, 31
339 ; GFX9-NEXT: s_add_i32 s3, s3, s2
340 ; GFX9-NEXT: s_xor_b32 s3, s3, s2
341 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
342 ; GFX9-NEXT: s_sub_i32 s5, 0, s3
343 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
344 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
345 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
346 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
347 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
348 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
349 ; GFX9-NEXT: s_add_i32 s5, s6, s5
350 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
351 ; GFX9-NEXT: .LBB2_1: ; %bb3
352 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
353 ; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5
354 ; GFX9-NEXT: s_mul_i32 s7, s6, s3
355 ; GFX9-NEXT: s_sub_i32 s7, s4, s7
356 ; GFX9-NEXT: s_add_i32 s8, s6, 1
357 ; GFX9-NEXT: s_sub_i32 s9, s7, s3
358 ; GFX9-NEXT: s_cmp_ge_u32 s7, s3
359 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6
360 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7
361 ; GFX9-NEXT: s_add_i32 s8, s6, 1
362 ; GFX9-NEXT: s_cmp_ge_u32 s7, s3
363 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6
364 ; GFX9-NEXT: s_xor_b32 s6, s6, s2
365 ; GFX9-NEXT: s_sub_i32 s6, s6, s2
366 ; GFX9-NEXT: s_add_i32 s4, s4, 1
367 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
368 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
369 ; GFX9-NEXT: s_add_u32 s0, s0, 4
370 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
371 ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
372 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1
373 ; GFX9-NEXT: ; %bb.2: ; %bb2
374 ; GFX9-NEXT: s_endpgm
376 ; GFX10-LABEL: sdiv32_invariant_denom:
377 ; GFX10: ; %bb.0: ; %bb
378 ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c
379 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
380 ; GFX10-NEXT: s_ashr_i32 s2, s3, 31
381 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
382 ; GFX10-NEXT: s_add_i32 s3, s3, s2
383 ; GFX10-NEXT: s_xor_b32 s3, s3, s2
384 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3
385 ; GFX10-NEXT: s_sub_i32 s4, 0, s3
386 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
387 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
388 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
389 ; GFX10-NEXT: v_readfirstlane_b32 s5, v0
390 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
391 ; GFX10-NEXT: s_mul_i32 s4, s4, s5
392 ; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4
393 ; GFX10-NEXT: s_mov_b32 s4, 0
394 ; GFX10-NEXT: s_add_i32 s5, s5, s6
395 ; GFX10-NEXT: .LBB2_1: ; %bb3
396 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
397 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5
398 ; GFX10-NEXT: s_mul_i32 s7, s6, s3
399 ; GFX10-NEXT: s_add_i32 s8, s6, 1
400 ; GFX10-NEXT: s_sub_i32 s7, s4, s7
401 ; GFX10-NEXT: s_sub_i32 s9, s7, s3
402 ; GFX10-NEXT: s_cmp_ge_u32 s7, s3
403 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6
404 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7
405 ; GFX10-NEXT: s_add_i32 s8, s6, 1
406 ; GFX10-NEXT: s_cmp_ge_u32 s7, s3
407 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6
408 ; GFX10-NEXT: s_add_i32 s4, s4, 1
409 ; GFX10-NEXT: s_xor_b32 s6, s6, s2
410 ; GFX10-NEXT: s_sub_i32 s6, s6, s2
411 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
412 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
414 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
415 ; GFX10-NEXT: s_add_u32 s0, s0, 4
416 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
417 ; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
418 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1
419 ; GFX10-NEXT: ; %bb.2: ; %bb2
420 ; GFX10-NEXT: s_endpgm
422 ; GFX11-LABEL: sdiv32_invariant_denom:
423 ; GFX11: ; %bb.0: ; %bb
424 ; GFX11-NEXT: s_clause 0x1
425 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c
426 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
427 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
428 ; GFX11-NEXT: s_ashr_i32 s2, s3, 31
429 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
430 ; GFX11-NEXT: s_add_i32 s3, s3, s2
431 ; GFX11-NEXT: s_xor_b32 s3, s3, s2
432 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
433 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3
434 ; GFX11-NEXT: s_sub_i32 s4, 0, s3
435 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
436 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
437 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
438 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
439 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
440 ; GFX11-NEXT: v_readfirstlane_b32 s5, v0
441 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
442 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
443 ; GFX11-NEXT: s_mul_i32 s4, s4, s5
444 ; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4
445 ; GFX11-NEXT: s_mov_b32 s4, 0
446 ; GFX11-NEXT: s_add_i32 s5, s5, s6
447 ; GFX11-NEXT: .p2align 6
448 ; GFX11-NEXT: .LBB2_1: ; %bb3
449 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
450 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
451 ; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5
452 ; GFX11-NEXT: s_mul_i32 s7, s6, s3
453 ; GFX11-NEXT: s_add_i32 s8, s6, 1
454 ; GFX11-NEXT: s_sub_i32 s7, s4, s7
455 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
456 ; GFX11-NEXT: s_sub_i32 s9, s7, s3
457 ; GFX11-NEXT: s_cmp_ge_u32 s7, s3
458 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6
459 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7
460 ; GFX11-NEXT: s_add_i32 s8, s6, 1
461 ; GFX11-NEXT: s_cmp_ge_u32 s7, s3
462 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6
463 ; GFX11-NEXT: s_add_i32 s4, s4, 1
464 ; GFX11-NEXT: s_xor_b32 s6, s6, s2
465 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
466 ; GFX11-NEXT: s_sub_i32 s6, s6, s2
467 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
468 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
469 ; GFX11-NEXT: s_add_u32 s0, s0, 4
470 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
471 ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
472 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1
473 ; GFX11-NEXT: ; %bb.2: ; %bb2
474 ; GFX11-NEXT: s_nop 0
475 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
476 ; GFX11-NEXT: s_endpgm
; Loop block: %tmp is the induction variable (also the dividend and the store
; index); %arg1 is the loop-invariant divisor.
483 bb3: ; preds = %bb3, %bb
484 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
485 %tmp4 = sdiv i32 %tmp, %arg1
486 %tmp5 = zext i32 %tmp to i64
487 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
488 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
489 %tmp7 = add nuw nsw i32 %tmp, 1
490 %tmp8 = icmp eq i32 %tmp7, 1024
491 br i1 %tmp8, label %bb2, label %bb3
; srem32_invariant_denom: signed 32-bit remainder by a kernel argument inside a
; counted loop. The loop stores (%tmp srem %arg1) to element %tmp of %arg,
; with %tmp starting at 0 and the loop exiting once %tmp + 1 equals 1024.
; %arg1 is never redefined, so the divisor is the same on every iteration.
; In the expected output for all three targets, the divisor's sign mask
; (s_ashr_i32 by 31), its add/xor with that mask, and the reciprocal setup
; (v_cvt_f32_u32 / v_rcp_iflag_f32 / v_mul_f32 / v_cvt_u32_f32) all appear
; before the .LBB3_1 loop label. The loop body is a scalar mul_hi / mul / sub
; followed by two compare-and-select steps; no xor with the sign mask is
; expected inside the loop.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script rather than editing by hand.
494 define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
495 ; GFX9-LABEL: srem32_invariant_denom:
496 ; GFX9: ; %bb.0: ; %bb
497 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
498 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
499 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
500 ; GFX9-NEXT: s_add_i32 s2, s2, s3
501 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
502 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
503 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
504 ; GFX9-NEXT: s_sub_i32 s4, 0, s2
505 ; GFX9-NEXT: s_mov_b32 s3, 0
506 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
507 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
508 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
509 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
510 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
511 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
512 ; GFX9-NEXT: s_add_i32 s4, s5, s4
513 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
514 ; GFX9-NEXT: .LBB3_1: ; %bb3
515 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
516 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4
517 ; GFX9-NEXT: s_mul_i32 s5, s5, s2
518 ; GFX9-NEXT: s_sub_i32 s5, s3, s5
519 ; GFX9-NEXT: s_sub_i32 s6, s5, s2
520 ; GFX9-NEXT: s_cmp_ge_u32 s5, s2
521 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
522 ; GFX9-NEXT: s_sub_i32 s6, s5, s2
523 ; GFX9-NEXT: s_cmp_ge_u32 s5, s2
524 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
525 ; GFX9-NEXT: s_add_i32 s3, s3, 1
526 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
527 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
528 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
529 ; GFX9-NEXT: s_add_u32 s0, s0, 4
530 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
531 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
532 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1
533 ; GFX9-NEXT: ; %bb.2: ; %bb2
534 ; GFX9-NEXT: s_endpgm
536 ; GFX10-LABEL: srem32_invariant_denom:
537 ; GFX10: ; %bb.0: ; %bb
538 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
539 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
540 ; GFX10-NEXT: s_ashr_i32 s3, s2, 31
541 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
542 ; GFX10-NEXT: s_add_i32 s2, s2, s3
543 ; GFX10-NEXT: s_xor_b32 s2, s2, s3
544 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
545 ; GFX10-NEXT: s_sub_i32 s3, 0, s2
546 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
547 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
548 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
549 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
550 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
551 ; GFX10-NEXT: s_mul_i32 s3, s3, s4
552 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3
553 ; GFX10-NEXT: s_mov_b32 s3, 0
554 ; GFX10-NEXT: s_add_i32 s4, s4, s5
555 ; GFX10-NEXT: .LBB3_1: ; %bb3
556 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
557 ; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4
558 ; GFX10-NEXT: s_mul_i32 s5, s5, s2
559 ; GFX10-NEXT: s_sub_i32 s5, s3, s5
560 ; GFX10-NEXT: s_sub_i32 s6, s5, s2
561 ; GFX10-NEXT: s_cmp_ge_u32 s5, s2
562 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5
563 ; GFX10-NEXT: s_sub_i32 s6, s5, s2
564 ; GFX10-NEXT: s_cmp_ge_u32 s5, s2
565 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5
566 ; GFX10-NEXT: s_add_i32 s3, s3, 1
567 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
568 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
570 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
571 ; GFX10-NEXT: s_add_u32 s0, s0, 4
572 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
573 ; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400
574 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1
575 ; GFX10-NEXT: ; %bb.2: ; %bb2
576 ; GFX10-NEXT: s_endpgm
578 ; GFX11-LABEL: srem32_invariant_denom:
579 ; GFX11: ; %bb.0: ; %bb
580 ; GFX11-NEXT: s_clause 0x1
581 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
582 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
583 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
584 ; GFX11-NEXT: s_ashr_i32 s3, s2, 31
585 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
586 ; GFX11-NEXT: s_add_i32 s2, s2, s3
587 ; GFX11-NEXT: s_xor_b32 s2, s2, s3
588 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
589 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
590 ; GFX11-NEXT: s_sub_i32 s3, 0, s2
591 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
592 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
593 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
594 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
595 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
596 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
597 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
598 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
599 ; GFX11-NEXT: s_mul_i32 s3, s3, s4
600 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
601 ; GFX11-NEXT: s_mov_b32 s3, 0
602 ; GFX11-NEXT: s_add_i32 s4, s4, s5
603 ; GFX11-NEXT: .p2align 6
604 ; GFX11-NEXT: .LBB3_1: ; %bb3
605 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
606 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
607 ; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4
608 ; GFX11-NEXT: s_mul_i32 s5, s5, s2
609 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
610 ; GFX11-NEXT: s_sub_i32 s5, s3, s5
611 ; GFX11-NEXT: s_sub_i32 s6, s5, s2
612 ; GFX11-NEXT: s_cmp_ge_u32 s5, s2
613 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5
614 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
615 ; GFX11-NEXT: s_sub_i32 s6, s5, s2
616 ; GFX11-NEXT: s_cmp_ge_u32 s5, s2
617 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5
618 ; GFX11-NEXT: s_add_i32 s3, s3, 1
619 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
620 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
621 ; GFX11-NEXT: s_add_u32 s0, s0, 4
622 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
623 ; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
624 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1
625 ; GFX11-NEXT: ; %bb.2: ; %bb2
626 ; GFX11-NEXT: s_nop 0
627 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
628 ; GFX11-NEXT: s_endpgm
; Loop block: %tmp is the induction variable (also the dividend and the store
; index); %arg1 is the loop-invariant divisor.
635 bb3: ; preds = %bb3, %bb
636 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
637 %tmp4 = srem i32 %tmp, %arg1
638 %tmp5 = zext i32 %tmp to i64
639 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
640 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
641 %tmp7 = add nuw nsw i32 %tmp, 1
642 %tmp8 = icmp eq i32 %tmp7, 1024
643 br i1 %tmp8, label %bb2, label %bb3
; udiv16_invariant_denom: unsigned 16-bit division by a kernel argument inside
; a counted loop. The loop stores (%tmp udiv %arg1) as an i16 to element %tmp
; of %arg, with %tmp starting at 0 and the loop exiting once %tmp + 1 equals
; 1024. %arg1 is never redefined, so the divisor is the same on every
; iteration.
; In the expected output for all three targets the divisor is masked with
; 0xffff, converted to float and passed through v_rcp_iflag_f32 before the
; .LBB4_1 loop label. The loop body converts the dividend to float, multiplies
; by that reciprocal, truncates, and corrects the result with a compare plus
; add-with-carry. The first two targets expect v_mad_f32 for the correction
; term; the third expects v_fma_f32.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script rather than editing by hand.
646 define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
647 ; GFX9-LABEL: udiv16_invariant_denom:
648 ; GFX9: ; %bb.0: ; %bb
649 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
650 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
651 ; GFX9-NEXT: s_movk_i32 s4, 0x400
652 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
654 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
655 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
656 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
657 ; GFX9-NEXT: .LBB4_1: ; %bb3
658 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
659 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2
660 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3
661 ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2
662 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
663 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 1, v3
664 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1
665 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
666 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5
667 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4
668 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0
669 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1]
670 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
671 ; GFX9-NEXT: global_store_short v3, v4, s[2:3]
672 ; GFX9-NEXT: s_cbranch_vccz .LBB4_1
673 ; GFX9-NEXT: ; %bb.2: ; %bb2
674 ; GFX9-NEXT: s_endpgm
676 ; GFX10-LABEL: udiv16_invariant_denom:
677 ; GFX10: ; %bb.0: ; %bb
678 ; GFX10-NEXT: s_clause 0x1
679 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
680 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
681 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
682 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
683 ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff
684 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0
685 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
686 ; GFX10-NEXT: .LBB4_1: ; %bb3
687 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
688 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2
689 ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1
690 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3
691 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
692 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 1, v3
693 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1
694 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
695 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
696 ; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4
697 ; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5
698 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0
699 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
700 ; GFX10-NEXT: global_store_short v3, v4, s[2:3]
701 ; GFX10-NEXT: s_cbranch_vccz .LBB4_1
702 ; GFX10-NEXT: ; %bb.2: ; %bb2
703 ; GFX10-NEXT: s_endpgm
705 ; GFX11-LABEL: udiv16_invariant_denom:
706 ; GFX11: ; %bb.0: ; %bb
707 ; GFX11-NEXT: s_clause 0x1
708 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
709 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
710 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
711 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
712 ; GFX11-NEXT: s_and_b32 s0, s4, 0xffff
713 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
714 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
715 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
716 ; GFX11-NEXT: .p2align 6
717 ; GFX11-NEXT: .LBB4_1: ; %bb3
718 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
719 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
720 ; GFX11-NEXT: v_add_nc_u16 v2, v2, 1
721 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
722 ; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3
723 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
724 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 1, v3
725 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
726 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1
727 ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
728 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
729 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
730 ; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4
731 ; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5
732 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
733 ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0
734 ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
735 ; GFX11-NEXT: global_store_b16 v3, v4, s[2:3]
736 ; GFX11-NEXT: s_cbranch_vccz .LBB4_1
737 ; GFX11-NEXT: ; %bb.2: ; %bb2
738 ; GFX11-NEXT: s_nop 0
739 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
740 ; GFX11-NEXT: s_endpgm
; Loop block: %tmp is the i16 induction variable (also the dividend and the
; store index); %arg1 is the loop-invariant i16 divisor.
747 bb3: ; preds = %bb3, %bb
748 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
749 %tmp4 = udiv i16 %tmp, %arg1
750 %tmp5 = zext i16 %tmp to i64
751 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
752 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
753 %tmp7 = add nuw nsw i16 %tmp, 1
754 %tmp8 = icmp eq i16 %tmp7, 1024
755 br i1 %tmp8, label %bb2, label %bb3
; Test: i16 unsigned remainder by a loop-invariant denominator.
; The %bb3 loop counts %tmp up from 0 and exits once %tmp7 (%tmp + 1) equals
; 1024; each iteration stores `urem i16 %tmp, %arg1` to element %tmp of %arg.
; %arg1 is a kernel argument and is never redefined inside the loop. In the
; expected output for every target, the denominator's conversion to float and
; its v_rcp_iflag_f32 reciprocal are emitted in %bb.0, ahead of the .LBB5_1
; loop label, and the loop body multiplies by that reciprocal.
; The assertions below are autogenerated (see the NOTE on the first line of
; the file); regenerate them with the update script instead of hand-editing.
758 define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
759 ; GFX9-LABEL: urem16_invariant_denom:
760 ; GFX9: ; %bb.0: ; %bb
761 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
762 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
763 ; GFX9-NEXT: s_movk_i32 s5, 0x400
764 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
765 ; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
766 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
767 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
768 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
769 ; GFX9-NEXT: .LBB5_1: ; %bb3
770 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
771 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2
772 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3
773 ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2
774 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
775 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 1, v3
776 ; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1
777 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
778 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
779 ; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4
780 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0
781 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
782 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
783 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4
784 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
785 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
786 ; GFX9-NEXT: global_store_short v5, v3, s[2:3]
787 ; GFX9-NEXT: s_cbranch_vccz .LBB5_1
788 ; GFX9-NEXT: ; %bb.2: ; %bb2
789 ; GFX9-NEXT: s_endpgm
791 ; GFX10-LABEL: urem16_invariant_denom:
792 ; GFX10: ; %bb.0: ; %bb
793 ; GFX10-NEXT: s_clause 0x1
794 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
795 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
796 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
797 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
798 ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff
799 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0
800 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
801 ; GFX10-NEXT: .LBB5_1: ; %bb3
802 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
803 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2
804 ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1
805 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3
806 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1
807 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
808 ; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4
809 ; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5
810 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
811 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
812 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
813 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 1, v3
814 ; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0
815 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v4
816 ; GFX10-NEXT: global_store_short v5, v3, s[2:3]
817 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1
818 ; GFX10-NEXT: ; %bb.2: ; %bb2
819 ; GFX10-NEXT: s_endpgm
821 ; GFX11-LABEL: urem16_invariant_denom:
822 ; GFX11: ; %bb.0: ; %bb
823 ; GFX11-NEXT: s_clause 0x1
824 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
825 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
826 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
827 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
828 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
829 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
830 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
831 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
832 ; GFX11-NEXT: .p2align 6
833 ; GFX11-NEXT: .LBB5_1: ; %bb3
834 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
835 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
836 ; GFX11-NEXT: v_add_nc_u16 v2, v2, 1
837 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
838 ; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3
839 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
840 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1
841 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
842 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
843 ; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4
844 ; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5
845 ; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
846 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
847 ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
848 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
849 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3
850 ; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2
851 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
852 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4
853 ; GFX11-NEXT: global_store_b16 v5, v3, s[0:1]
854 ; GFX11-NEXT: s_cbranch_vccz .LBB5_1
855 ; GFX11-NEXT: ; %bb.2: ; %bb2
856 ; GFX11-NEXT: s_nop 0
857 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
858 ; GFX11-NEXT: s_endpgm
865 bb3: ; preds = %bb3, %bb
; Loop counter: 0 on entry from %bb, afterwards the incremented value %tmp7.
866 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test; the divisor %arg1 does not change across iterations.
867 %tmp4 = urem i16 %tmp, %arg1
; Address the i16 element of %arg at index %tmp and store the remainder there.
868 %tmp5 = zext i16 %tmp to i64
869 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
870 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
871 %tmp7 = add nuw nsw i16 %tmp, 1
; Leave the loop once the incremented counter reaches 1024.
872 %tmp8 = icmp eq i16 %tmp7, 1024
873 br i1 %tmp8, label %bb2, label %bb3
; Test: i16 signed division by a loop-invariant denominator.
; The %bb3 loop counts %tmp up from 0 and exits once %tmp7 (%tmp + 1) equals
; 1024; each iteration stores `sdiv i16 %tmp, %arg1` to element %tmp of %arg.
; %arg1 is a kernel argument and is never redefined inside the loop. In the
; expected output for every target, the sign extension of the denominator
; (s_sext_i32_i16), its conversion to float and the v_rcp_iflag_f32 reciprocal
; are emitted in %bb.0, ahead of the .LBB6_1 loop label, and the loop body
; multiplies by that reciprocal.
; The assertions below are autogenerated (see the NOTE on the first line of
; the file); regenerate them with the update script instead of hand-editing.
876 define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
877 ; GFX9-LABEL: sdiv16_invariant_denom:
878 ; GFX9: ; %bb.0: ; %bb
879 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
880 ; GFX9-NEXT: s_mov_b32 s4, 0
881 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
882 ; GFX9-NEXT: s_movk_i32 s3, 0x400
883 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
884 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
885 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
886 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
887 ; GFX9-NEXT: .LBB6_1: ; %bb3
888 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
889 ; GFX9-NEXT: s_sext_i32_i16 s5, s4
890 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
891 ; GFX9-NEXT: s_xor_b32 s6, s5, s2
892 ; GFX9-NEXT: s_ashr_i32 s5, s6, 30
893 ; GFX9-NEXT: s_or_b32 s5, s5, 1
894 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
895 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
896 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
897 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
898 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
899 ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
900 ; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
901 ; GFX9-NEXT: s_cselect_b32 s5, s5, 0
902 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s4
903 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
904 ; GFX9-NEXT: v_readfirstlane_b32 s4, v2
905 ; GFX9-NEXT: v_add_u32_e32 v2, s5, v4
906 ; GFX9-NEXT: s_lshl_b32 s5, s6, 1
907 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
908 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
909 ; GFX9-NEXT: s_cbranch_vccz .LBB6_1
910 ; GFX9-NEXT: ; %bb.2: ; %bb2
911 ; GFX9-NEXT: s_endpgm
913 ; GFX10-LABEL: sdiv16_invariant_denom:
914 ; GFX10: ; %bb.0: ; %bb
915 ; GFX10-NEXT: s_clause 0x1
916 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
917 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
918 ; GFX10-NEXT: s_mov_b32 s1, 0
919 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
920 ; GFX10-NEXT: s_sext_i32_i16 s0, s4
921 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0
922 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
923 ; GFX10-NEXT: .LBB6_1: ; %bb3
924 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
925 ; GFX10-NEXT: s_sext_i32_i16 s4, s1
926 ; GFX10-NEXT: v_add_nc_u16 v2, s1, 1
927 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4
928 ; GFX10-NEXT: s_xor_b32 s5, s4, s0
929 ; GFX10-NEXT: s_ashr_i32 s4, s5, 30
930 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
931 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1
932 ; GFX10-NEXT: s_or_b32 s4, s4, 1
933 ; GFX10-NEXT: v_trunc_f32_e32 v4, v4
934 ; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3
935 ; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v4
936 ; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0|
937 ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
938 ; GFX10-NEXT: s_cselect_b32 s4, s4, 0
939 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s1
940 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
941 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
942 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4
943 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
944 ; GFX10-NEXT: global_store_short v3, v2, s[2:3]
945 ; GFX10-NEXT: s_cbranch_vccz .LBB6_1
946 ; GFX10-NEXT: ; %bb.2: ; %bb2
947 ; GFX10-NEXT: s_endpgm
949 ; GFX11-LABEL: sdiv16_invariant_denom:
950 ; GFX11: ; %bb.0: ; %bb
951 ; GFX11-NEXT: s_clause 0x1
952 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
953 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
954 ; GFX11-NEXT: s_mov_b32 s3, 0
955 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
956 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
957 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
958 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
959 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
960 ; GFX11-NEXT: .p2align 6
961 ; GFX11-NEXT: .LBB6_1: ; %bb3
962 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
963 ; GFX11-NEXT: s_sext_i32_i16 s4, s3
964 ; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
965 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
966 ; GFX11-NEXT: s_xor_b32 s5, s4, s2
967 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
968 ; GFX11-NEXT: s_ashr_i32 s4, s5, 30
969 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
970 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
971 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1
972 ; GFX11-NEXT: s_or_b32 s4, s4, 1
973 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
974 ; GFX11-NEXT: v_trunc_f32_e32 v4, v4
975 ; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3
976 ; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v4
977 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
978 ; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0|
979 ; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
980 ; GFX11-NEXT: s_cselect_b32 s4, s4, 0
981 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
982 ; GFX11-NEXT: v_readfirstlane_b32 s3, v2
983 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
984 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
985 ; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4
986 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
987 ; GFX11-NEXT: s_cbranch_vccz .LBB6_1
988 ; GFX11-NEXT: ; %bb.2: ; %bb2
989 ; GFX11-NEXT: s_nop 0
990 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
991 ; GFX11-NEXT: s_endpgm
998 bb3: ; preds = %bb3, %bb
; Loop counter: 0 on entry from %bb, afterwards the incremented value %tmp7.
999 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test; the divisor %arg1 does not change across iterations.
1000 %tmp4 = sdiv i16 %tmp, %arg1
; Address the i16 element of %arg at index %tmp and store the quotient there.
1001 %tmp5 = zext i16 %tmp to i64
1002 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
1003 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
1004 %tmp7 = add nuw nsw i16 %tmp, 1
; Leave the loop once the incremented counter reaches 1024.
1005 %tmp8 = icmp eq i16 %tmp7, 1024
1006 br i1 %tmp8, label %bb2, label %bb3
; Test: i16 signed remainder by a loop-invariant denominator.
; The %bb3 loop counts %tmp up from 0 and exits once %tmp7 (%tmp + 1) equals
; 1024; each iteration stores `srem i16 %tmp, %arg1` to element %tmp of %arg.
; %arg1 is a kernel argument and is never redefined inside the loop. In the
; expected output for every target, the sign extension of the denominator
; (s_sext_i32_i16), its conversion to float and the v_rcp_iflag_f32 reciprocal
; are emitted in %bb.0, ahead of the .LBB7_1 loop label. Inside the loop the
; computed quotient is multiplied by the sign-extended denominator
; (v_mul_lo_u32) and the product is subtracted from the sign-extended counter.
; The assertions below are autogenerated (see the NOTE on the first line of
; the file); regenerate them with the update script instead of hand-editing.
1009 define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
1010 ; GFX9-LABEL: srem16_invariant_denom:
1011 ; GFX9: ; %bb.0: ; %bb
1012 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
1013 ; GFX9-NEXT: s_mov_b32 s4, 0
1014 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1015 ; GFX9-NEXT: s_movk_i32 s3, 0x400
1016 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1017 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
1018 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
1019 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
1020 ; GFX9-NEXT: .LBB7_1: ; %bb3
1021 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1022 ; GFX9-NEXT: s_sext_i32_i16 s5, s4
1023 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
1024 ; GFX9-NEXT: s_xor_b32 s6, s5, s2
1025 ; GFX9-NEXT: s_ashr_i32 s6, s6, 30
1026 ; GFX9-NEXT: s_or_b32 s8, s6, 1
1027 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
1028 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
1029 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
1030 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
1031 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
1032 ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
1033 ; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
1034 ; GFX9-NEXT: s_cselect_b32 s6, s8, 0
1035 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
1036 ; GFX9-NEXT: s_and_b32 s7, 0xffff, s4
1037 ; GFX9-NEXT: v_readfirstlane_b32 s4, v2
1038 ; GFX9-NEXT: v_add_u32_e32 v2, s6, v4
1039 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
1040 ; GFX9-NEXT: s_lshl_b32 s6, s7, 1
1041 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
1042 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
1043 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
1044 ; GFX9-NEXT: s_cbranch_vccz .LBB7_1
1045 ; GFX9-NEXT: ; %bb.2: ; %bb2
1046 ; GFX9-NEXT: s_endpgm
1048 ; GFX10-LABEL: srem16_invariant_denom:
1049 ; GFX10: ; %bb.0: ; %bb
1050 ; GFX10-NEXT: s_clause 0x1
1051 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
1052 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1053 ; GFX10-NEXT: s_mov_b32 s1, 0
1054 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1055 ; GFX10-NEXT: s_sext_i32_i16 s0, s4
1056 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0
1057 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
1058 ; GFX10-NEXT: .LBB7_1: ; %bb3
1059 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1060 ; GFX10-NEXT: s_sext_i32_i16 s4, s1
1061 ; GFX10-NEXT: v_add_nc_u16 v2, s1, 1
1062 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4
1063 ; GFX10-NEXT: s_xor_b32 s5, s4, s0
1064 ; GFX10-NEXT: s_ashr_i32 s5, s5, 30
1065 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
1066 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1
1067 ; GFX10-NEXT: s_or_b32 s5, s5, 1
1068 ; GFX10-NEXT: v_trunc_f32_e32 v4, v4
1069 ; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3
1070 ; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0|
1071 ; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v4
1072 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
1073 ; GFX10-NEXT: s_cselect_b32 s5, s5, 0
1074 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
1075 ; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3
1076 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s1
1077 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
1078 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
1079 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
1080 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, s0
1081 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3
1082 ; GFX10-NEXT: global_store_short v2, v3, s[2:3]
1083 ; GFX10-NEXT: s_cbranch_vccz .LBB7_1
1084 ; GFX10-NEXT: ; %bb.2: ; %bb2
1085 ; GFX10-NEXT: s_endpgm
1087 ; GFX11-LABEL: srem16_invariant_denom:
1088 ; GFX11: ; %bb.0: ; %bb
1089 ; GFX11-NEXT: s_clause 0x1
1090 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
1091 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1092 ; GFX11-NEXT: s_mov_b32 s3, 0
1093 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1094 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
1095 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1096 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
1097 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
1098 ; GFX11-NEXT: .p2align 6
1099 ; GFX11-NEXT: .LBB7_1: ; %bb3
1100 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1101 ; GFX11-NEXT: s_sext_i32_i16 s4, s3
1102 ; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
1103 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
1104 ; GFX11-NEXT: s_xor_b32 s5, s4, s2
1105 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1106 ; GFX11-NEXT: s_ashr_i32 s5, s5, 30
1107 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
1108 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1109 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1
1110 ; GFX11-NEXT: s_or_b32 s5, s5, 1
1111 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1112 ; GFX11-NEXT: v_trunc_f32_e32 v4, v4
1113 ; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3
1114 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1115 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0|
1116 ; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v4
1117 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
1118 ; GFX11-NEXT: s_cselect_b32 s5, s5, 0
1119 ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
1120 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1121 ; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3
1122 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
1123 ; GFX11-NEXT: v_readfirstlane_b32 s3, v2
1124 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
1125 ; GFX11-NEXT: v_mov_b32_e32 v2, s5
1126 ; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2
1127 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1128 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3
1129 ; GFX11-NEXT: global_store_b16 v2, v3, s[0:1]
1130 ; GFX11-NEXT: s_cbranch_vccz .LBB7_1
1131 ; GFX11-NEXT: ; %bb.2: ; %bb2
1132 ; GFX11-NEXT: s_nop 0
1133 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1134 ; GFX11-NEXT: s_endpgm
1141 bb3: ; preds = %bb3, %bb
; Loop counter: 0 on entry from %bb, afterwards the incremented value %tmp7.
1142 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test; the divisor %arg1 does not change across iterations.
1143 %tmp4 = srem i16 %tmp, %arg1
; Address the i16 element of %arg at index %tmp and store the remainder there.
1144 %tmp5 = zext i16 %tmp to i64
1145 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
1146 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
1147 %tmp7 = add nuw nsw i16 %tmp, 1
; Leave the loop once the incremented counter reaches 1024.
1148 %tmp8 = icmp eq i16 %tmp7, 1024
1149 br i1 %tmp8, label %bb2, label %bb3