1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
; Kernel under test: for %tmp = 0 .. 1023, store (udiv i32 %tmp, %arg1) to
; %arg[%tmp]. %arg1 is a kernel argument, so the divisor is identical on every
; iteration of %bb3. In the expected output for all three run lines the
; v_rcp_iflag_f32 reciprocal sequence appears ahead of the .LBB0_1 loop label.
6 define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
7 ; GFX9-LABEL: udiv32_invariant_denom:
8 ; GFX9: ; %bb.0: ; %bb
9 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c
10 ; GFX9-NEXT: s_mov_b32 s8, 0
11 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
12 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
15 ; GFX9-NEXT: s_sub_i32 s4, 0, s5
16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
17 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
18 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
19 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
20 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1
21 ; GFX9-NEXT: s_mul_i32 s4, s4, s6
22 ; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
23 ; GFX9-NEXT: s_add_i32 s4, s6, s4
24 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
25 ; GFX9-NEXT: .LBB0_1: ; %bb3
26 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
27 ; GFX9-NEXT: s_not_b32 s10, s7
28 ; GFX9-NEXT: s_mul_i32 s9, s5, s7
29 ; GFX9-NEXT: s_mul_i32 s10, s5, s10
30 ; GFX9-NEXT: s_add_i32 s11, s7, 1
31 ; GFX9-NEXT: s_sub_i32 s9, s8, s9
32 ; GFX9-NEXT: s_add_i32 s10, s8, s10
33 ; GFX9-NEXT: s_cmp_ge_u32 s9, s5
34 ; GFX9-NEXT: s_cselect_b32 s11, s11, s7
35 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
36 ; GFX9-NEXT: s_add_i32 s10, s11, 1
37 ; GFX9-NEXT: s_cmp_ge_u32 s9, s5
38 ; GFX9-NEXT: s_cselect_b32 s9, s10, s11
39 ; GFX9-NEXT: s_add_u32 s10, s0, s2
40 ; GFX9-NEXT: s_addc_u32 s11, s1, s3
41 ; GFX9-NEXT: s_add_i32 s8, s8, 1
42 ; GFX9-NEXT: s_add_u32 s6, s6, s4
43 ; GFX9-NEXT: s_addc_u32 s7, s7, 0
44 ; GFX9-NEXT: s_add_u32 s2, s2, 4
45 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
46 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
47 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
48 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
49 ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
50 ; GFX9-NEXT: ; %bb.2: ; %bb2
51 ; GFX9-NEXT: s_endpgm
53 ; GFX10-LABEL: udiv32_invariant_denom:
54 ; GFX10: ; %bb.0: ; %bb
55 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x2c
56 ; GFX10-NEXT: s_mov_b32 s8, 0
57 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
58 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
59 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5
60 ; GFX10-NEXT: s_sub_i32 s2, 0, s5
61 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
62 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
63 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
64 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
65 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
66 ; GFX10-NEXT: s_mul_i32 s2, s2, s4
67 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s2
68 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
69 ; GFX10-NEXT: s_add_i32 s4, s4, s6
70 ; GFX10-NEXT: s_mov_b64 s[6:7], 0
71 ; GFX10-NEXT: .LBB0_1: ; %bb3
72 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
73 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
74 ; GFX10-NEXT: s_not_b32 s10, s7
75 ; GFX10-NEXT: s_mul_i32 s9, s5, s7
76 ; GFX10-NEXT: s_mul_i32 s10, s5, s10
77 ; GFX10-NEXT: s_sub_i32 s9, s8, s9
78 ; GFX10-NEXT: s_add_i32 s11, s7, 1
79 ; GFX10-NEXT: s_add_i32 s10, s8, s10
80 ; GFX10-NEXT: s_cmp_ge_u32 s9, s5
81 ; GFX10-NEXT: s_cselect_b32 s11, s11, s7
82 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9
83 ; GFX10-NEXT: s_add_i32 s10, s11, 1
84 ; GFX10-NEXT: s_cmp_ge_u32 s9, s5
85 ; GFX10-NEXT: s_cselect_b32 s9, s10, s11
86 ; GFX10-NEXT: s_add_u32 s10, s0, s2
87 ; GFX10-NEXT: s_addc_u32 s11, s1, s3
88 ; GFX10-NEXT: s_add_i32 s8, s8, 1
89 ; GFX10-NEXT: s_add_u32 s6, s6, s4
90 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
91 ; GFX10-NEXT: s_addc_u32 s7, s7, 0
92 ; GFX10-NEXT: s_add_u32 s2, s2, 4
93 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
94 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
95 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
96 ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
97 ; GFX10-NEXT: ; %bb.2: ; %bb2
98 ; GFX10-NEXT: s_endpgm
100 ; GFX11-LABEL: udiv32_invariant_denom:
101 ; GFX11: ; %bb.0: ; %bb
102 ; GFX11-NEXT: s_clause 0x1
103 ; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x2c
104 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
105 ; GFX11-NEXT: s_mov_b32 s8, 0
106 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s5
108 ; GFX11-NEXT: s_sub_i32 s2, 0, s5
109 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
110 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
111 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
112 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
113 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
114 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
115 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
116 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
117 ; GFX11-NEXT: s_mul_i32 s2, s2, s4
118 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
119 ; GFX11-NEXT: s_mul_hi_u32 s6, s4, s2
120 ; GFX11-NEXT: s_mov_b64 s[2:3], 0
121 ; GFX11-NEXT: s_add_i32 s4, s4, s6
122 ; GFX11-NEXT: s_mov_b64 s[6:7], 0
123 ; GFX11-NEXT: .p2align 6
124 ; GFX11-NEXT: .LBB0_1: ; %bb3
125 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
126 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
127 ; GFX11-NEXT: s_not_b32 s10, s7
128 ; GFX11-NEXT: s_mul_i32 s9, s5, s7
129 ; GFX11-NEXT: s_mul_i32 s10, s5, s10
130 ; GFX11-NEXT: s_sub_i32 s9, s8, s9
131 ; GFX11-NEXT: s_add_i32 s11, s7, 1
132 ; GFX11-NEXT: s_add_i32 s10, s8, s10
133 ; GFX11-NEXT: s_cmp_ge_u32 s9, s5
134 ; GFX11-NEXT: s_cselect_b32 s11, s11, s7
135 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9
136 ; GFX11-NEXT: s_add_i32 s10, s11, 1
137 ; GFX11-NEXT: s_cmp_ge_u32 s9, s5
138 ; GFX11-NEXT: s_cselect_b32 s9, s10, s11
139 ; GFX11-NEXT: s_add_u32 s10, s0, s2
140 ; GFX11-NEXT: s_addc_u32 s11, s1, s3
141 ; GFX11-NEXT: s_add_i32 s8, s8, 1
142 ; GFX11-NEXT: s_add_u32 s6, s6, s4
143 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
144 ; GFX11-NEXT: s_addc_u32 s7, s7, 0
145 ; GFX11-NEXT: s_add_u32 s2, s2, 4
146 ; GFX11-NEXT: s_addc_u32 s3, s3, 0
147 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
148 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
149 ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
150 ; GFX11-NEXT: ; %bb.2: ; %bb2
151 ; GFX11-NEXT: s_nop 0
152 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
153 ; GFX11-NEXT: s_endpgm
; Entry block: falls straight into the loop (named %bb by the phi and the checks).
154 bb:
155 br label %bb3
; Loop exit, reached once %tmp7 == 1024.
157 bb2: ; preds = %bb3
158 ret void
; Loop body: one divide and one store per induction value.
160 bb3: ; preds = %bb3, %bb
161 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
162 %tmp4 = udiv i32 %tmp, %arg1
163 %tmp5 = zext i32 %tmp to i64
164 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
165 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
166 %tmp7 = add nuw nsw i32 %tmp, 1
167 %tmp8 = icmp eq i32 %tmp7, 1024
168 br i1 %tmp8, label %bb2, label %bb3
169 }
; Kernel under test: for %tmp = 0 .. 1023, store (urem i32 %tmp, %arg1) to
; %arg[%tmp]. %arg1 is a kernel argument, so the divisor is identical on every
; iteration of %bb3. In the expected output for all three run lines the
; v_rcp_iflag_f32 reciprocal sequence appears ahead of the .LBB1_1 loop label.
171 define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
172 ; GFX9-LABEL: urem32_invariant_denom:
173 ; GFX9: ; %bb.0: ; %bb
174 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c
175 ; GFX9-NEXT: s_mov_b32 s8, 0
176 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
177 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
178 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
179 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
180 ; GFX9-NEXT: s_sub_i32 s4, 0, s5
181 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
182 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
183 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
184 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
185 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1
186 ; GFX9-NEXT: s_mul_i32 s4, s4, s6
187 ; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
188 ; GFX9-NEXT: s_add_i32 s4, s6, s4
189 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
190 ; GFX9-NEXT: .LBB1_1: ; %bb3
191 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
192 ; GFX9-NEXT: s_not_b32 s10, s7
193 ; GFX9-NEXT: s_mul_i32 s9, s5, s7
194 ; GFX9-NEXT: s_mul_i32 s10, s5, s10
195 ; GFX9-NEXT: s_sub_i32 s9, s8, s9
196 ; GFX9-NEXT: s_add_i32 s10, s8, s10
197 ; GFX9-NEXT: s_cmp_ge_u32 s9, s5
198 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
199 ; GFX9-NEXT: s_sub_i32 s10, s9, s5
200 ; GFX9-NEXT: s_cmp_ge_u32 s9, s5
201 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
202 ; GFX9-NEXT: s_add_u32 s10, s0, s2
203 ; GFX9-NEXT: s_addc_u32 s11, s1, s3
204 ; GFX9-NEXT: s_add_i32 s8, s8, 1
205 ; GFX9-NEXT: s_add_u32 s6, s6, s4
206 ; GFX9-NEXT: s_addc_u32 s7, s7, 0
207 ; GFX9-NEXT: s_add_u32 s2, s2, 4
208 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
209 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
210 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
211 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
212 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
213 ; GFX9-NEXT: ; %bb.2: ; %bb2
214 ; GFX9-NEXT: s_endpgm
216 ; GFX10-LABEL: urem32_invariant_denom:
217 ; GFX10: ; %bb.0: ; %bb
218 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x2c
219 ; GFX10-NEXT: s_mov_b32 s8, 0
220 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
221 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5
223 ; GFX10-NEXT: s_sub_i32 s2, 0, s5
224 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
225 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
226 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
227 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
228 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
229 ; GFX10-NEXT: s_mul_i32 s2, s2, s4
230 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s2
231 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
232 ; GFX10-NEXT: s_add_i32 s4, s4, s6
233 ; GFX10-NEXT: s_mov_b64 s[6:7], 0
234 ; GFX10-NEXT: .LBB1_1: ; %bb3
235 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
236 ; GFX10-NEXT: s_not_b32 s9, s7
237 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
238 ; GFX10-NEXT: s_mul_i32 s10, s5, s7
239 ; GFX10-NEXT: s_mul_i32 s9, s5, s9
240 ; GFX10-NEXT: s_sub_i32 s10, s8, s10
241 ; GFX10-NEXT: s_add_i32 s9, s8, s9
242 ; GFX10-NEXT: s_cmp_ge_u32 s10, s5
243 ; GFX10-NEXT: s_cselect_b32 s9, s9, s10
244 ; GFX10-NEXT: s_sub_i32 s10, s9, s5
245 ; GFX10-NEXT: s_cmp_ge_u32 s9, s5
246 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9
247 ; GFX10-NEXT: s_add_u32 s10, s0, s2
248 ; GFX10-NEXT: s_addc_u32 s11, s1, s3
249 ; GFX10-NEXT: s_add_i32 s8, s8, 1
250 ; GFX10-NEXT: s_add_u32 s6, s6, s4
251 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
252 ; GFX10-NEXT: s_addc_u32 s7, s7, 0
253 ; GFX10-NEXT: s_add_u32 s2, s2, 4
254 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
255 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
256 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
257 ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
258 ; GFX10-NEXT: ; %bb.2: ; %bb2
259 ; GFX10-NEXT: s_endpgm
261 ; GFX11-LABEL: urem32_invariant_denom:
262 ; GFX11: ; %bb.0: ; %bb
263 ; GFX11-NEXT: s_clause 0x1
264 ; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x2c
265 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
266 ; GFX11-NEXT: s_mov_b32 s8, 0
267 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s5
269 ; GFX11-NEXT: s_sub_i32 s2, 0, s5
270 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
271 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
272 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
273 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
274 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
275 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
276 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
277 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
278 ; GFX11-NEXT: s_mul_i32 s2, s2, s4
279 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280 ; GFX11-NEXT: s_mul_hi_u32 s6, s4, s2
281 ; GFX11-NEXT: s_mov_b64 s[2:3], 0
282 ; GFX11-NEXT: s_add_i32 s4, s4, s6
283 ; GFX11-NEXT: s_mov_b64 s[6:7], 0
284 ; GFX11-NEXT: .p2align 6
285 ; GFX11-NEXT: .LBB1_1: ; %bb3
286 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
287 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
288 ; GFX11-NEXT: s_not_b32 s9, s7
289 ; GFX11-NEXT: s_mul_i32 s10, s5, s7
290 ; GFX11-NEXT: s_mul_i32 s9, s5, s9
291 ; GFX11-NEXT: s_sub_i32 s10, s8, s10
292 ; GFX11-NEXT: s_add_i32 s9, s8, s9
293 ; GFX11-NEXT: s_cmp_ge_u32 s10, s5
294 ; GFX11-NEXT: s_cselect_b32 s9, s9, s10
295 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
296 ; GFX11-NEXT: s_sub_i32 s10, s9, s5
297 ; GFX11-NEXT: s_cmp_ge_u32 s9, s5
298 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9
299 ; GFX11-NEXT: s_add_u32 s10, s0, s2
300 ; GFX11-NEXT: s_addc_u32 s11, s1, s3
301 ; GFX11-NEXT: s_add_i32 s8, s8, 1
302 ; GFX11-NEXT: s_add_u32 s6, s6, s4
303 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
304 ; GFX11-NEXT: s_addc_u32 s7, s7, 0
305 ; GFX11-NEXT: s_add_u32 s2, s2, 4
306 ; GFX11-NEXT: s_addc_u32 s3, s3, 0
307 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
308 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
309 ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
310 ; GFX11-NEXT: ; %bb.2: ; %bb2
311 ; GFX11-NEXT: s_nop 0
312 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
313 ; GFX11-NEXT: s_endpgm
; Entry block: falls straight into the loop (named %bb by the phi and the checks).
314 bb:
315 br label %bb3
; Loop exit, reached once %tmp7 == 1024.
317 bb2: ; preds = %bb3
318 ret void
; Loop body: one remainder and one store per induction value.
320 bb3: ; preds = %bb3, %bb
321 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
322 %tmp4 = urem i32 %tmp, %arg1
323 %tmp5 = zext i32 %tmp to i64
324 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
325 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
326 %tmp7 = add nuw nsw i32 %tmp, 1
327 %tmp8 = icmp eq i32 %tmp7, 1024
328 br i1 %tmp8, label %bb2, label %bb3
329 }
; Kernel under test: for %tmp = 0 .. 1023, store (sdiv i32 %tmp, %arg1) to
; %arg[%tmp]. %arg1 is a kernel argument, so the divisor is identical on every
; iteration of %bb3. In the expected output for all three run lines the
; divisor sign handling (s_ashr_i32 ..., 31) and the v_rcp_iflag_f32 reciprocal
; sequence appear ahead of the .LBB2_1 loop label.
331 define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
332 ; GFX9-LABEL: sdiv32_invariant_denom:
333 ; GFX9: ; %bb.0: ; %bb
334 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
335 ; GFX9-NEXT: s_mov_b32 s5, 0
336 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
337 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
338 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
339 ; GFX9-NEXT: s_add_i32 s2, s2, s3
340 ; GFX9-NEXT: s_xor_b32 s4, s2, s3
341 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
342 ; GFX9-NEXT: s_sub_i32 s2, 0, s4
343 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
344 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
345 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
346 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
347 ; GFX9-NEXT: s_mul_i32 s2, s2, s6
348 ; GFX9-NEXT: s_mul_hi_u32 s2, s6, s2
349 ; GFX9-NEXT: s_add_i32 s2, s6, s2
350 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
351 ; GFX9-NEXT: .LBB2_1: ; %bb3
352 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
353 ; GFX9-NEXT: s_mul_hi_u32 s6, s5, s2
354 ; GFX9-NEXT: s_mul_i32 s7, s6, s4
355 ; GFX9-NEXT: s_sub_i32 s7, s5, s7
356 ; GFX9-NEXT: s_add_i32 s8, s6, 1
357 ; GFX9-NEXT: s_sub_i32 s9, s7, s4
358 ; GFX9-NEXT: s_cmp_ge_u32 s7, s4
359 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6
360 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7
361 ; GFX9-NEXT: s_add_i32 s8, s6, 1
362 ; GFX9-NEXT: s_cmp_ge_u32 s7, s4
363 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6
364 ; GFX9-NEXT: s_xor_b32 s6, s6, s3
365 ; GFX9-NEXT: s_sub_i32 s6, s6, s3
366 ; GFX9-NEXT: s_add_i32 s5, s5, 1
367 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
368 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
369 ; GFX9-NEXT: s_add_u32 s0, s0, 4
370 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
371 ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
372 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1
373 ; GFX9-NEXT: ; %bb.2: ; %bb2
374 ; GFX9-NEXT: s_endpgm
376 ; GFX10-LABEL: sdiv32_invariant_denom:
377 ; GFX10: ; %bb.0: ; %bb
378 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
379 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
380 ; GFX10-NEXT: s_ashr_i32 s3, s2, 31
381 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
382 ; GFX10-NEXT: s_add_i32 s2, s2, s3
383 ; GFX10-NEXT: s_xor_b32 s4, s2, s3
384 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
385 ; GFX10-NEXT: s_sub_i32 s5, 0, s4
386 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
387 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
388 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
389 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0
390 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
391 ; GFX10-NEXT: s_mul_i32 s5, s5, s2
392 ; GFX10-NEXT: s_mul_hi_u32 s6, s2, s5
393 ; GFX10-NEXT: s_mov_b32 s5, 0
394 ; GFX10-NEXT: s_add_i32 s2, s2, s6
395 ; GFX10-NEXT: .LBB2_1: ; %bb3
396 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
397 ; GFX10-NEXT: s_mul_hi_u32 s6, s5, s2
398 ; GFX10-NEXT: s_mul_i32 s7, s6, s4
399 ; GFX10-NEXT: s_add_i32 s8, s6, 1
400 ; GFX10-NEXT: s_sub_i32 s7, s5, s7
401 ; GFX10-NEXT: s_sub_i32 s9, s7, s4
402 ; GFX10-NEXT: s_cmp_ge_u32 s7, s4
403 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6
404 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7
405 ; GFX10-NEXT: s_add_i32 s8, s6, 1
406 ; GFX10-NEXT: s_cmp_ge_u32 s7, s4
407 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6
408 ; GFX10-NEXT: s_add_i32 s5, s5, 1
409 ; GFX10-NEXT: s_xor_b32 s6, s6, s3
410 ; GFX10-NEXT: s_sub_i32 s6, s6, s3
411 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
412 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
414 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
415 ; GFX10-NEXT: s_add_u32 s0, s0, 4
416 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
417 ; GFX10-NEXT: s_cmpk_eq_i32 s5, 0x400
418 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1
419 ; GFX10-NEXT: ; %bb.2: ; %bb2
420 ; GFX10-NEXT: s_endpgm
422 ; GFX11-LABEL: sdiv32_invariant_denom:
423 ; GFX11: ; %bb.0: ; %bb
424 ; GFX11-NEXT: s_clause 0x1
425 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
426 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
427 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
428 ; GFX11-NEXT: s_ashr_i32 s3, s2, 31
429 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
430 ; GFX11-NEXT: s_add_i32 s2, s2, s3
431 ; GFX11-NEXT: s_xor_b32 s4, s2, s3
432 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
433 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
434 ; GFX11-NEXT: s_sub_i32 s5, 0, s4
435 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
436 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
437 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
438 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
439 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
440 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0
441 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
442 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
443 ; GFX11-NEXT: s_mul_i32 s5, s5, s2
444 ; GFX11-NEXT: s_mul_hi_u32 s6, s2, s5
445 ; GFX11-NEXT: s_mov_b32 s5, 0
446 ; GFX11-NEXT: s_add_i32 s2, s2, s6
447 ; GFX11-NEXT: .p2align 6
448 ; GFX11-NEXT: .LBB2_1: ; %bb3
449 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
450 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
451 ; GFX11-NEXT: s_mul_hi_u32 s6, s5, s2
452 ; GFX11-NEXT: s_mul_i32 s7, s6, s4
453 ; GFX11-NEXT: s_add_i32 s8, s6, 1
454 ; GFX11-NEXT: s_sub_i32 s7, s5, s7
455 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
456 ; GFX11-NEXT: s_sub_i32 s9, s7, s4
457 ; GFX11-NEXT: s_cmp_ge_u32 s7, s4
458 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6
459 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7
460 ; GFX11-NEXT: s_add_i32 s8, s6, 1
461 ; GFX11-NEXT: s_cmp_ge_u32 s7, s4
462 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6
463 ; GFX11-NEXT: s_add_i32 s5, s5, 1
464 ; GFX11-NEXT: s_xor_b32 s6, s6, s3
465 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
466 ; GFX11-NEXT: s_sub_i32 s6, s6, s3
467 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
468 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
469 ; GFX11-NEXT: s_add_u32 s0, s0, 4
470 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
471 ; GFX11-NEXT: s_cmpk_eq_i32 s5, 0x400
472 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1
473 ; GFX11-NEXT: ; %bb.2: ; %bb2
474 ; GFX11-NEXT: s_nop 0
475 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
476 ; GFX11-NEXT: s_endpgm
; Entry block: falls straight into the loop (named %bb by the phi and the checks).
477 bb:
478 br label %bb3
; Loop exit, reached once %tmp7 == 1024.
480 bb2: ; preds = %bb3
481 ret void
; Loop body: one signed divide and one store per induction value.
483 bb3: ; preds = %bb3, %bb
484 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
485 %tmp4 = sdiv i32 %tmp, %arg1
486 %tmp5 = zext i32 %tmp to i64
487 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
488 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
489 %tmp7 = add nuw nsw i32 %tmp, 1
490 %tmp8 = icmp eq i32 %tmp7, 1024
491 br i1 %tmp8, label %bb2, label %bb3
492 }
; Kernel under test: for %tmp = 0 .. 1023, store (srem i32 %tmp, %arg1) to
; %arg[%tmp]. %arg1 is a kernel argument, so the divisor is identical on every
; iteration of %bb3. In the expected output for all three run lines the
; divisor sign handling (s_ashr_i32 ..., 31) and the v_rcp_iflag_f32 reciprocal
; sequence appear ahead of the .LBB3_1 loop label.
494 define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
495 ; GFX9-LABEL: srem32_invariant_denom:
496 ; GFX9: ; %bb.0: ; %bb
497 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
498 ; GFX9-NEXT: s_mov_b32 s4, 0
499 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
501 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
502 ; GFX9-NEXT: s_add_i32 s2, s2, s3
503 ; GFX9-NEXT: s_xor_b32 s3, s2, s3
504 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
505 ; GFX9-NEXT: s_sub_i32 s2, 0, s3
506 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
507 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
508 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
509 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
510 ; GFX9-NEXT: s_mul_i32 s2, s2, s5
511 ; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
512 ; GFX9-NEXT: s_add_i32 s2, s5, s2
513 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
514 ; GFX9-NEXT: .LBB3_1: ; %bb3
515 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
516 ; GFX9-NEXT: s_mul_hi_u32 s5, s4, s2
517 ; GFX9-NEXT: s_mul_i32 s5, s5, s3
518 ; GFX9-NEXT: s_sub_i32 s5, s4, s5
519 ; GFX9-NEXT: s_sub_i32 s6, s5, s3
520 ; GFX9-NEXT: s_cmp_ge_u32 s5, s3
521 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
522 ; GFX9-NEXT: s_sub_i32 s6, s5, s3
523 ; GFX9-NEXT: s_cmp_ge_u32 s5, s3
524 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
525 ; GFX9-NEXT: s_add_i32 s4, s4, 1
526 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
527 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
528 ; GFX9-NEXT: s_add_u32 s0, s0, 4
529 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
530 ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
531 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1
532 ; GFX9-NEXT: ; %bb.2: ; %bb2
533 ; GFX9-NEXT: s_endpgm
535 ; GFX10-LABEL: srem32_invariant_denom:
536 ; GFX10: ; %bb.0: ; %bb
537 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
538 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
539 ; GFX10-NEXT: s_ashr_i32 s3, s2, 31
540 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
541 ; GFX10-NEXT: s_add_i32 s2, s2, s3
542 ; GFX10-NEXT: s_xor_b32 s3, s2, s3
543 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3
544 ; GFX10-NEXT: s_sub_i32 s4, 0, s3
545 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
546 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
547 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
548 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0
549 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
550 ; GFX10-NEXT: s_mul_i32 s4, s4, s2
551 ; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4
552 ; GFX10-NEXT: s_mov_b32 s4, 0
553 ; GFX10-NEXT: s_add_i32 s2, s2, s5
554 ; GFX10-NEXT: .LBB3_1: ; %bb3
555 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
556 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
557 ; GFX10-NEXT: s_mul_i32 s5, s5, s3
558 ; GFX10-NEXT: s_sub_i32 s5, s4, s5
559 ; GFX10-NEXT: s_sub_i32 s6, s5, s3
560 ; GFX10-NEXT: s_cmp_ge_u32 s5, s3
561 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5
562 ; GFX10-NEXT: s_sub_i32 s6, s5, s3
563 ; GFX10-NEXT: s_cmp_ge_u32 s5, s3
564 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5
565 ; GFX10-NEXT: s_add_i32 s4, s4, 1
566 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
567 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
569 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
570 ; GFX10-NEXT: s_add_u32 s0, s0, 4
571 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
572 ; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
573 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1
574 ; GFX10-NEXT: ; %bb.2: ; %bb2
575 ; GFX10-NEXT: s_endpgm
577 ; GFX11-LABEL: srem32_invariant_denom:
578 ; GFX11: ; %bb.0: ; %bb
579 ; GFX11-NEXT: s_clause 0x1
580 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
581 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
582 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
583 ; GFX11-NEXT: s_ashr_i32 s3, s2, 31
584 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
585 ; GFX11-NEXT: s_add_i32 s2, s2, s3
586 ; GFX11-NEXT: s_xor_b32 s3, s2, s3
587 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
588 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3
589 ; GFX11-NEXT: s_sub_i32 s4, 0, s3
590 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
591 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
592 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
593 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
594 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
595 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0
596 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
597 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
598 ; GFX11-NEXT: s_mul_i32 s4, s4, s2
599 ; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4
600 ; GFX11-NEXT: s_mov_b32 s4, 0
601 ; GFX11-NEXT: s_add_i32 s2, s2, s5
602 ; GFX11-NEXT: .p2align 6
603 ; GFX11-NEXT: .LBB3_1: ; %bb3
604 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
605 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
606 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
607 ; GFX11-NEXT: s_mul_i32 s5, s5, s3
608 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
609 ; GFX11-NEXT: s_sub_i32 s5, s4, s5
610 ; GFX11-NEXT: s_sub_i32 s6, s5, s3
611 ; GFX11-NEXT: s_cmp_ge_u32 s5, s3
612 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5
613 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
614 ; GFX11-NEXT: s_sub_i32 s6, s5, s3
615 ; GFX11-NEXT: s_cmp_ge_u32 s5, s3
616 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5
617 ; GFX11-NEXT: s_add_i32 s4, s4, 1
618 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
619 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
620 ; GFX11-NEXT: s_add_u32 s0, s0, 4
621 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
622 ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
623 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1
624 ; GFX11-NEXT: ; %bb.2: ; %bb2
625 ; GFX11-NEXT: s_nop 0
626 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
627 ; GFX11-NEXT: s_endpgm
; Entry block: falls straight into the loop (named %bb by the phi and the checks).
628 bb:
629 br label %bb3
; Loop exit, reached once %tmp7 == 1024.
631 bb2: ; preds = %bb3
632 ret void
; Loop body: one signed remainder and one store per induction value.
634 bb3: ; preds = %bb3, %bb
635 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
636 %tmp4 = srem i32 %tmp, %arg1
637 %tmp5 = zext i32 %tmp to i64
638 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
639 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
640 %tmp7 = add nuw nsw i32 %tmp, 1
641 %tmp8 = icmp eq i32 %tmp7, 1024
642 br i1 %tmp8, label %bb2, label %bb3
643 }
; Kernel under test: 16-bit variant. For %tmp = 0 .. 1023 (an i16 counter),
; store (udiv i16 %tmp, %arg1) to the i16 array %arg at index %tmp.
; %arg1 is a kernel argument, so the divisor is identical on every iteration
; of %bb3. In the expected output for all three run lines the divisor
; conversion (v_cvt_f32_u32) and v_rcp_iflag_f32 appear ahead of the .LBB4_1
; loop label.
645 define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
646 ; GFX9-LABEL: udiv16_invariant_denom:
647 ; GFX9: ; %bb.0: ; %bb
648 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
649 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
650 ; GFX9-NEXT: s_movk_i32 s4, 0x400
651 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
652 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
653 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
654 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
655 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
656 ; GFX9-NEXT: .LBB4_1: ; %bb3
657 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
658 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2
659 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3
660 ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2
661 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
662 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 1, v3
663 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1
664 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
665 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5
666 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4
667 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0
668 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1]
669 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX9-NEXT: global_store_short v3, v4, s[2:3]
671 ; GFX9-NEXT: s_cbranch_vccz .LBB4_1
672 ; GFX9-NEXT: ; %bb.2: ; %bb2
673 ; GFX9-NEXT: s_endpgm
675 ; GFX10-LABEL: udiv16_invariant_denom:
676 ; GFX10: ; %bb.0: ; %bb
677 ; GFX10-NEXT: s_clause 0x1
678 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
679 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
680 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
681 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
682 ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff
683 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0
684 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
685 ; GFX10-NEXT: .LBB4_1: ; %bb3
686 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
687 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2
688 ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1
689 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3
690 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
691 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 1, v3
692 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1
693 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
694 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
695 ; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4
696 ; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5
697 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0
698 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
699 ; GFX10-NEXT: global_store_short v3, v4, s[2:3]
700 ; GFX10-NEXT: s_cbranch_vccz .LBB4_1
701 ; GFX10-NEXT: ; %bb.2: ; %bb2
702 ; GFX10-NEXT: s_endpgm
704 ; GFX11-LABEL: udiv16_invariant_denom:
705 ; GFX11: ; %bb.0: ; %bb
706 ; GFX11-NEXT: s_clause 0x1
707 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
708 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
709 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
710 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
711 ; GFX11-NEXT: s_and_b32 s0, s4, 0xffff
712 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
713 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
714 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
715 ; GFX11-NEXT: .p2align 6
716 ; GFX11-NEXT: .LBB4_1: ; %bb3
717 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
718 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
719 ; GFX11-NEXT: v_add_nc_u16 v2, v2, 1
720 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
721 ; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3
722 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
723 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 1, v3
724 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
725 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1
726 ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
727 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
728 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
729 ; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4
730 ; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5
731 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
732 ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0
733 ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
734 ; GFX11-NEXT: global_store_b16 v3, v4, s[2:3]
735 ; GFX11-NEXT: s_cbranch_vccz .LBB4_1
736 ; GFX11-NEXT: ; %bb.2: ; %bb2
737 ; GFX11-NEXT: s_nop 0
738 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
739 ; GFX11-NEXT: s_endpgm
; Entry block: falls straight into the loop (named %bb by the phi and the checks).
740 bb:
741 br label %bb3
; Loop exit, reached once %tmp7 == 1024.
743 bb2: ; preds = %bb3
744 ret void
; Loop body: one 16-bit divide and one 2-byte store per induction value.
746 bb3: ; preds = %bb3, %bb
747 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
748 %tmp4 = udiv i16 %tmp, %arg1
749 %tmp5 = zext i16 %tmp to i64
750 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
751 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
752 %tmp7 = add nuw nsw i16 %tmp, 1
753 %tmp8 = icmp eq i16 %tmp7, 1024
754 br i1 %tmp8, label %bb2, label %bb3
755 }
; urem16_invariant_denom: loop over an i16 counter %tmp that starts at 0 and,
; on every iteration, store `urem i16 %tmp, %arg1` to element %tmp of the i16
; array at %arg. The divisor %arg1 is a kernel argument and is never modified
; inside the loop.
; In the expected output for gfx900, gfx1010 and gfx1100 the v_rcp_iflag_f32
; of the converted divisor is emitted ahead of the .LBB5_1 loop label, i.e. it
; is not recomputed per iteration; inside the loop the quotient is multiplied
; back by the divisor (v_mul_lo_u32) and subtracted from the counter.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
757 define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
758 ; GFX9-LABEL: urem16_invariant_denom:
759 ; GFX9: ; %bb.0: ; %bb
760 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
761 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
762 ; GFX9-NEXT: s_movk_i32 s5, 0x400
763 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
764 ; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
765 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
766 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
767 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
768 ; GFX9-NEXT: .LBB5_1: ; %bb3
769 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
770 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2
771 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3
772 ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2
773 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
774 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 1, v3
775 ; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1
776 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
777 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
778 ; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4
779 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0
780 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
781 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
782 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4
783 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
784 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
785 ; GFX9-NEXT: global_store_short v5, v3, s[2:3]
786 ; GFX9-NEXT: s_cbranch_vccz .LBB5_1
787 ; GFX9-NEXT: ; %bb.2: ; %bb2
788 ; GFX9-NEXT: s_endpgm
790 ; GFX10-LABEL: urem16_invariant_denom:
791 ; GFX10: ; %bb.0: ; %bb
792 ; GFX10-NEXT: s_clause 0x1
793 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
794 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
795 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
796 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
797 ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff
798 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0
799 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
800 ; GFX10-NEXT: .LBB5_1: ; %bb3
801 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
802 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2
803 ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1
804 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3
805 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1
806 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
807 ; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4
808 ; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5
809 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
810 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
811 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
812 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 1, v3
813 ; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0
814 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v4
815 ; GFX10-NEXT: global_store_short v5, v3, s[2:3]
816 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1
817 ; GFX10-NEXT: ; %bb.2: ; %bb2
818 ; GFX10-NEXT: s_endpgm
820 ; GFX11-LABEL: urem16_invariant_denom:
821 ; GFX11: ; %bb.0: ; %bb
822 ; GFX11-NEXT: s_clause 0x1
823 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
824 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
825 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
826 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
827 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
828 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
829 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
830 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
831 ; GFX11-NEXT: .p2align 6
832 ; GFX11-NEXT: .LBB5_1: ; %bb3
833 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
834 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
835 ; GFX11-NEXT: v_add_nc_u16 v2, v2, 1
836 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
837 ; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3
838 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
839 ; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1
840 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
841 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
842 ; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4
843 ; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5
844 ; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
845 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
846 ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
847 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
848 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3
849 ; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2
850 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
851 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4
852 ; GFX11-NEXT: global_store_b16 v5, v3, s[0:1]
853 ; GFX11-NEXT: s_cbranch_vccz .LBB5_1
854 ; GFX11-NEXT: ; %bb.2: ; %bb2
855 ; GFX11-NEXT: s_nop 0
856 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
857 ; GFX11-NEXT: s_endpgm
864 bb3: ; preds = %bb3, %bb
865 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: unsigned remainder of the counter by the loop-invariant %arg1.
866 %tmp4 = urem i16 %tmp, %arg1
; The zero-extended counter indexes the i16 output array at %arg.
867 %tmp5 = zext i16 %tmp to i64
868 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
869 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
; Advance the counter; branch to %bb2 once it equals 1024, otherwise repeat %bb3
; (so %tmp takes the values 0..1023).
870 %tmp7 = add nuw nsw i16 %tmp, 1
871 %tmp8 = icmp eq i16 %tmp7, 1024
872 br i1 %tmp8, label %bb2, label %bb3
; sdiv16_invariant_denom: loop over an i16 counter %tmp that starts at 0 and,
; on every iteration, store `sdiv i16 %tmp, %arg1` to element %tmp of the i16
; array at %arg. The divisor %arg1 is a kernel argument and is never modified
; inside the loop.
; In the expected output for gfx900, gfx1010 and gfx1100 the sign-extension of
; the divisor (s_sext_i32_i16), its float conversion and v_rcp_iflag_f32 are
; all emitted ahead of the .LBB6_1 loop label, i.e. they are not recomputed
; per iteration.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
875 define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
876 ; GFX9-LABEL: sdiv16_invariant_denom:
877 ; GFX9: ; %bb.0: ; %bb
878 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
879 ; GFX9-NEXT: s_mov_b32 s4, 0
880 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
881 ; GFX9-NEXT: s_movk_i32 s3, 0x400
882 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
883 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
884 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
885 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
886 ; GFX9-NEXT: .LBB6_1: ; %bb3
887 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
888 ; GFX9-NEXT: s_sext_i32_i16 s5, s4
889 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
890 ; GFX9-NEXT: s_xor_b32 s6, s5, s2
891 ; GFX9-NEXT: s_ashr_i32 s5, s6, 30
892 ; GFX9-NEXT: s_or_b32 s5, s5, 1
893 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
894 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
895 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
896 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
897 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
898 ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
899 ; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
900 ; GFX9-NEXT: s_cselect_b32 s5, s5, 0
901 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s4
902 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
903 ; GFX9-NEXT: v_readfirstlane_b32 s4, v2
904 ; GFX9-NEXT: v_add_u32_e32 v2, s5, v4
905 ; GFX9-NEXT: s_lshl_b32 s5, s6, 1
906 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
907 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
908 ; GFX9-NEXT: s_cbranch_vccz .LBB6_1
909 ; GFX9-NEXT: ; %bb.2: ; %bb2
910 ; GFX9-NEXT: s_endpgm
912 ; GFX10-LABEL: sdiv16_invariant_denom:
913 ; GFX10: ; %bb.0: ; %bb
914 ; GFX10-NEXT: s_clause 0x1
915 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
916 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
917 ; GFX10-NEXT: s_mov_b32 s1, 0
918 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
919 ; GFX10-NEXT: s_sext_i32_i16 s0, s4
920 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0
921 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
922 ; GFX10-NEXT: .LBB6_1: ; %bb3
923 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
924 ; GFX10-NEXT: s_sext_i32_i16 s4, s1
925 ; GFX10-NEXT: v_add_nc_u16 v2, s1, 1
926 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4
927 ; GFX10-NEXT: s_xor_b32 s5, s4, s0
928 ; GFX10-NEXT: s_ashr_i32 s4, s5, 30
929 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
930 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1
931 ; GFX10-NEXT: s_or_b32 s4, s4, 1
932 ; GFX10-NEXT: v_trunc_f32_e32 v4, v4
933 ; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3
934 ; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v4
935 ; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0|
936 ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
937 ; GFX10-NEXT: s_cselect_b32 s4, s4, 0
938 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s1
939 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
940 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
941 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4
942 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
943 ; GFX10-NEXT: global_store_short v3, v2, s[2:3]
944 ; GFX10-NEXT: s_cbranch_vccz .LBB6_1
945 ; GFX10-NEXT: ; %bb.2: ; %bb2
946 ; GFX10-NEXT: s_endpgm
948 ; GFX11-LABEL: sdiv16_invariant_denom:
949 ; GFX11: ; %bb.0: ; %bb
950 ; GFX11-NEXT: s_clause 0x1
951 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
952 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
953 ; GFX11-NEXT: s_mov_b32 s3, 0
954 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
955 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
956 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
957 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
958 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
959 ; GFX11-NEXT: .p2align 6
960 ; GFX11-NEXT: .LBB6_1: ; %bb3
961 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
962 ; GFX11-NEXT: s_sext_i32_i16 s4, s3
963 ; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
964 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
965 ; GFX11-NEXT: s_xor_b32 s5, s4, s2
966 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
967 ; GFX11-NEXT: s_ashr_i32 s4, s5, 30
968 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
969 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
970 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1
971 ; GFX11-NEXT: s_or_b32 s4, s4, 1
972 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
973 ; GFX11-NEXT: v_trunc_f32_e32 v4, v4
974 ; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3
975 ; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v4
976 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
977 ; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0|
978 ; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
979 ; GFX11-NEXT: s_cselect_b32 s4, s4, 0
980 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
981 ; GFX11-NEXT: v_readfirstlane_b32 s3, v2
982 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
983 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
984 ; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4
985 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
986 ; GFX11-NEXT: s_cbranch_vccz .LBB6_1
987 ; GFX11-NEXT: ; %bb.2: ; %bb2
988 ; GFX11-NEXT: s_nop 0
989 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
990 ; GFX11-NEXT: s_endpgm
997 bb3: ; preds = %bb3, %bb
998 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: signed division of the counter by the loop-invariant %arg1.
999 %tmp4 = sdiv i16 %tmp, %arg1
; The zero-extended counter indexes the i16 output array at %arg.
1000 %tmp5 = zext i16 %tmp to i64
1001 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
1002 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
; Advance the counter; branch to %bb2 once it equals 1024, otherwise repeat %bb3
; (so %tmp takes the values 0..1023).
1003 %tmp7 = add nuw nsw i16 %tmp, 1
1004 %tmp8 = icmp eq i16 %tmp7, 1024
1005 br i1 %tmp8, label %bb2, label %bb3
; srem16_invariant_denom: loop over an i16 counter %tmp that starts at 0 and,
; on every iteration, store `srem i16 %tmp, %arg1` to element %tmp of the i16
; array at %arg. The divisor %arg1 is a kernel argument and is never modified
; inside the loop.
; In the expected output for gfx900, gfx1010 and gfx1100 the sign-extension of
; the divisor (s_sext_i32_i16), its float conversion and v_rcp_iflag_f32 are
; all emitted ahead of the .LBB7_1 loop label, i.e. they are not recomputed
; per iteration; inside the loop the quotient is multiplied back by the
; divisor (v_mul_lo_u32) and subtracted from the sign-extended counter.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
1008 define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
1009 ; GFX9-LABEL: srem16_invariant_denom:
1010 ; GFX9: ; %bb.0: ; %bb
1011 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
1012 ; GFX9-NEXT: s_mov_b32 s4, 0
1013 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1014 ; GFX9-NEXT: s_movk_i32 s3, 0x400
1015 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1016 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
1017 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
1018 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
1019 ; GFX9-NEXT: .LBB7_1: ; %bb3
1020 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1021 ; GFX9-NEXT: s_sext_i32_i16 s5, s4
1022 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
1023 ; GFX9-NEXT: s_xor_b32 s6, s5, s2
1024 ; GFX9-NEXT: s_ashr_i32 s6, s6, 30
1025 ; GFX9-NEXT: s_or_b32 s8, s6, 1
1026 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
1027 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
1028 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
1029 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
1030 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
1031 ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
1032 ; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
1033 ; GFX9-NEXT: s_cselect_b32 s6, s8, 0
1034 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
1035 ; GFX9-NEXT: s_and_b32 s7, 0xffff, s4
1036 ; GFX9-NEXT: v_readfirstlane_b32 s4, v2
1037 ; GFX9-NEXT: v_add_u32_e32 v2, s6, v4
1038 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
1039 ; GFX9-NEXT: s_lshl_b32 s6, s7, 1
1040 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
1041 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
1042 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
1043 ; GFX9-NEXT: s_cbranch_vccz .LBB7_1
1044 ; GFX9-NEXT: ; %bb.2: ; %bb2
1045 ; GFX9-NEXT: s_endpgm
1047 ; GFX10-LABEL: srem16_invariant_denom:
1048 ; GFX10: ; %bb.0: ; %bb
1049 ; GFX10-NEXT: s_clause 0x1
1050 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
1051 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1052 ; GFX10-NEXT: s_mov_b32 s1, 0
1053 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1054 ; GFX10-NEXT: s_sext_i32_i16 s0, s4
1055 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0
1056 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
1057 ; GFX10-NEXT: .LBB7_1: ; %bb3
1058 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1059 ; GFX10-NEXT: s_sext_i32_i16 s4, s1
1060 ; GFX10-NEXT: v_add_nc_u16 v2, s1, 1
1061 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4
1062 ; GFX10-NEXT: s_xor_b32 s5, s4, s0
1063 ; GFX10-NEXT: s_ashr_i32 s5, s5, 30
1064 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
1065 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1
1066 ; GFX10-NEXT: s_or_b32 s5, s5, 1
1067 ; GFX10-NEXT: v_trunc_f32_e32 v4, v4
1068 ; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3
1069 ; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0|
1070 ; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v4
1071 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
1072 ; GFX10-NEXT: s_cselect_b32 s5, s5, 0
1073 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
1074 ; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3
1075 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s1
1076 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
1077 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
1078 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
1079 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, s0
1080 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3
1081 ; GFX10-NEXT: global_store_short v2, v3, s[2:3]
1082 ; GFX10-NEXT: s_cbranch_vccz .LBB7_1
1083 ; GFX10-NEXT: ; %bb.2: ; %bb2
1084 ; GFX10-NEXT: s_endpgm
1086 ; GFX11-LABEL: srem16_invariant_denom:
1087 ; GFX11: ; %bb.0: ; %bb
1088 ; GFX11-NEXT: s_clause 0x1
1089 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
1090 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1091 ; GFX11-NEXT: s_mov_b32 s3, 0
1092 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1093 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
1094 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1095 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
1096 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
1097 ; GFX11-NEXT: .p2align 6
1098 ; GFX11-NEXT: .LBB7_1: ; %bb3
1099 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1100 ; GFX11-NEXT: s_sext_i32_i16 s4, s3
1101 ; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
1102 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
1103 ; GFX11-NEXT: s_xor_b32 s5, s4, s2
1104 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1105 ; GFX11-NEXT: s_ashr_i32 s5, s5, 30
1106 ; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
1107 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1108 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1
1109 ; GFX11-NEXT: s_or_b32 s5, s5, 1
1110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1111 ; GFX11-NEXT: v_trunc_f32_e32 v4, v4
1112 ; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3
1113 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1114 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0|
1115 ; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v4
1116 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
1117 ; GFX11-NEXT: s_cselect_b32 s5, s5, 0
1118 ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
1119 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1120 ; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3
1121 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
1122 ; GFX11-NEXT: v_readfirstlane_b32 s3, v2
1123 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
1124 ; GFX11-NEXT: v_mov_b32_e32 v2, s5
1125 ; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2
1126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1127 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3
1128 ; GFX11-NEXT: global_store_b16 v2, v3, s[0:1]
1129 ; GFX11-NEXT: s_cbranch_vccz .LBB7_1
1130 ; GFX11-NEXT: ; %bb.2: ; %bb2
1131 ; GFX11-NEXT: s_nop 0
1132 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1133 ; GFX11-NEXT: s_endpgm
1140 bb3: ; preds = %bb3, %bb
1141 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: signed remainder of the counter by the loop-invariant %arg1.
1142 %tmp4 = srem i16 %tmp, %arg1
; The zero-extended counter indexes the i16 output array at %arg.
1143 %tmp5 = zext i16 %tmp to i64
1144 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
1145 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
; Advance the counter; branch to %bb2 once it equals 1024, otherwise repeat %bb3
; (so %tmp takes the values 0..1023).
1146 %tmp7 = add nuw nsw i16 %tmp, 1
1147 %tmp8 = icmp eq i16 %tmp7, 1024
1148 br i1 %tmp8, label %bb2, label %bb3