1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
; udiv32_invariant_denom - the loop in %bb3 counts %tmp = 0, 1, ... and exits
; once %tmp + 1 equals 1024. Each iteration stores (%tmp udiv %arg1) into
; element %tmp of the i32 array at %arg. The divisor %arg1 is a kernel
; argument and is never modified inside the loop.
; In every check group below, the v_rcp_iflag_f32 reciprocal setup appears
; before the .LBB0_1 loop label and no reciprocal instruction appears inside
; the loop body; the loop only contains scalar mul/add/sub/compare/select.
; NOTE(review): %bb and %bb2 are referenced by the phi and the final branch,
; but their definitions and the closing brace are not visible in this view -
; confirm they are present in the full file.
6 define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
7 ; GFX9-LABEL: udiv32_invariant_denom:
8 ; GFX9: ; %bb.0: ; %bb
9 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
10 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
11 ; GFX9-NEXT: s_mov_b32 s7, 0
12 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
15 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
17 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
18 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
19 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
20 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1
21 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
22 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
23 ; GFX9-NEXT: s_add_i32 s8, s5, s4
24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
25 ; GFX9-NEXT: .LBB0_1: ; %bb3
26 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
27 ; GFX9-NEXT: s_not_b32 s10, s5
28 ; GFX9-NEXT: s_mul_i32 s9, s6, s5
29 ; GFX9-NEXT: s_mul_i32 s10, s6, s10
30 ; GFX9-NEXT: s_add_i32 s11, s5, 1
31 ; GFX9-NEXT: s_sub_i32 s9, s7, s9
32 ; GFX9-NEXT: s_add_i32 s10, s7, s10
33 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
34 ; GFX9-NEXT: s_cselect_b32 s11, s11, s5
35 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
36 ; GFX9-NEXT: s_add_i32 s10, s11, 1
37 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
38 ; GFX9-NEXT: s_cselect_b32 s9, s10, s11
39 ; GFX9-NEXT: s_add_u32 s10, s0, s2
40 ; GFX9-NEXT: s_addc_u32 s11, s1, s3
41 ; GFX9-NEXT: s_add_i32 s7, s7, 1
42 ; GFX9-NEXT: s_add_u32 s4, s4, s8
43 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
44 ; GFX9-NEXT: s_add_u32 s2, s2, 4
45 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
46 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
47 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
48 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
49 ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
50 ; GFX9-NEXT: ; %bb.2: ; %bb2
51 ; GFX9-NEXT: s_endpgm
53 ; GFX10-LABEL: udiv32_invariant_denom:
54 ; GFX10: ; %bb.0: ; %bb
55 ; GFX10-NEXT: s_clause 0x1
56 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
57 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
58 ; GFX10-NEXT: s_mov_b32 s7, 0
59 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
61 ; GFX10-NEXT: s_sub_i32 s2, 0, s6
62 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
63 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
64 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
65 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
66 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
67 ; GFX10-NEXT: s_mul_i32 s2, s2, s4
68 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
69 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
70 ; GFX10-NEXT: s_add_i32 s8, s4, s5
71 ; GFX10-NEXT: s_mov_b64 s[4:5], 0
72 ; GFX10-NEXT: .LBB0_1: ; %bb3
73 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
74 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
75 ; GFX10-NEXT: s_not_b32 s10, s5
76 ; GFX10-NEXT: s_mul_i32 s9, s6, s5
77 ; GFX10-NEXT: s_mul_i32 s10, s6, s10
78 ; GFX10-NEXT: s_sub_i32 s9, s7, s9
79 ; GFX10-NEXT: s_add_i32 s11, s5, 1
80 ; GFX10-NEXT: s_add_i32 s10, s7, s10
81 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6
82 ; GFX10-NEXT: s_cselect_b32 s11, s11, s5
83 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9
84 ; GFX10-NEXT: s_add_i32 s10, s11, 1
85 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6
86 ; GFX10-NEXT: s_cselect_b32 s9, s10, s11
87 ; GFX10-NEXT: s_add_u32 s10, s0, s2
88 ; GFX10-NEXT: s_addc_u32 s11, s1, s3
89 ; GFX10-NEXT: s_add_i32 s7, s7, 1
90 ; GFX10-NEXT: s_add_u32 s4, s4, s8
91 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
92 ; GFX10-NEXT: s_addc_u32 s5, s5, 0
93 ; GFX10-NEXT: s_add_u32 s2, s2, 4
94 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
95 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
96 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
97 ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
98 ; GFX10-NEXT: ; %bb.2: ; %bb2
99 ; GFX10-NEXT: s_endpgm
101 ; GFX11-LABEL: udiv32_invariant_denom:
102 ; GFX11: ; %bb.0: ; %bb
103 ; GFX11-NEXT: s_clause 0x1
104 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
105 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
106 ; GFX11-NEXT: s_mov_b32 s7, 0
107 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
108 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
109 ; GFX11-NEXT: s_sub_i32 s2, 0, s6
110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
111 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
112 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
113 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
114 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
115 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
116 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
117 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
118 ; GFX11-NEXT: s_mul_i32 s2, s2, s4
119 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
120 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
121 ; GFX11-NEXT: s_mov_b64 s[2:3], 0
122 ; GFX11-NEXT: s_add_i32 s8, s4, s5
123 ; GFX11-NEXT: s_mov_b64 s[4:5], 0
124 ; GFX11-NEXT: .p2align 6
125 ; GFX11-NEXT: .LBB0_1: ; %bb3
126 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
127 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
128 ; GFX11-NEXT: s_not_b32 s10, s5
129 ; GFX11-NEXT: s_mul_i32 s9, s6, s5
130 ; GFX11-NEXT: s_mul_i32 s10, s6, s10
131 ; GFX11-NEXT: s_sub_i32 s9, s7, s9
132 ; GFX11-NEXT: s_add_i32 s11, s5, 1
133 ; GFX11-NEXT: s_add_i32 s10, s7, s10
134 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6
135 ; GFX11-NEXT: s_cselect_b32 s11, s11, s5
136 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9
137 ; GFX11-NEXT: s_add_i32 s10, s11, 1
138 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6
139 ; GFX11-NEXT: s_cselect_b32 s9, s10, s11
140 ; GFX11-NEXT: s_add_u32 s10, s0, s2
141 ; GFX11-NEXT: s_addc_u32 s11, s1, s3
142 ; GFX11-NEXT: s_add_i32 s7, s7, 1
143 ; GFX11-NEXT: s_add_u32 s4, s4, s8
144 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
145 ; GFX11-NEXT: s_addc_u32 s5, s5, 0
146 ; GFX11-NEXT: s_add_u32 s2, s2, 4
147 ; GFX11-NEXT: s_addc_u32 s3, s3, 0
148 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
149 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
150 ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
151 ; GFX11-NEXT: ; %bb.2: ; %bb2
152 ; GFX11-NEXT: s_endpgm
159 bb3: ; preds = %bb3, %bb
160 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
161 %tmp4 = udiv i32 %tmp, %arg1
162 %tmp5 = zext i32 %tmp to i64
163 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
164 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
165 %tmp7 = add nuw nsw i32 %tmp, 1
166 %tmp8 = icmp eq i32 %tmp7, 1024
167 br i1 %tmp8, label %bb2, label %bb3
; urem32_invariant_denom - the loop in %bb3 counts %tmp = 0, 1, ... and exits
; once %tmp + 1 equals 1024. Each iteration stores (%tmp urem %arg1) into
; element %tmp of the i32 array at %arg. The divisor %arg1 is a kernel
; argument and is never modified inside the loop.
; In every check group below, the v_rcp_iflag_f32 reciprocal setup appears
; before the .LBB1_1 loop label; the loop body itself only contains scalar
; mul/add/sub/compare/select instructions plus the store.
; NOTE(review): %bb and %bb2 are referenced by the phi and the final branch,
; but their definitions and the closing brace are not visible in this view -
; confirm they are present in the full file.
170 define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
171 ; GFX9-LABEL: urem32_invariant_denom:
172 ; GFX9: ; %bb.0: ; %bb
173 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
174 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
175 ; GFX9-NEXT: s_mov_b32 s7, 0
176 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
177 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
178 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
179 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
180 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
181 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
182 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
183 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
184 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1
185 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
186 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
187 ; GFX9-NEXT: s_add_i32 s8, s5, s4
188 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
189 ; GFX9-NEXT: .LBB1_1: ; %bb3
190 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
191 ; GFX9-NEXT: s_not_b32 s10, s5
192 ; GFX9-NEXT: s_mul_i32 s9, s6, s5
193 ; GFX9-NEXT: s_mul_i32 s10, s6, s10
194 ; GFX9-NEXT: s_sub_i32 s9, s7, s9
195 ; GFX9-NEXT: s_add_i32 s10, s7, s10
196 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
197 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
198 ; GFX9-NEXT: s_sub_i32 s10, s9, s6
199 ; GFX9-NEXT: s_cmp_ge_u32 s9, s6
200 ; GFX9-NEXT: s_cselect_b32 s9, s10, s9
201 ; GFX9-NEXT: s_add_u32 s10, s0, s2
202 ; GFX9-NEXT: s_addc_u32 s11, s1, s3
203 ; GFX9-NEXT: s_add_i32 s7, s7, 1
204 ; GFX9-NEXT: s_add_u32 s4, s4, s8
205 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
206 ; GFX9-NEXT: s_add_u32 s2, s2, 4
207 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
208 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
209 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
210 ; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
211 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
212 ; GFX9-NEXT: ; %bb.2: ; %bb2
213 ; GFX9-NEXT: s_endpgm
215 ; GFX10-LABEL: urem32_invariant_denom:
216 ; GFX10: ; %bb.0: ; %bb
217 ; GFX10-NEXT: s_clause 0x1
218 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
219 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
220 ; GFX10-NEXT: s_mov_b32 s7, 0
221 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
223 ; GFX10-NEXT: s_sub_i32 s2, 0, s6
224 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
225 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
226 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
227 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
228 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
229 ; GFX10-NEXT: s_mul_i32 s2, s2, s4
230 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
231 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
232 ; GFX10-NEXT: s_add_i32 s8, s4, s5
233 ; GFX10-NEXT: s_mov_b64 s[4:5], 0
234 ; GFX10-NEXT: .LBB1_1: ; %bb3
235 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
236 ; GFX10-NEXT: s_not_b32 s9, s5
237 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
238 ; GFX10-NEXT: s_mul_i32 s10, s6, s5
239 ; GFX10-NEXT: s_mul_i32 s9, s6, s9
240 ; GFX10-NEXT: s_sub_i32 s10, s7, s10
241 ; GFX10-NEXT: s_add_i32 s9, s7, s9
242 ; GFX10-NEXT: s_cmp_ge_u32 s10, s6
243 ; GFX10-NEXT: s_cselect_b32 s9, s9, s10
244 ; GFX10-NEXT: s_sub_i32 s10, s9, s6
245 ; GFX10-NEXT: s_cmp_ge_u32 s9, s6
246 ; GFX10-NEXT: s_cselect_b32 s9, s10, s9
247 ; GFX10-NEXT: s_add_u32 s10, s0, s2
248 ; GFX10-NEXT: s_addc_u32 s11, s1, s3
249 ; GFX10-NEXT: s_add_i32 s7, s7, 1
250 ; GFX10-NEXT: s_add_u32 s4, s4, s8
251 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
252 ; GFX10-NEXT: s_addc_u32 s5, s5, 0
253 ; GFX10-NEXT: s_add_u32 s2, s2, 4
254 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
255 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
256 ; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
257 ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
258 ; GFX10-NEXT: ; %bb.2: ; %bb2
259 ; GFX10-NEXT: s_endpgm
261 ; GFX11-LABEL: urem32_invariant_denom:
262 ; GFX11: ; %bb.0: ; %bb
263 ; GFX11-NEXT: s_clause 0x1
264 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
265 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
266 ; GFX11-NEXT: s_mov_b32 s7, 0
267 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
269 ; GFX11-NEXT: s_sub_i32 s2, 0, s6
270 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
271 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
272 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
273 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
274 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
275 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
276 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
277 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
278 ; GFX11-NEXT: s_mul_i32 s2, s2, s4
279 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
281 ; GFX11-NEXT: s_mov_b64 s[2:3], 0
282 ; GFX11-NEXT: s_add_i32 s8, s4, s5
283 ; GFX11-NEXT: s_mov_b64 s[4:5], 0
284 ; GFX11-NEXT: .p2align 6
285 ; GFX11-NEXT: .LBB1_1: ; %bb3
286 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
287 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
288 ; GFX11-NEXT: s_not_b32 s9, s5
289 ; GFX11-NEXT: s_mul_i32 s10, s6, s5
290 ; GFX11-NEXT: s_mul_i32 s9, s6, s9
291 ; GFX11-NEXT: s_sub_i32 s10, s7, s10
292 ; GFX11-NEXT: s_add_i32 s9, s7, s9
293 ; GFX11-NEXT: s_cmp_ge_u32 s10, s6
294 ; GFX11-NEXT: s_cselect_b32 s9, s9, s10
295 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
296 ; GFX11-NEXT: s_sub_i32 s10, s9, s6
297 ; GFX11-NEXT: s_cmp_ge_u32 s9, s6
298 ; GFX11-NEXT: s_cselect_b32 s9, s10, s9
299 ; GFX11-NEXT: s_add_u32 s10, s0, s2
300 ; GFX11-NEXT: s_addc_u32 s11, s1, s3
301 ; GFX11-NEXT: s_add_i32 s7, s7, 1
302 ; GFX11-NEXT: s_add_u32 s4, s4, s8
303 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
304 ; GFX11-NEXT: s_addc_u32 s5, s5, 0
305 ; GFX11-NEXT: s_add_u32 s2, s2, 4
306 ; GFX11-NEXT: s_addc_u32 s3, s3, 0
307 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
308 ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
309 ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
310 ; GFX11-NEXT: ; %bb.2: ; %bb2
311 ; GFX11-NEXT: s_endpgm
318 bb3: ; preds = %bb3, %bb
319 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
320 %tmp4 = urem i32 %tmp, %arg1
321 %tmp5 = zext i32 %tmp to i64
322 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
323 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
324 %tmp7 = add nuw nsw i32 %tmp, 1
325 %tmp8 = icmp eq i32 %tmp7, 1024
326 br i1 %tmp8, label %bb2, label %bb3
; sdiv32_invariant_denom - the loop in %bb3 counts %tmp = 0, 1, ... and exits
; once %tmp + 1 equals 1024. Each iteration stores (%tmp sdiv %arg1) into
; element %tmp of the i32 array at %arg. The divisor %arg1 is a kernel
; argument and is never modified inside the loop.
; In every check group below, s_abs_i32 and s_ashr_i32 ..., 31 on the loaded
; divisor and the v_rcp_iflag_f32 reciprocal setup appear before the .LBB2_1
; loop label. Inside the loop the quotient is formed with scalar
; mul_hi/mul/sub/compare/select and then combined with the s_ashr_i32 result
; via s_xor_b32 followed by s_sub_i32.
; NOTE(review): %bb and %bb2 are referenced by the phi and the final branch,
; but their definitions and the closing brace are not visible in this view -
; confirm they are present in the full file.
329 define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
330 ; GFX9-LABEL: sdiv32_invariant_denom:
331 ; GFX9: ; %bb.0: ; %bb
332 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
333 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
334 ; GFX9-NEXT: s_mov_b32 s3, 0
335 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
336 ; GFX9-NEXT: s_abs_i32 s2, s6
337 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
338 ; GFX9-NEXT: s_ashr_i32 s4, s6, 31
339 ; GFX9-NEXT: s_sub_i32 s5, 0, s2
340 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
341 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
342 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
343 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
344 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
345 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
346 ; GFX9-NEXT: s_add_i32 s5, s6, s5
347 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
348 ; GFX9-NEXT: .LBB2_1: ; %bb3
349 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
350 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5
351 ; GFX9-NEXT: s_mul_i32 s7, s6, s2
352 ; GFX9-NEXT: s_sub_i32 s7, s3, s7
353 ; GFX9-NEXT: s_add_i32 s8, s6, 1
354 ; GFX9-NEXT: s_sub_i32 s9, s7, s2
355 ; GFX9-NEXT: s_cmp_ge_u32 s7, s2
356 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6
357 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7
358 ; GFX9-NEXT: s_add_i32 s8, s6, 1
359 ; GFX9-NEXT: s_cmp_ge_u32 s7, s2
360 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6
361 ; GFX9-NEXT: s_xor_b32 s6, s6, s4
362 ; GFX9-NEXT: s_sub_i32 s6, s6, s4
363 ; GFX9-NEXT: s_add_i32 s3, s3, 1
364 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
365 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
366 ; GFX9-NEXT: s_add_u32 s0, s0, 4
367 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
368 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
369 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1
370 ; GFX9-NEXT: ; %bb.2: ; %bb2
371 ; GFX9-NEXT: s_endpgm
373 ; GFX10-LABEL: sdiv32_invariant_denom:
374 ; GFX10: ; %bb.0: ; %bb
375 ; GFX10-NEXT: s_clause 0x1
376 ; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c
377 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
378 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX10-NEXT: s_abs_i32 s2, s3
380 ; GFX10-NEXT: s_ashr_i32 s3, s3, 31
381 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
382 ; GFX10-NEXT: s_sub_i32 s4, 0, s2
383 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
384 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
385 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
386 ; GFX10-NEXT: v_readfirstlane_b32 s5, v0
387 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
388 ; GFX10-NEXT: s_mul_i32 s4, s4, s5
389 ; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4
390 ; GFX10-NEXT: s_mov_b32 s4, 0
391 ; GFX10-NEXT: s_add_i32 s5, s5, s6
392 ; GFX10-NEXT: .LBB2_1: ; %bb3
393 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
394 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5
395 ; GFX10-NEXT: s_mul_i32 s7, s6, s2
396 ; GFX10-NEXT: s_add_i32 s8, s6, 1
397 ; GFX10-NEXT: s_sub_i32 s7, s4, s7
398 ; GFX10-NEXT: s_sub_i32 s9, s7, s2
399 ; GFX10-NEXT: s_cmp_ge_u32 s7, s2
400 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6
401 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7
402 ; GFX10-NEXT: s_add_i32 s8, s6, 1
403 ; GFX10-NEXT: s_cmp_ge_u32 s7, s2
404 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6
405 ; GFX10-NEXT: s_add_i32 s4, s4, 1
406 ; GFX10-NEXT: s_xor_b32 s6, s6, s3
407 ; GFX10-NEXT: s_sub_i32 s6, s6, s3
408 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
409 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
410 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
411 ; GFX10-NEXT: s_add_u32 s0, s0, 4
412 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
413 ; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
414 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1
415 ; GFX10-NEXT: ; %bb.2: ; %bb2
416 ; GFX10-NEXT: s_endpgm
418 ; GFX11-LABEL: sdiv32_invariant_denom:
419 ; GFX11: ; %bb.0: ; %bb
420 ; GFX11-NEXT: s_clause 0x1
421 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x2c
422 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
423 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
424 ; GFX11-NEXT: s_abs_i32 s2, s3
425 ; GFX11-NEXT: s_ashr_i32 s3, s3, 31
426 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
427 ; GFX11-NEXT: s_sub_i32 s4, 0, s2
428 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
429 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
430 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
431 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
432 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
433 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
434 ; GFX11-NEXT: v_readfirstlane_b32 s5, v0
435 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
436 ; GFX11-NEXT: s_mul_i32 s4, s4, s5
437 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
438 ; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4
439 ; GFX11-NEXT: s_mov_b32 s4, 0
440 ; GFX11-NEXT: s_add_i32 s5, s5, s6
441 ; GFX11-NEXT: .p2align 6
442 ; GFX11-NEXT: .LBB2_1: ; %bb3
443 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
444 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
445 ; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5
446 ; GFX11-NEXT: s_mul_i32 s7, s6, s2
447 ; GFX11-NEXT: s_add_i32 s8, s6, 1
448 ; GFX11-NEXT: s_sub_i32 s7, s4, s7
449 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
450 ; GFX11-NEXT: s_sub_i32 s9, s7, s2
451 ; GFX11-NEXT: s_cmp_ge_u32 s7, s2
452 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6
453 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7
454 ; GFX11-NEXT: s_add_i32 s8, s6, 1
455 ; GFX11-NEXT: s_cmp_ge_u32 s7, s2
456 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6
457 ; GFX11-NEXT: s_add_i32 s4, s4, 1
458 ; GFX11-NEXT: s_xor_b32 s6, s6, s3
459 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
460 ; GFX11-NEXT: s_sub_i32 s6, s6, s3
461 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
462 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
463 ; GFX11-NEXT: s_add_u32 s0, s0, 4
464 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
465 ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
466 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1
467 ; GFX11-NEXT: ; %bb.2: ; %bb2
468 ; GFX11-NEXT: s_endpgm
475 bb3: ; preds = %bb3, %bb
476 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
477 %tmp4 = sdiv i32 %tmp, %arg1
478 %tmp5 = zext i32 %tmp to i64
479 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
480 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
481 %tmp7 = add nuw nsw i32 %tmp, 1
482 %tmp8 = icmp eq i32 %tmp7, 1024
483 br i1 %tmp8, label %bb2, label %bb3
; srem32_invariant_denom - the loop in %bb3 counts %tmp = 0, 1, ... and exits
; once %tmp + 1 equals 1024. Each iteration stores (%tmp srem %arg1) into
; element %tmp of the i32 array at %arg. The divisor %arg1 is a kernel
; argument and is never modified inside the loop.
; In every check group below, s_abs_i32 on the loaded divisor and the
; v_rcp_iflag_f32 reciprocal setup appear before the .LBB3_1 loop label. The
; loop body contains scalar mul_hi/mul/sub/compare/select plus the store, and
; an s_waitcnt lgkmcnt(0) ahead of the store because the pointer load is
; issued separately from the divisor load in these groups.
; NOTE(review): %bb and %bb2 are referenced by the phi and the final branch,
; but their definitions and the closing brace are not visible in this view -
; confirm they are present in the full file.
486 define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
487 ; GFX9-LABEL: srem32_invariant_denom:
488 ; GFX9: ; %bb.0: ; %bb
489 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
490 ; GFX9-NEXT: s_mov_b32 s3, 0
491 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX9-NEXT: s_abs_i32 s2, s0
493 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
494 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
495 ; GFX9-NEXT: s_sub_i32 s4, 0, s2
496 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
497 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
498 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
499 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
500 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
501 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
502 ; GFX9-NEXT: s_add_i32 s4, s5, s4
503 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
504 ; GFX9-NEXT: .LBB3_1: ; %bb3
505 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
506 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4
507 ; GFX9-NEXT: s_mul_i32 s5, s5, s2
508 ; GFX9-NEXT: s_sub_i32 s5, s3, s5
509 ; GFX9-NEXT: s_sub_i32 s6, s5, s2
510 ; GFX9-NEXT: s_cmp_ge_u32 s5, s2
511 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
512 ; GFX9-NEXT: s_sub_i32 s6, s5, s2
513 ; GFX9-NEXT: s_cmp_ge_u32 s5, s2
514 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
515 ; GFX9-NEXT: s_add_i32 s3, s3, 1
516 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
517 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
518 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
519 ; GFX9-NEXT: s_add_u32 s0, s0, 4
520 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
521 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
522 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1
523 ; GFX9-NEXT: ; %bb.2: ; %bb2
524 ; GFX9-NEXT: s_endpgm
526 ; GFX10-LABEL: srem32_invariant_denom:
527 ; GFX10: ; %bb.0: ; %bb
528 ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x2c
529 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
530 ; GFX10-NEXT: s_abs_i32 s2, s0
531 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
532 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
533 ; GFX10-NEXT: s_sub_i32 s3, 0, s2
534 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
535 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
536 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
537 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
538 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
539 ; GFX10-NEXT: s_mul_i32 s3, s3, s4
540 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3
541 ; GFX10-NEXT: s_mov_b32 s3, 0
542 ; GFX10-NEXT: s_add_i32 s4, s4, s5
543 ; GFX10-NEXT: .LBB3_1: ; %bb3
544 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
545 ; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4
546 ; GFX10-NEXT: s_mul_i32 s5, s5, s2
547 ; GFX10-NEXT: s_sub_i32 s5, s3, s5
548 ; GFX10-NEXT: s_sub_i32 s6, s5, s2
549 ; GFX10-NEXT: s_cmp_ge_u32 s5, s2
550 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5
551 ; GFX10-NEXT: s_sub_i32 s6, s5, s2
552 ; GFX10-NEXT: s_cmp_ge_u32 s5, s2
553 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5
554 ; GFX10-NEXT: s_add_i32 s3, s3, 1
555 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
556 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
557 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
558 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
559 ; GFX10-NEXT: s_add_u32 s0, s0, 4
560 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
561 ; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400
562 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1
563 ; GFX10-NEXT: ; %bb.2: ; %bb2
564 ; GFX10-NEXT: s_endpgm
566 ; GFX11-LABEL: srem32_invariant_denom:
567 ; GFX11: ; %bb.0: ; %bb
568 ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
569 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
570 ; GFX11-NEXT: s_abs_i32 s2, s0
571 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
572 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
573 ; GFX11-NEXT: s_sub_i32 s3, 0, s2
574 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
575 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
576 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
577 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
578 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
579 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
580 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
581 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
582 ; GFX11-NEXT: s_mul_i32 s3, s3, s4
583 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
584 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
585 ; GFX11-NEXT: s_mov_b32 s3, 0
586 ; GFX11-NEXT: s_add_i32 s4, s4, s5
587 ; GFX11-NEXT: .p2align 6
588 ; GFX11-NEXT: .LBB3_1: ; %bb3
589 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
590 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
591 ; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4
592 ; GFX11-NEXT: s_mul_i32 s5, s5, s2
593 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
594 ; GFX11-NEXT: s_sub_i32 s5, s3, s5
595 ; GFX11-NEXT: s_sub_i32 s6, s5, s2
596 ; GFX11-NEXT: s_cmp_ge_u32 s5, s2
597 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5
598 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
599 ; GFX11-NEXT: s_sub_i32 s6, s5, s2
600 ; GFX11-NEXT: s_cmp_ge_u32 s5, s2
601 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5
602 ; GFX11-NEXT: s_add_i32 s3, s3, 1
603 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
604 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
605 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
606 ; GFX11-NEXT: s_add_u32 s0, s0, 4
607 ; GFX11-NEXT: s_addc_u32 s1, s1, 0
608 ; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
609 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1
610 ; GFX11-NEXT: ; %bb.2: ; %bb2
611 ; GFX11-NEXT: s_endpgm
618 bb3: ; preds = %bb3, %bb
619 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
620 %tmp4 = srem i32 %tmp, %arg1
621 %tmp5 = zext i32 %tmp to i64
622 %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
623 store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
624 %tmp7 = add nuw nsw i32 %tmp, 1
625 %tmp8 = icmp eq i32 %tmp7, 1024
626 br i1 %tmp8, label %bb2, label %bb3
; udiv16_invariant_denom - i16 variant. The loop in %bb3 counts an i16 %tmp
; from 0 and exits once %tmp + 1 equals 1024. Each iteration stores
; (%tmp udiv %arg1) into element %tmp of the i16 array at %arg (align 2).
; The divisor %arg1 is a kernel argument and is never modified in the loop.
; In every check group below, the divisor is masked with 0xffff, converted
; with v_cvt_f32_u32 and fed to v_rcp_iflag_f32 before the .LBB4_1 loop
; label. The loop body converts the counter to f32, multiplies by that
; reciprocal, truncates, and adjusts the result with a v_cmp_ge_f32 followed
; by an add-with-carry of 0.
; NOTE(review): %bb and %bb2 are referenced by the phi and the final branch,
; but their definitions and the closing brace are not visible in this view -
; confirm they are present in the full file.
629 define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
630 ; GFX9-LABEL: udiv16_invariant_denom:
631 ; GFX9: ; %bb.0: ; %bb
632 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
633 ; GFX9-NEXT: s_mov_b32 s2, 0
634 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
635 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
636 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0
637 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
638 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
639 ; GFX9-NEXT: .LBB4_1: ; %bb3
640 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
641 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s2
642 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3
643 ; GFX9-NEXT: s_add_i32 s2, s2, 1
644 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1
645 ; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
646 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v1
647 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
648 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4
649 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
650 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
651 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
652 ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
653 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
654 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
656 ; GFX9-NEXT: s_cbranch_scc0 .LBB4_1
657 ; GFX9-NEXT: ; %bb.2: ; %bb2
658 ; GFX9-NEXT: s_endpgm
660 ; GFX10-LABEL: udiv16_invariant_denom:
661 ; GFX10: ; %bb.0: ; %bb
662 ; GFX10-NEXT: s_clause 0x1
663 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
664 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
665 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
666 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
667 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
668 ; GFX10-NEXT: s_mov_b32 s2, 0
669 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
670 ; GFX10-NEXT: .LBB4_1: ; %bb3
671 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
672 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s2
673 ; GFX10-NEXT: s_add_i32 s2, s2, 1
674 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s3
675 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1
676 ; GFX10-NEXT: v_mov_b32_e32 v4, s3
677 ; GFX10-NEXT: s_and_b32 s3, s2, 0xffff
678 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
679 ; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400
680 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
681 ; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2
682 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
683 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
684 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
685 ; GFX10-NEXT: global_store_short v4, v2, s[0:1]
686 ; GFX10-NEXT: s_cbranch_scc0 .LBB4_1
687 ; GFX10-NEXT: ; %bb.2: ; %bb2
688 ; GFX10-NEXT: s_endpgm
690 ; GFX11-LABEL: udiv16_invariant_denom:
691 ; GFX11: ; %bb.0: ; %bb
692 ; GFX11-NEXT: s_clause 0x1
693 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
694 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
695 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
696 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
697 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
698 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
699 ; GFX11-NEXT: s_mov_b32 s2, 0
700 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
701 ; GFX11-NEXT: .p2align 6
702 ; GFX11-NEXT: .LBB4_1: ; %bb3
703 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
704 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s2
705 ; GFX11-NEXT: s_add_i32 s2, s2, 1
706 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
707 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1
708 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
709 ; GFX11-NEXT: v_mov_b32_e32 v4, s3
710 ; GFX11-NEXT: s_and_b32 s3, s2, 0xffff
711 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
712 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
713 ; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
714 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
715 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
716 ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
717 ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
718 ; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
719 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
720 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
721 ; GFX11-NEXT: global_store_b16 v4, v2, s[0:1]
722 ; GFX11-NEXT: s_cbranch_scc0 .LBB4_1
723 ; GFX11-NEXT: ; %bb.2: ; %bb2
724 ; GFX11-NEXT: s_endpgm
731 bb3: ; preds = %bb3, %bb
732 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
733 %tmp4 = udiv i16 %tmp, %arg1
734 %tmp5 = zext i16 %tmp to i64
735 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
736 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
737 %tmp7 = add nuw nsw i16 %tmp, 1
738 %tmp8 = icmp eq i16 %tmp7, 1024
739 br i1 %tmp8, label %bb2, label %bb3
; urem16_invariant_denom - i16 variant. The loop in %bb3 counts an i16 %tmp
; from 0 and exits once %tmp + 1 equals 1024. Each iteration stores
; (%tmp urem %arg1) into element %tmp of the i16 array at %arg (align 2).
; The divisor %arg1 is a kernel argument and is never modified in the loop.
; In every check group below, the divisor is masked with 0xffff, converted
; with v_cvt_f32_u32 and fed to v_rcp_iflag_f32 before the .LBB5_1 loop
; label. The loop body computes a quotient the same way as the f32-based
; udiv sequence (multiply by the reciprocal, truncate, compare, add-with-
; carry) and then forms the remainder with v_mul_lo_u32 by the masked
; divisor and a subtract from the masked counter.
; NOTE(review): %bb and %bb2 are referenced by the phi and the final branch,
; but their definitions and the closing brace are not visible in this view -
; confirm they are present in the full file.
742 define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
743 ; GFX9-LABEL: urem16_invariant_denom:
744 ; GFX9: ; %bb.0: ; %bb
745 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
746 ; GFX9-NEXT: s_mov_b32 s3, 0
747 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
748 ; GFX9-NEXT: s_and_b32 s2, s0, 0xffff
749 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
750 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
751 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
752 ; GFX9-NEXT: .LBB5_1: ; %bb3
753 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
754 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s3
755 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4
756 ; GFX9-NEXT: s_add_i32 s3, s3, 1
757 ; GFX9-NEXT: s_lshl_b32 s5, s4, 1
758 ; GFX9-NEXT: s_and_b32 s6, s3, 0xffff
759 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1
760 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
761 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
762 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
763 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
764 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
765 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
766 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
767 ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
768 ; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2
769 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
770 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
771 ; GFX9-NEXT: s_cbranch_scc0 .LBB5_1
772 ; GFX9-NEXT: ; %bb.2: ; %bb2
773 ; GFX9-NEXT: s_endpgm
775 ; GFX10-LABEL: urem16_invariant_denom:
776 ; GFX10: ; %bb.0: ; %bb
777 ; GFX10-NEXT: s_clause 0x1
778 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
779 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
780 ; GFX10-NEXT: s_mov_b32 s3, 0
781 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
782 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
783 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
784 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
785 ; GFX10-NEXT: .LBB5_1: ; %bb3
786 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
787 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s3
788 ; GFX10-NEXT: s_add_i32 s3, s3, 1
789 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4
790 ; GFX10-NEXT: s_lshl_b32 s5, s4, 1
791 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
792 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
793 ; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2
794 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
795 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
796 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
797 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
798 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2
799 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
800 ; GFX10-NEXT: s_and_b32 s4, s3, 0xffff
801 ; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
802 ; GFX10-NEXT: global_store_short v3, v2, s[0:1]
803 ; GFX10-NEXT: s_cbranch_scc0 .LBB5_1
804 ; GFX10-NEXT: ; %bb.2: ; %bb2
805 ; GFX10-NEXT: s_endpgm
807 ; GFX11-LABEL: urem16_invariant_denom:
808 ; GFX11: ; %bb.0: ; %bb
809 ; GFX11-NEXT: s_clause 0x1
810 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
811 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
812 ; GFX11-NEXT: s_mov_b32 s3, 0
813 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
814 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
815 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
816 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
817 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
818 ; GFX11-NEXT: .p2align 6
819 ; GFX11-NEXT: .LBB5_1: ; %bb3
820 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
821 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s3
822 ; GFX11-NEXT: s_add_i32 s3, s3, 1
823 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s4
824 ; GFX11-NEXT: s_lshl_b32 s5, s4, 1
825 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
826 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
827 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
828 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
829 ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
830 ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
831 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
832 ; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
833 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
834 ; GFX11-NEXT: v_mov_b32_e32 v3, s5
835 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
836 ; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
837 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
838 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
839 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
840 ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
841 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
842 ; GFX11-NEXT: s_cbranch_scc0 .LBB5_1
843 ; GFX11-NEXT: ; %bb.2: ; %bb2
844 ; GFX11-NEXT: s_endpgm
851 bb3: ; preds = %bb3, %bb
852 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
853 %tmp4 = urem i16 %tmp, %arg1
854 %tmp5 = zext i16 %tmp to i64
855 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
856 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
857 %tmp7 = add nuw nsw i16 %tmp, 1
858 %tmp8 = icmp eq i16 %tmp7, 1024
859 br i1 %tmp8, label %bb2, label %bb3
; sdiv16_invariant_denom: signed 16-bit division inside a loop where the
; denominator is the kernel argument %arg1 and the numerator is the loop
; counter. Each iteration stores (%tmp sdiv %arg1) to %arg[%tmp]; the loop
; runs until the incremented counter equals 1024.
;
; In the expected output for all three targets (gfx900, gfx1010, gfx1100) the
; sign-extension of the denominator, its v_cvt_f32_i32 conversion and the
; v_rcp_iflag_f32 reciprocal are emitted in the entry block, before the
; .LBB6_1 loop header. The loop itself only contains the per-iteration
; multiply / truncate / v_mad_f32 (v_fma_f32 on gfx1100) / compare-and-correct
; sequence plus the store and counter update.
; NOTE(review): presumably the purpose is to verify that the reciprocal of the
; loop-invariant denominator stays hoisted out of the loop - confirm against
; the commit that introduced this test.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
862 define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
863 ; GFX9-LABEL: sdiv16_invariant_denom:
864 ; GFX9: ; %bb.0: ; %bb
865 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
866 ; GFX9-NEXT: s_mov_b32 s3, 0
867 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
868 ; GFX9-NEXT: s_sext_i32_i16 s2, s0
869 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
870 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
871 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
872 ; GFX9-NEXT: .LBB6_1: ; %bb3
873 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
874 ; GFX9-NEXT: s_sext_i32_i16 s4, s3
875 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4
876 ; GFX9-NEXT: s_xor_b32 s4, s4, s2
877 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30
878 ; GFX9-NEXT: s_or_b32 s6, s4, 1
879 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1
880 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
881 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
882 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
883 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
884 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
885 ; GFX9-NEXT: s_cselect_b32 s4, s6, 0
886 ; GFX9-NEXT: s_and_b32 s5, 0xffff, s3
887 ; GFX9-NEXT: s_add_i32 s3, s3, 1
888 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3
889 ; GFX9-NEXT: s_lshl_b32 s4, s5, 1
890 ; GFX9-NEXT: s_and_b32 s5, s3, 0xffff
891 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
892 ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
893 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
894 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
895 ; GFX9-NEXT: s_cbranch_scc0 .LBB6_1
896 ; GFX9-NEXT: ; %bb.2: ; %bb2
897 ; GFX9-NEXT: s_endpgm
899 ; GFX10-LABEL: sdiv16_invariant_denom:
900 ; GFX10: ; %bb.0: ; %bb
901 ; GFX10-NEXT: s_clause 0x1
902 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
903 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
904 ; GFX10-NEXT: s_mov_b32 s3, 0
905 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
906 ; GFX10-NEXT: s_sext_i32_i16 s2, s2
907 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2
908 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
909 ; GFX10-NEXT: .LBB6_1: ; %bb3
910 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
911 ; GFX10-NEXT: s_sext_i32_i16 s4, s3
912 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s4
913 ; GFX10-NEXT: s_xor_b32 s4, s4, s2
914 ; GFX10-NEXT: s_ashr_i32 s4, s4, 30
915 ; GFX10-NEXT: s_or_b32 s4, s4, 1
916 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
917 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
918 ; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2
919 ; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0|
920 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3
921 ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
922 ; GFX10-NEXT: s_cselect_b32 s4, s4, 0
923 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s3
924 ; GFX10-NEXT: s_add_i32 s3, s3, 1
925 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
926 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2
927 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
928 ; GFX10-NEXT: s_and_b32 s4, s3, 0xffff
929 ; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
930 ; GFX10-NEXT: global_store_short v3, v2, s[0:1]
931 ; GFX10-NEXT: s_cbranch_scc0 .LBB6_1
932 ; GFX10-NEXT: ; %bb.2: ; %bb2
933 ; GFX10-NEXT: s_endpgm
935 ; GFX11-LABEL: sdiv16_invariant_denom:
936 ; GFX11: ; %bb.0: ; %bb
937 ; GFX11-NEXT: s_clause 0x1
938 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
939 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
940 ; GFX11-NEXT: s_mov_b32 s3, 0
941 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
942 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
943 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
944 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
945 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
946 ; GFX11-NEXT: .p2align 6
947 ; GFX11-NEXT: .LBB6_1: ; %bb3
948 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
949 ; GFX11-NEXT: s_sext_i32_i16 s4, s3
950 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
951 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s4
952 ; GFX11-NEXT: s_xor_b32 s4, s4, s2
953 ; GFX11-NEXT: s_ashr_i32 s4, s4, 30
954 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
955 ; GFX11-NEXT: s_or_b32 s4, s4, 1
956 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
957 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
958 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
959 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
960 ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
961 ; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0|
962 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
963 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
964 ; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
965 ; GFX11-NEXT: s_cselect_b32 s4, s4, 0
966 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
967 ; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
968 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
969 ; GFX11-NEXT: s_add_i32 s3, s3, 1
970 ; GFX11-NEXT: v_mov_b32_e32 v3, s5
971 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
972 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
973 ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
974 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
975 ; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
976 ; GFX11-NEXT: ; %bb.2: ; %bb2
977 ; GFX11-NEXT: s_endpgm
; Loop body: %tmp is the i16 induction variable (0 on entry from %bb, %tmp7 on
; the back edge). It is both the dividend and, zero-extended, the element
; index into %arg.
984 bb3: ; preds = %bb3, %bb
985 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; The operation under test: signed divide by the loop-invariant argument.
986 %tmp4 = sdiv i16 %tmp, %arg1
987 %tmp5 = zext i16 %tmp to i64
988 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
989 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
; Advance the counter and leave the loop once it reaches 1024 (0x400 in the
; expected s_cmpk_eq_i32 above).
990 %tmp7 = add nuw nsw i16 %tmp, 1
991 %tmp8 = icmp eq i16 %tmp7, 1024
992 br i1 %tmp8, label %bb2, label %bb3
; srem16_invariant_denom: signed 16-bit remainder inside a loop where the
; denominator is the kernel argument %arg1 and the numerator is the loop
; counter. Each iteration stores (%tmp srem %arg1) to %arg[%tmp]; the loop
; runs until the incremented counter equals 1024.
;
; In the expected output for all three targets (gfx900, gfx1010, gfx1100) the
; sign-extension of the denominator, its v_cvt_f32_i32 conversion and the
; v_rcp_iflag_f32 reciprocal are emitted in the entry block, before the
; .LBB7_1 loop header. Inside the loop the quotient is formed with the
; multiply / truncate / v_mad_f32 (v_fma_f32 on gfx1100) / compare-and-correct
; sequence, then multiplied back by the denominator (v_mul_lo_u32) and
; subtracted from the sign-extended counter to produce the remainder.
; NOTE(review): presumably the purpose is to verify that the reciprocal of the
; loop-invariant denominator stays hoisted out of the loop - confirm against
; the commit that introduced this test.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
995 define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
996 ; GFX9-LABEL: srem16_invariant_denom:
997 ; GFX9: ; %bb.0: ; %bb
998 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
999 ; GFX9-NEXT: s_mov_b32 s3, 0
1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1001 ; GFX9-NEXT: s_sext_i32_i16 s2, s0
1002 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
1003 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1004 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
1005 ; GFX9-NEXT: .LBB7_1: ; %bb3
1006 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1007 ; GFX9-NEXT: s_sext_i32_i16 s6, s3
1008 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6
1009 ; GFX9-NEXT: s_xor_b32 s4, s6, s2
1010 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30
1011 ; GFX9-NEXT: s_or_b32 s7, s4, 1
1012 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1
1013 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
1014 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
1015 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
1016 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
1017 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
1018 ; GFX9-NEXT: s_cselect_b32 s4, s7, 0
1019 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3
1020 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
1021 ; GFX9-NEXT: s_and_b32 s5, 0xffff, s3
1022 ; GFX9-NEXT: s_add_i32 s3, s3, 1
1023 ; GFX9-NEXT: s_lshl_b32 s4, s5, 1
1024 ; GFX9-NEXT: s_and_b32 s5, s3, 0xffff
1025 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
1026 ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
1027 ; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2
1028 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1029 ; GFX9-NEXT: global_store_short v3, v2, s[0:1]
1030 ; GFX9-NEXT: s_cbranch_scc0 .LBB7_1
1031 ; GFX9-NEXT: ; %bb.2: ; %bb2
1032 ; GFX9-NEXT: s_endpgm
1034 ; GFX10-LABEL: srem16_invariant_denom:
1035 ; GFX10: ; %bb.0: ; %bb
1036 ; GFX10-NEXT: s_clause 0x1
1037 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
1038 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1039 ; GFX10-NEXT: s_mov_b32 s3, 0
1040 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1041 ; GFX10-NEXT: s_sext_i32_i16 s2, s2
1042 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2
1043 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
1044 ; GFX10-NEXT: .LBB7_1: ; %bb3
1045 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1046 ; GFX10-NEXT: s_sext_i32_i16 s4, s3
1047 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s4
1048 ; GFX10-NEXT: s_xor_b32 s5, s4, s2
1049 ; GFX10-NEXT: s_ashr_i32 s5, s5, 30
1050 ; GFX10-NEXT: s_or_b32 s5, s5, 1
1051 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
1052 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
1053 ; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2
1054 ; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0|
1055 ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3
1056 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
1057 ; GFX10-NEXT: s_cselect_b32 s5, s5, 0
1058 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s5, v2
1059 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s3
1060 ; GFX10-NEXT: s_add_i32 s3, s3, 1
1061 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
1062 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2
1063 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
1064 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
1065 ; GFX10-NEXT: s_and_b32 s4, s3, 0xffff
1066 ; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
1067 ; GFX10-NEXT: global_store_short v3, v2, s[0:1]
1068 ; GFX10-NEXT: s_cbranch_scc0 .LBB7_1
1069 ; GFX10-NEXT: ; %bb.2: ; %bb2
1070 ; GFX10-NEXT: s_endpgm
1072 ; GFX11-LABEL: srem16_invariant_denom:
1073 ; GFX11: ; %bb.0: ; %bb
1074 ; GFX11-NEXT: s_clause 0x1
1075 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
1076 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1077 ; GFX11-NEXT: s_mov_b32 s3, 0
1078 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1079 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
1080 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1081 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
1082 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
1083 ; GFX11-NEXT: .p2align 6
1084 ; GFX11-NEXT: .LBB7_1: ; %bb3
1085 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1086 ; GFX11-NEXT: s_sext_i32_i16 s4, s3
1087 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1088 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s4
1089 ; GFX11-NEXT: s_xor_b32 s5, s4, s2
1090 ; GFX11-NEXT: s_ashr_i32 s5, s5, 30
1091 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1092 ; GFX11-NEXT: s_or_b32 s5, s5, 1
1093 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1094 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
1095 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
1096 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1097 ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
1098 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0|
1099 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
1100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1101 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
1102 ; GFX11-NEXT: s_cselect_b32 s5, s5, 0
1103 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1104 ; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2
1105 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
1106 ; GFX11-NEXT: s_add_i32 s3, s3, 1
1107 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
1108 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1109 ; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
1110 ; GFX11-NEXT: v_mov_b32_e32 v3, s5
1111 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
1112 ; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
1113 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1114 ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
1115 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
1116 ; GFX11-NEXT: s_cbranch_scc0 .LBB7_1
1117 ; GFX11-NEXT: ; %bb.2: ; %bb2
1118 ; GFX11-NEXT: s_endpgm
; Loop body: %tmp is the i16 induction variable (0 on entry from %bb, %tmp7 on
; the back edge). It is both the dividend and, zero-extended, the element
; index into %arg.
1125 bb3: ; preds = %bb3, %bb
1126 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; The operation under test: signed remainder by the loop-invariant argument.
1127 %tmp4 = srem i16 %tmp, %arg1
1128 %tmp5 = zext i16 %tmp to i64
1129 %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
1130 store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
; Advance the counter and leave the loop once it reaches 1024 (0x400 in the
; expected s_cmpk_eq_i32 above).
1131 %tmp7 = add nuw nsw i16 %tmp, 1
1132 %tmp8 = icmp eq i16 %tmp7, 1024
1133 br i1 %tmp8, label %bb2, label %bb3