1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
; Test: unsigned 32-bit division by a divisor that does not change inside the loop.
; The loop stores (%tmp udiv %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration divides by the same value.
; In the expected gfx900 output below, the reciprocal setup (v_cvt_f32_u32,
; v_rcp_iflag_f32, v_cvt_u32_f32 and the following v_mul_lo/v_mul_hi/v_cndmask
; sequence) is listed before the loop label BB0_1, not inside the loop.
; NOTE(review): unlike the other tests in this file, no check for the "%bb.0" block
; comment and no check for a trailing s_endpgm is visible for this function --
; confirm the assertions are complete (regenerate with utils/update_llc_test_checks.py).
4 define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
5 ; GFX9-LABEL: udiv32_invariant_denom:
7 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
8 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
9 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
12 ; GFX9-NEXT: s_sub_i32 s3, 0, s2
13 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
15 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
16 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s2
17 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2
18 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
19 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
20 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
21 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
22 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
23 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
24 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
25 ; GFX9-NEXT: BB0_1: ; %bb3
26 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
27 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7
28 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6
29 ; GFX9-NEXT: v_add_u32_e32 v3, v2, v1
30 ; GFX9-NEXT: v_mul_lo_u32 v1, s3, v3
31 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2
32 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v3
33 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
34 ; GFX9-NEXT: v_add_u32_e32 v5, s6, v1
35 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4
36 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5
37 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
38 ; GFX9-NEXT: s_add_u32 s6, s6, 1
39 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
40 ; GFX9-NEXT: s_addc_u32 s7, s7, 0
41 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
42 ; GFX9-NEXT: s_add_u32 s4, s4, 4
43 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1]
44 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
45 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
46 ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
47 ; GFX9-NEXT: global_store_dword v[1:2], v3, off
48 ; GFX9-NEXT: s_cbranch_scc0 BB0_1
49 ; GFX9-NEXT: ; %bb.2: ; %bb2
; Loop body: %tmp is the induction variable (starts at 0, +1 per iteration) and is
; used both as the dividend and as the store index; the loop exits once the
; incremented value equals 1024.
57 bb3: ; preds = %bb3, %bb
58 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
59 %tmp4 = udiv i32 %tmp, %arg1
60 %tmp5 = zext i32 %tmp to i64
61 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
62 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
63 %tmp7 = add nuw nsw i32 %tmp, 1
64 %tmp8 = icmp eq i32 %tmp7, 1024
65 br i1 %tmp8, label %bb2, label %bb3
; Test: unsigned 32-bit remainder by a divisor that does not change inside the loop.
; The loop stores (%tmp urem %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration uses the same divisor.
; In the expected gfx900 output below, the reciprocal setup (v_cvt_f32_u32,
; v_rcp_iflag_f32, v_cvt_u32_f32 and the following v_mul_lo/v_mul_hi/v_cndmask
; sequence) is listed before the loop label BB1_1, not inside the loop.
68 define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
69 ; GFX9-LABEL: urem32_invariant_denom:
70 ; GFX9: ; %bb.0: ; %bb
71 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
72 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
73 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
74 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
76 ; GFX9-NEXT: s_sub_i32 s3, 0, s2
77 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
78 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
79 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
80 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s2
81 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2
82 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
83 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
84 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
85 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
86 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
87 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
88 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
89 ; GFX9-NEXT: BB1_1: ; %bb3
90 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
91 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7
92 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6
93 ; GFX9-NEXT: v_add_u32_e32 v3, v2, v1
94 ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3
95 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2
96 ; GFX9-NEXT: v_sub_u32_e32 v5, 1, v3
97 ; GFX9-NEXT: v_not_b32_e32 v3, v3
98 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5
99 ; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3
100 ; GFX9-NEXT: v_add_u32_e32 v4, s6, v4
101 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
102 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v6
103 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
104 ; GFX9-NEXT: v_add_u32_e32 v3, s6, v3
105 ; GFX9-NEXT: v_add_u32_e32 v5, s6, v5
106 ; GFX9-NEXT: s_add_u32 s6, s6, 1
107 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
108 ; GFX9-NEXT: s_addc_u32 s7, s7, 0
109 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
110 ; GFX9-NEXT: s_add_u32 s4, s4, 4
111 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
112 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
113 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1]
114 ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
115 ; GFX9-NEXT: global_store_dword v[1:2], v3, off
116 ; GFX9-NEXT: s_cbranch_scc0 BB1_1
117 ; GFX9-NEXT: ; %bb.2: ; %bb2
118 ; GFX9-NEXT: s_endpgm
; Loop body: %tmp is the induction variable (starts at 0, +1 per iteration) and is
; used both as the dividend and as the store index; the loop exits once the
; incremented value equals 1024.
125 bb3: ; preds = %bb3, %bb
126 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
127 %tmp4 = urem i32 %tmp, %arg1
128 %tmp5 = zext i32 %tmp to i64
129 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
130 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
131 %tmp7 = add nuw nsw i32 %tmp, 1
132 %tmp8 = icmp eq i32 %tmp7, 1024
133 br i1 %tmp8, label %bb2, label %bb3
; Test: signed 32-bit division by a divisor that does not change inside the loop.
; The loop stores (%tmp sdiv %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration divides by the same value.
; In the expected gfx900 output below, the s_ashr_i32/s_add_i32/s_xor_b32 sequence
; applied to the loaded divisor and the reciprocal setup (v_cvt_f32_u32,
; v_rcp_iflag_f32, v_cvt_u32_f32, ...) are listed before the loop label BB2_1;
; the v_xor_b32/v_subrev_u32 with s2 on the quotient stay inside the loop.
136 define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
137 ; GFX9-LABEL: sdiv32_invariant_denom:
138 ; GFX9: ; %bb.0: ; %bb
139 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
140 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
141 ; GFX9-NEXT: s_mov_b32 s6, 0
142 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
143 ; GFX9-NEXT: s_ashr_i32 s2, s3, 31
144 ; GFX9-NEXT: s_add_i32 s3, s3, s2
145 ; GFX9-NEXT: s_xor_b32 s3, s3, s2
146 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
147 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
148 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
149 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
150 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3
151 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s3
152 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
153 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
154 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
155 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
156 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
157 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
158 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
159 ; GFX9-NEXT: BB2_1: ; %bb3
160 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
161 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s6
162 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
163 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
164 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3
165 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3
166 ; GFX9-NEXT: v_add_u32_e32 v7, -1, v3
167 ; GFX9-NEXT: v_sub_u32_e32 v5, s6, v4
168 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4
169 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v5
170 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
171 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
172 ; GFX9-NEXT: s_add_i32 s6, s6, 1
173 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
174 ; GFX9-NEXT: s_add_u32 s4, s4, 4
175 ; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3
176 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
177 ; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3
178 ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
179 ; GFX9-NEXT: global_store_dword v[1:2], v3, off
180 ; GFX9-NEXT: s_cbranch_scc0 BB2_1
181 ; GFX9-NEXT: ; %bb.2: ; %bb2
182 ; GFX9-NEXT: s_endpgm
; Loop body: %tmp is the induction variable (starts at 0, +1 per iteration) and is
; used both as the dividend and as the store index; the loop exits once the
; incremented value equals 1024.
189 bb3: ; preds = %bb3, %bb
190 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
191 %tmp4 = sdiv i32 %tmp, %arg1
192 %tmp5 = zext i32 %tmp to i64
193 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
194 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
195 %tmp7 = add nuw nsw i32 %tmp, 1
196 %tmp8 = icmp eq i32 %tmp7, 1024
197 br i1 %tmp8, label %bb2, label %bb3
; Test: signed 32-bit remainder by a divisor that does not change inside the loop.
; The loop stores (%tmp srem %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration uses the same divisor.
; In the expected gfx900 output below, the s_ashr_i32/s_add_i32/s_xor_b32 sequence
; applied to the loaded divisor and the reciprocal setup (v_cvt_f32_u32,
; v_rcp_iflag_f32, v_cvt_u32_f32, ...) are listed before the loop label BB3_1.
200 define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
201 ; GFX9-LABEL: srem32_invariant_denom:
202 ; GFX9: ; %bb.0: ; %bb
203 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
204 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
205 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
207 ; GFX9-NEXT: s_add_i32 s2, s2, s3
208 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
209 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
210 ; GFX9-NEXT: s_mov_b32 s3, 0
211 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
212 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
213 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
214 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s2
215 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2
216 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
217 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
218 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
219 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
220 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
221 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
222 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
223 ; GFX9-NEXT: BB3_1: ; %bb3
224 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
225 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, s3
226 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2
227 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
228 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
229 ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v3
230 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v3
231 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
232 ; GFX9-NEXT: s_add_i32 s3, s3, 1
233 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
234 ; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v4
235 ; GFX9-NEXT: s_add_u32 s4, s4, 4
236 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
237 ; GFX9-NEXT: v_add_u32_e32 v5, s2, v4
238 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
239 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1]
240 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
241 ; GFX9-NEXT: global_store_dword v[1:2], v3, off
242 ; GFX9-NEXT: s_cbranch_scc0 BB3_1
243 ; GFX9-NEXT: ; %bb.2: ; %bb2
244 ; GFX9-NEXT: s_endpgm
; Loop body: %tmp is the induction variable (starts at 0, +1 per iteration) and is
; used both as the dividend and as the store index; the loop exits once the
; incremented value equals 1024.
251 bb3: ; preds = %bb3, %bb
252 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
253 %tmp4 = srem i32 %tmp, %arg1
254 %tmp5 = zext i32 %tmp to i64
255 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
256 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
257 %tmp7 = add nuw nsw i32 %tmp, 1
258 %tmp8 = icmp eq i32 %tmp7, 1024
259 br i1 %tmp8, label %bb2, label %bb3
; Test: unsigned 16-bit division by a divisor that does not change inside the loop.
; The loop stores (%tmp udiv %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration divides by the same value.
; In the expected gfx900 output below, masking the divisor with 0xffff, converting it
; to float (v_cvt_f32_u32) and taking its reciprocal (v_rcp_iflag_f32) are listed
; before the loop label BB4_1; the loop multiplies by that reciprocal (v_mul_f32),
; truncates, and adjusts the quotient via v_mad_f32/v_cmp_ge_f32/v_addc_co_u32.
262 define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
263 ; GFX9-LABEL: udiv16_invariant_denom:
264 ; GFX9: ; %bb.0: ; %bb
265 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
266 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
267 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
268 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
269 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
270 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX9-NEXT: s_and_b32 s3, s2, s3
272 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
273 ; GFX9-NEXT: s_movk_i32 s3, 0x400
274 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
275 ; GFX9-NEXT: BB4_1: ; %bb3
276 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
277 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
278 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
279 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
280 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
281 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
282 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1]
283 ; GFX9-NEXT: v_mul_f32_e32 v2, v7, v1
284 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
285 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2
286 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
287 ; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v7
288 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0
289 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
290 ; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1]
291 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
292 ; GFX9-NEXT: global_store_short v[5:6], v2, off
293 ; GFX9-NEXT: s_cbranch_vccz BB4_1
294 ; GFX9-NEXT: ; %bb.2: ; %bb2
295 ; GFX9-NEXT: s_endpgm
; Loop body: %tmp is the i16 induction variable (starts at 0, +1 per iteration) and
; is used both as the dividend and, zero-extended, as the store index; the loop
; exits once the incremented value equals 1024.
302 bb3: ; preds = %bb3, %bb
303 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
304 %tmp4 = udiv i16 %tmp, %arg1
305 %tmp5 = zext i16 %tmp to i64
306 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
307 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
308 %tmp7 = add nuw nsw i16 %tmp, 1
309 %tmp8 = icmp eq i16 %tmp7, 1024
310 br i1 %tmp8, label %bb2, label %bb3
; Test: unsigned 16-bit remainder by a divisor that does not change inside the loop.
; The loop stores (%tmp urem %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration uses the same divisor.
; In the expected gfx900 output below, masking the divisor with 0xffff, converting it
; to float (v_cvt_f32_u32) and taking its reciprocal (v_rcp_iflag_f32) are listed
; before the loop label BB5_1; the loop computes the quotient from that reciprocal
; and then derives the remainder with v_mul_lo_u32 and v_sub_u32.
313 define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
314 ; GFX9-LABEL: urem16_invariant_denom:
315 ; GFX9: ; %bb.0: ; %bb
316 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
317 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
318 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
319 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
320 ; GFX9-NEXT: s_movk_i32 s6, 0x400
321 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
322 ; GFX9-NEXT: s_and_b32 s3, s2, s3
323 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
324 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
325 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
326 ; GFX9-NEXT: BB5_1: ; %bb3
327 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
328 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
329 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
330 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
331 ; GFX9-NEXT: v_mov_b32_e32 v8, s5
332 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
333 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
334 ; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1
335 ; GFX9-NEXT: v_trunc_f32_e32 v8, v8
336 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8
337 ; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7
338 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0
339 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
340 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1]
341 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3
342 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4
343 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
344 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
345 ; GFX9-NEXT: global_store_short v[5:6], v2, off
346 ; GFX9-NEXT: s_cbranch_vccz BB5_1
347 ; GFX9-NEXT: ; %bb.2: ; %bb2
348 ; GFX9-NEXT: s_endpgm
; Loop body: %tmp is the i16 induction variable (starts at 0, +1 per iteration) and
; is used both as the dividend and, zero-extended, as the store index; the loop
; exits once the incremented value equals 1024.
355 bb3: ; preds = %bb3, %bb
356 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
357 %tmp4 = urem i16 %tmp, %arg1
358 %tmp5 = zext i16 %tmp to i64
359 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
360 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
361 %tmp7 = add nuw nsw i16 %tmp, 1
362 %tmp8 = icmp eq i16 %tmp7, 1024
363 br i1 %tmp8, label %bb2, label %bb3
; Test: signed 16-bit division by a divisor that does not change inside the loop.
; The loop stores (%tmp sdiv %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration divides by the same value.
; In the expected gfx900 output below, sign-extending the divisor (s_sext_i32_i16),
; converting it to float (v_cvt_f32_i32) and taking its reciprocal
; (v_rcp_iflag_f32) are listed before the loop label BB6_1; the loop multiplies by
; that reciprocal, truncates, and conditionally adds a correction term.
366 define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
367 ; GFX9-LABEL: sdiv16_invariant_denom:
368 ; GFX9: ; %bb.0: ; %bb
369 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
370 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
371 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
372 ; GFX9-NEXT: s_movk_i32 s3, 0x400
373 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
374 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
375 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
376 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
377 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
378 ; GFX9-NEXT: BB6_1: ; %bb3
379 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
380 ; GFX9-NEXT: v_bfe_i32 v5, v4, 0, 16
381 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4
382 ; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v5
383 ; GFX9-NEXT: v_xor_b32_e32 v8, s2, v5
384 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
385 ; GFX9-NEXT: v_mov_b32_e32 v7, s5
386 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
387 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
388 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v1
389 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7
390 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v8
391 ; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7
392 ; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v9
393 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
394 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
395 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v0|
396 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
397 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
398 ; GFX9-NEXT: v_add_u32_e32 v2, v8, v2
399 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
400 ; GFX9-NEXT: global_store_short v[5:6], v2, off
401 ; GFX9-NEXT: s_cbranch_vccz BB6_1
402 ; GFX9-NEXT: ; %bb.2: ; %bb2
403 ; GFX9-NEXT: s_endpgm
; Loop body: %tmp is the i16 induction variable (starts at 0, +1 per iteration) and
; is used both as the dividend and, zero-extended, as the store index; the loop
; exits once the incremented value equals 1024.
410 bb3: ; preds = %bb3, %bb
411 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
412 %tmp4 = sdiv i16 %tmp, %arg1
413 %tmp5 = zext i16 %tmp to i64
414 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
415 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
416 %tmp7 = add nuw nsw i16 %tmp, 1
417 %tmp8 = icmp eq i16 %tmp7, 1024
418 br i1 %tmp8, label %bb2, label %bb3
; Test: signed 16-bit remainder by a divisor that does not change inside the loop.
; The loop stores (%tmp srem %arg1) to %arg[%tmp] for %tmp = 0 .. 1023; %arg1 is a
; kernel argument, so every iteration uses the same divisor.
; In the expected gfx900 output below, sign-extending the divisor (s_sext_i32_i16),
; converting it to float (v_cvt_f32_i32) and taking its reciprocal
; (v_rcp_iflag_f32) are listed before the loop label BB7_1; the loop computes the
; quotient from that reciprocal and then derives the remainder with v_mul_lo_u32
; and v_sub_u32.
421 define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
422 ; GFX9-LABEL: srem16_invariant_denom:
423 ; GFX9: ; %bb.0: ; %bb
424 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
425 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
426 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
427 ; GFX9-NEXT: s_movk_i32 s3, 0x400
428 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
429 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
430 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
431 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
432 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
433 ; GFX9-NEXT: BB7_1: ; %bb3
434 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
435 ; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16
436 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4
437 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7
438 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
439 ; GFX9-NEXT: v_mov_b32_e32 v8, s5
440 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
441 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
442 ; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1
443 ; GFX9-NEXT: v_xor_b32_e32 v9, s2, v7
444 ; GFX9-NEXT: v_trunc_f32_e32 v8, v8
445 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9
446 ; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8
447 ; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10
448 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
449 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0|
450 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
451 ; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
452 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
453 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
454 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
455 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
456 ; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2
457 ; GFX9-NEXT: global_store_short v[5:6], v2, off
458 ; GFX9-NEXT: s_cbranch_vccz BB7_1
459 ; GFX9-NEXT: ; %bb.2: ; %bb2
460 ; GFX9-NEXT: s_endpgm
; Loop body: %tmp is the i16 induction variable (starts at 0, +1 per iteration) and
; is used both as the dividend and, zero-extended, as the store index; the loop
; exits once the incremented value equals 1024.
467 bb3: ; preds = %bb3, %bb
468 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
469 %tmp4 = srem i16 %tmp, %arg1
470 %tmp5 = zext i16 %tmp to i64
471 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
472 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
473 %tmp7 = add nuw nsw i16 %tmp, 1
474 %tmp8 = icmp eq i16 %tmp7, 1024
475 br i1 %tmp8, label %bb2, label %bb3