; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
; udiv i32 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp udiv %arg1 to element %tmp of %arg.  In the expected output for both
; targets the reciprocal of the divisor (v_cvt_f32_u32, v_rcp_iflag_f32,
; v_mul_f32, v_cvt_u32_f32 and the mul_lo/mul_hi/add that follow) is emitted
; in %bb.0, ahead of the BB0_1 loop label.
; NOTE(review): the gfx900 checks for the %bb.0 header and the final s_endpgm
; were restored to match every other check group in this file; regenerate with
; utils/update_llc_test_checks.py to confirm.
define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
; GFX9-LABEL: udiv32_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x2c
; GFX9-NEXT:    s_mov_b64 s[2:3], 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GFX9-NEXT:    s_sub_i32 s4, 0, s5
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:  BB0_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_mul_lo_u32 v2, s3, v0
; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
; GFX9-NEXT:    v_mul_lo_u32 v3, s4, v2
; GFX9-NEXT:    v_not_b32_e32 v5, v2
; GFX9-NEXT:    v_mul_lo_u32 v5, s5, v5
; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
; GFX9-NEXT:    v_add_u32_e32 v3, s2, v3
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT:    v_add_u32_e32 v4, s2, v5
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT:    s_add_u32 s2, s2, 1
; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT:    s_addc_u32 s3, s3, 0
; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, 4
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_cmpk_eq_i32 s2, 0x400
; GFX9-NEXT:    s_cbranch_scc0 BB0_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: udiv32_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_mov_b64 s[2:3], 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX10-NEXT:    s_sub_i32 s5, 0, s4
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_lo_u32 v1, s5, v0
; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:  BB0_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
; GFX10-NEXT:    v_mul_lo_u32 v4, s5, v2
; GFX10-NEXT:    v_not_b32_e32 v3, v2
; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v2
; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v3
; GFX10-NEXT:    v_add_nc_u32_e32 v4, s2, v4
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v4
; GFX10-NEXT:    v_add_nc_u32_e32 v3, s2, v3
; GFX10-NEXT:    s_add_u32 s2, s2, 1
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v3
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_add_u32 s0, s0, 4
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_cmpk_eq_i32 s2, 0x400
; GFX10-NEXT:    s_cbranch_scc0 BB0_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = udiv i32 %tmp, %arg1
  %tmp5 = zext i32 %tmp to i64
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
  %tmp7 = add nuw nsw i32 %tmp, 1
  %tmp8 = icmp eq i32 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}
; urem i32 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp urem %arg1 to element %tmp of %arg.  In the expected output for both
; targets the reciprocal of the divisor (v_cvt_f32_u32, v_rcp_iflag_f32,
; v_mul_f32, v_cvt_u32_f32 and the mul_lo/mul_hi/add that follow) is emitted
; in %bb.0, ahead of the BB1_1 loop label.
define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
; GFX9-LABEL: urem32_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_mov_b64 s[2:3], 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT:    s_sub_i32 s5, 0, s4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:  BB1_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_mul_lo_u32 v2, s3, v0
; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v2
; GFX9-NEXT:    v_not_b32_e32 v2, v2
; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v2
; GFX9-NEXT:    v_add_u32_e32 v4, s2, v3
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v4
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT:    v_add_u32_e32 v2, s2, v2
; GFX9-NEXT:    s_add_u32 s2, s2, 1
; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v2
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT:    s_addc_u32 s3, s3, 0
; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, 4
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_cmpk_eq_i32 s2, 0x400
; GFX9-NEXT:    s_cbranch_scc0 BB1_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: urem32_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_mov_b64 s[2:3], 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX10-NEXT:    s_sub_i32 s5, 0, s4
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_lo_u32 v1, s5, v0
; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:  BB1_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v2
; GFX10-NEXT:    v_not_b32_e32 v2, v2
; GFX10-NEXT:    v_mul_lo_u32 v2, s4, v2
; GFX10-NEXT:    v_add_nc_u32_e32 v4, s2, v3
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v4
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v2, s2, v2
; GFX10-NEXT:    s_add_u32 s2, s2, 1
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s4, v2
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_add_u32 s0, s0, 4
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_cmpk_eq_i32 s2, 0x400
; GFX10-NEXT:    s_cbranch_scc0 BB1_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = urem i32 %tmp, %arg1
  %tmp5 = zext i32 %tmp to i64
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
  %tmp7 = add nuw nsw i32 %tmp, 1
  %tmp8 = icmp eq i32 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}
; sdiv i32 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp sdiv %arg1 to element %tmp of %arg.  In the expected output for both
; targets the divisor's sign mask (s_ashr_i32 by 31), the add/xor with that
; mask, and the reciprocal sequence (v_cvt_f32_u32, v_rcp_iflag_f32,
; v_mul_f32, v_cvt_u32_f32, mul_lo/mul_hi/add) are emitted in %bb.0, ahead of
; the BB2_1 loop label; inside the loop the quotient is xor'ed with and then
; reduced by the sign mask before the store.
; NOTE(review): the s_nop 0 between the two scalar loads in the gfx900 checks
; was reconstructed to mirror the gfx1010 sequence; regenerate with
; utils/update_llc_test_checks.py to confirm.
define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
; GFX9-LABEL: sdiv32_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s2, s3, 31
; GFX9-NEXT:    s_add_i32 s3, s3, s2
; GFX9-NEXT:    s_xor_b32 s4, s3, s2
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT:    s_sub_i32 s3, 0, s4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
; GFX9-NEXT:    s_mov_b32 s3, 0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:  BB2_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_mul_hi_u32 v2, s3, v0
; GFX9-NEXT:    v_mul_lo_u32 v3, v2, s4
; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
; GFX9-NEXT:    v_sub_u32_e32 v3, s3, v3
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v4, s4, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
; GFX9-NEXT:    s_add_i32 s3, s3, 1
; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, 4
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x400
; GFX9-NEXT:    s_cbranch_scc0 BB2_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sdiv32_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x2c
; GFX10-NEXT:    s_nop 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ashr_i32 s2, s3, 31
; GFX10-NEXT:    s_add_i32 s3, s3, s2
; GFX10-NEXT:    s_xor_b32 s3, s3, s2
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX10-NEXT:    s_sub_i32 s4, 0, s3
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:  BB2_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mul_hi_u32 v2, s4, v0
; GFX10-NEXT:    v_mul_lo_u32 v3, v2, s3
; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s4, v3
; GFX10-NEXT:    s_add_i32 s4, s4, 1
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s3, v3
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v2
; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s2, v2
; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_add_u32 s0, s0, 4
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_cmpk_eq_i32 s4, 0x400
; GFX10-NEXT:    s_cbranch_scc0 BB2_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = sdiv i32 %tmp, %arg1
  %tmp5 = zext i32 %tmp to i64
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
  %tmp7 = add nuw nsw i32 %tmp, 1
  %tmp8 = icmp eq i32 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}
; srem i32 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp srem %arg1 to element %tmp of %arg.  In the expected output for both
; targets the divisor's sign mask (s_ashr_i32 by 31), the add/xor with that
; mask, and the reciprocal sequence (v_cvt_f32_u32, v_rcp_iflag_f32,
; v_mul_f32, v_cvt_u32_f32, mul_lo/mul_hi/add) are emitted in %bb.0, ahead of
; the BB3_1 loop label.
; NOTE(review): the s_nop 0 between the two scalar loads in the gfx900 checks
; was reconstructed to mirror the gfx1010 sequence; regenerate with
; utils/update_llc_test_checks.py to confirm.
define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
; GFX9-LABEL: srem32_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
; GFX9-NEXT:    s_add_i32 s2, s2, s3
; GFX9-NEXT:    s_xor_b32 s2, s2, s3
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT:    s_sub_i32 s3, 0, s2
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
; GFX9-NEXT:    s_mov_b32 s3, 0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:  BB3_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_mul_hi_u32 v2, s3, v0
; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT:    s_add_i32 s3, s3, 1
; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX9-NEXT:    s_add_u32 s0, s0, 4
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x400
; GFX9-NEXT:    s_cbranch_scc0 BB3_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: srem32_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT:    s_nop 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
; GFX10-NEXT:    s_add_i32 s2, s2, s3
; GFX10-NEXT:    s_xor_b32 s2, s2, s3
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX10-NEXT:    s_sub_i32 s3, 0, s2
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
; GFX10-NEXT:    s_mov_b32 s3, 0
; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:  BB3_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_mul_hi_u32 v2, s3, v0
; GFX10-NEXT:    v_mul_lo_u32 v2, v2, s2
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v2
; GFX10-NEXT:    s_add_i32 s3, s3, 1
; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s2, v2
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s2, v2
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_add_u32 s0, s0, 4
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_cmpk_eq_i32 s3, 0x400
; GFX10-NEXT:    s_cbranch_scc0 BB3_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = srem i32 %tmp, %arg1
  %tmp5 = zext i32 %tmp to i64
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
  %tmp7 = add nuw nsw i32 %tmp, 1
  %tmp8 = icmp eq i32 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}
; udiv i16 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp udiv %arg1 to element %tmp of %arg.  In the expected output for both
; targets the divisor is masked with 0xffff, converted with v_cvt_f32_u32 and
; its reciprocal taken with v_rcp_iflag_f32 ahead of the BB4_1 loop label; the
; loop body multiplies by that reciprocal (v_mul_f32, v_trunc_f32, v_mad_f32).
define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
; GFX9-LABEL: udiv16_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_mov_b32 s4, 0xffff
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_movk_i32 s5, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s4, s2
; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX9-NEXT:  BB4_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_and_b32_e32 v0, s4, v4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v7, s3
; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s2, v5
; GFX9-NEXT:    v_mul_f32_e32 v0, v8, v3
; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v0
; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
; GFX9-NEXT:    v_mad_f32 v0, -v0, v2, v8
; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, v2
; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
; GFX9-NEXT:    global_store_short v[5:6], v0, off
; GFX9-NEXT:    s_cbranch_vccz BB4_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: udiv16_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s1, 0xffff
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s0, s1, s4
; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s0
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX10-NEXT:  BB4_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_and_b32_e32 v0, s1, v4
; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v0
; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT:    v_mul_f32_e32 v0, v7, v3
; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
; GFX10-NEXT:    v_mad_f32 v7, -v0, v2, v7
; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v7|, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v0, s0, 0, v0, s0
; GFX10-NEXT:    global_store_short v[5:6], v0, off
; GFX10-NEXT:    s_cbranch_vccz BB4_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = udiv i16 %tmp, %arg1
  %tmp5 = zext i16 %tmp to i64
  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
  %tmp7 = add nuw nsw i16 %tmp, 1
  %tmp8 = icmp eq i16 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}
; urem i16 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp urem %arg1 to element %tmp of %arg.  In the expected output for both
; targets the divisor is masked with 0xffff, converted with v_cvt_f32_u32 and
; its reciprocal taken with v_rcp_iflag_f32 ahead of the BB5_1 loop label; the
; loop body forms the quotient from that reciprocal and then subtracts
; quotient * divisor (v_mul_lo_u32 followed by a subtract) from the counter.
define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
; GFX9-LABEL: urem16_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_mov_b32 s6, 0xffff
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_movk_i32 s8, 0x400
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s7, s6, s2
; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s7
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX9-NEXT:  BB5_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_and_b32_e32 v0, s6, v4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s8, v4
; GFX9-NEXT:    v_mul_f32_e32 v9, v8, v3
; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v9
; GFX9-NEXT:    v_mad_f32 v8, -v9, v2, v8
; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v8|, v2
; GFX9-NEXT:    v_mov_b32_e32 v7, s5
; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
; GFX9-NEXT:    v_mul_lo_u32 v8, v8, s7
; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v8
; GFX9-NEXT:    global_store_short v[5:6], v0, off
; GFX9-NEXT:    s_cbranch_vccz BB5_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: urem16_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s1, 0xffff
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s4, s1, s4
; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s4
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX10-NEXT:  BB5_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_and_b32_e32 v0, s1, v4
; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v0
; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX10-NEXT:    v_mul_f32_e32 v8, v7, v3
; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT:    v_trunc_f32_e32 v8, v8
; GFX10-NEXT:    v_mad_f32 v7, -v8, v2, v7
; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v8
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v7|, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT:    v_mul_lo_u32 v7, v7, s4
; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
; GFX10-NEXT:    global_store_short v[5:6], v0, off
; GFX10-NEXT:    s_cbranch_vccz BB5_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = urem i16 %tmp, %arg1
  %tmp5 = zext i16 %tmp to i64
  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
  %tmp7 = add nuw nsw i16 %tmp, 1
  %tmp8 = icmp eq i16 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}
; sdiv i16 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp sdiv %arg1 to element %tmp of %arg.  In the expected output for both
; targets the divisor is sign-extended (s_sext_i32_i16), converted with
; v_cvt_f32_i32 and its reciprocal taken with v_rcp_iflag_f32 ahead of the
; BB6_1 loop label; the loop body multiplies by that reciprocal (v_mul_f32,
; v_trunc_f32, v_mad_f32) and adds a conditional correction term.
define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
; GFX9-LABEL: sdiv16_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_movk_i32 s5, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_sext_i32_i16 s4, s2
; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s4
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX9-NEXT:  BB6_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_bfe_i32 v5, v4, 0, 16
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v5
; GFX9-NEXT:    v_xor_b32_e32 v8, s4, v5
; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v7, s3
; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s2, v5
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v3
; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v8
; GFX9-NEXT:    v_cvt_i32_f32_e32 v8, v7
; GFX9-NEXT:    v_mad_f32 v7, -v7, v2, v9
; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v7|, |v2|
; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[0:1]
; GFX9-NEXT:    v_add_u32_e32 v0, v8, v0
; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
; GFX9-NEXT:    global_store_short v[5:6], v0, off
; GFX9-NEXT:    s_cbranch_vccz BB6_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sdiv16_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_sext_i32_i16 s4, s4
; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, s4
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX10-NEXT:  BB6_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_bfe_i32 v5, v4, 0, 16
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
; GFX10-NEXT:    v_cvt_f32_i32_e32 v7, v5
; GFX10-NEXT:    v_xor_b32_e32 v8, s4, v5
; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT:    v_mul_f32_e32 v0, v7, v3
; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 30, v8
; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
; GFX10-NEXT:    v_or_b32_e32 v8, 1, v8
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT:    v_mad_f32 v7, -v0, v2, v7
; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT:    v_cmp_ge_f32_e64 s1, |v7|, |v2|
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, v8, s1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v7
; GFX10-NEXT:    global_store_short v[5:6], v0, off
; GFX10-NEXT:    s_cbranch_vccz BB6_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = sdiv i16 %tmp, %arg1
  %tmp5 = zext i16 %tmp to i64
  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
  %tmp7 = add nuw nsw i16 %tmp, 1
  %tmp8 = icmp eq i16 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}
; srem i16 with a loop-invariant divisor.
; %bb3 runs %tmp from 0 up to (but not including) 1024 and stores
; %tmp srem %arg1 to element %tmp of %arg.  In the expected output for both
; targets the divisor is sign-extended (s_sext_i32_i16), converted with
; v_cvt_f32_i32 and its reciprocal taken with v_rcp_iflag_f32 ahead of the
; BB7_1 loop label; the loop body forms the quotient from that reciprocal and
; then subtracts quotient * divisor (v_mul_lo_u32 followed by a subtract)
; from the sign-extended counter.
define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
; GFX9-LABEL: srem16_invariant_denom:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_movk_i32 s7, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_sext_i32_i16 s6, s2
; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s6
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX9-NEXT:  BB7_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    v_bfe_i32 v7, v4, 0, 16
; GFX9-NEXT:    v_cvt_f32_i32_e32 v10, v7
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
; GFX9-NEXT:    v_xor_b32_e32 v9, s6, v7
; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v9
; GFX9-NEXT:    v_mul_f32_e32 v9, v10, v3
; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
; GFX9-NEXT:    v_cvt_i32_f32_e32 v11, v9
; GFX9-NEXT:    v_mad_f32 v9, -v9, v2, v10
; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v9|, |v2|
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[2:3]
; GFX9-NEXT:    v_add_u32_e32 v0, v11, v0
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v4
; GFX9-NEXT:    v_mov_b32_e32 v8, s5
; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
; GFX9-NEXT:    v_sub_u32_e32 v0, v7, v0
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
; GFX9-NEXT:    global_store_short v[5:6], v0, off
; GFX9-NEXT:    s_cbranch_vccz BB7_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: srem16_invariant_denom:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_sext_i32_i16 s1, s4
; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, s1
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
; GFX10-NEXT:  BB7_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    v_bfe_i32 v7, v4, 0, 16
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
; GFX10-NEXT:    v_cvt_f32_i32_e32 v5, v7
; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v7
; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v3
; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
; GFX10-NEXT:    v_trunc_f32_e32 v8, v8
; GFX10-NEXT:    v_or_b32_e32 v6, 1, v6
; GFX10-NEXT:    v_mad_f32 v5, -v8, v2, v5
; GFX10-NEXT:    v_cvt_i32_f32_e32 v8, v8
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v2|
; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc_lo
; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v9
; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT:    v_mul_lo_u32 v0, v0, s1
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v7, v0
; GFX10-NEXT:    global_store_short v[5:6], v0, off
; GFX10-NEXT:    s_cbranch_vccz BB7_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
bb:
  ; Entry block: fall straight into the loop.
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
  %tmp4 = srem i16 %tmp, %arg1
  %tmp5 = zext i16 %tmp to i64
  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
  %tmp7 = add nuw nsw i16 %tmp, 1
  %tmp8 = icmp eq i16 %tmp7, 1024
  br i1 %tmp8, label %bb2, label %bb3
}