1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
; Unsigned 32-bit division by a loop-invariant denominator.
; The loop counter runs from 0 up to 1024 and each iteration stores
; counter udiv %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the reciprocal setup
; (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_f32, v_cvt_u32_f32 and the
; v_mul_lo/v_mul_hi/add refinement) appears before the .LBB0_1 loop label;
; the loop body holds only the multiply and compare/select correction steps.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5 define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
6 ; GFX9-LABEL: udiv32_invariant_denom:
8 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
9 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
10 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
13 ; GFX9-NEXT: s_sub_i32 s5, 0, s4
14 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
15 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
16 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
17 ; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0
18 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
19 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
20 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
21 ; GFX9-NEXT: .LBB0_1: ; %bb3
22 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
23 ; GFX9-NEXT: v_mul_lo_u32 v2, s3, v0
24 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0
25 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
26 ; GFX9-NEXT: v_mul_lo_u32 v3, s5, v2
27 ; GFX9-NEXT: v_not_b32_e32 v5, v2
28 ; GFX9-NEXT: v_mul_lo_u32 v5, s4, v5
29 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
30 ; GFX9-NEXT: v_add_u32_e32 v3, s2, v3
31 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
32 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
33 ; GFX9-NEXT: v_add_u32_e32 v4, s2, v5
34 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
35 ; GFX9-NEXT: s_add_u32 s2, s2, 1
36 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
37 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
38 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
39 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
40 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
41 ; GFX9-NEXT: s_add_u32 s0, s0, 4
42 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
43 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
44 ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
45 ; GFX9-NEXT: ; %bb.2: ; %bb2
48 ; GFX10-LABEL: udiv32_invariant_denom:
49 ; GFX10: ; %bb.0: ; %bb
50 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
51 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
52 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
53 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
55 ; GFX10-NEXT: s_sub_i32 s5, 0, s4
56 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
57 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
58 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
59 ; GFX10-NEXT: v_mul_lo_u32 v1, s5, v0
60 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
61 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
62 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
63 ; GFX10-NEXT: .LBB0_1: ; %bb3
64 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
65 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
66 ; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0
67 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2
68 ; GFX10-NEXT: v_not_b32_e32 v3, v2
69 ; GFX10-NEXT: v_mul_lo_u32 v4, s5, v2
70 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v2
71 ; GFX10-NEXT: v_mul_lo_u32 v3, s4, v3
72 ; GFX10-NEXT: v_add_nc_u32_e32 v4, s2, v4
73 ; GFX10-NEXT: v_add_nc_u32_e32 v3, s2, v3
74 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4
75 ; GFX10-NEXT: s_add_u32 s2, s2, 1
76 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
77 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
78 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
79 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2
80 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v3
81 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
82 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
83 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
84 ; GFX10-NEXT: s_add_u32 s0, s0, 4
85 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
86 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
87 ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
88 ; GFX10-NEXT: ; %bb.2: ; %bb2
89 ; GFX10-NEXT: s_endpgm
96 bb3: ; preds = %bb3, %bb
; %tmp is the loop counter: 0 when entered from %bb, %tmp7 on the back edge.
97 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
98 %tmp4 = udiv i32 %tmp, %arg1
; Store the quotient to %arg[counter].
99 %tmp5 = zext i32 %tmp to i64
100 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
101 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
; Advance the counter and leave the loop once it reaches 1024.
102 %tmp7 = add nuw nsw i32 %tmp, 1
103 %tmp8 = icmp eq i32 %tmp7, 1024
104 br i1 %tmp8, label %bb2, label %bb3
; Unsigned 32-bit remainder by a loop-invariant denominator.
; The loop counter runs from 0 up to 1024 and each iteration stores
; counter urem %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the reciprocal setup
; (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_f32, v_cvt_u32_f32 and the
; v_mul_lo/v_mul_hi/add refinement) appears before the .LBB1_1 loop label;
; the loop body holds only the multiply, subtract and compare/select steps.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
107 define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
108 ; GFX9-LABEL: urem32_invariant_denom:
109 ; GFX9: ; %bb.0: ; %bb
110 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
111 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
112 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
113 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
114 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
115 ; GFX9-NEXT: s_sub_i32 s5, 0, s4
116 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
117 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
118 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
119 ; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0
120 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
121 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
122 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
123 ; GFX9-NEXT: .LBB1_1: ; %bb3
124 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
125 ; GFX9-NEXT: v_mul_lo_u32 v2, s3, v0
126 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0
127 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
128 ; GFX9-NEXT: v_mul_lo_u32 v3, s5, v2
129 ; GFX9-NEXT: v_not_b32_e32 v2, v2
130 ; GFX9-NEXT: v_mul_lo_u32 v2, s4, v2
131 ; GFX9-NEXT: v_add_u32_e32 v3, s2, v3
132 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
133 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v2
134 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
135 ; GFX9-NEXT: s_add_u32 s2, s2, 1
136 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v2
137 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2
138 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
139 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
140 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
141 ; GFX9-NEXT: s_add_u32 s0, s0, 4
142 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
143 ; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
144 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
145 ; GFX9-NEXT: ; %bb.2: ; %bb2
146 ; GFX9-NEXT: s_endpgm
148 ; GFX10-LABEL: urem32_invariant_denom:
149 ; GFX10: ; %bb.0: ; %bb
150 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
151 ; GFX10-NEXT: s_mov_b64 s[2:3], 0
152 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
153 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
155 ; GFX10-NEXT: s_sub_i32 s5, 0, s4
156 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
157 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
158 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
159 ; GFX10-NEXT: v_mul_lo_u32 v1, s5, v0
160 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
161 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
162 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
163 ; GFX10-NEXT: .LBB1_1: ; %bb3
164 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
165 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
166 ; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0
167 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2
168 ; GFX10-NEXT: v_not_b32_e32 v3, v2
169 ; GFX10-NEXT: v_mul_lo_u32 v2, s5, v2
170 ; GFX10-NEXT: v_mul_lo_u32 v3, s4, v3
171 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2
172 ; GFX10-NEXT: v_add_nc_u32_e32 v3, s2, v3
173 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2
174 ; GFX10-NEXT: s_add_u32 s2, s2, 1
175 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
176 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
177 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v2
178 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2
179 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
180 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
181 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
182 ; GFX10-NEXT: s_add_u32 s0, s0, 4
183 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
184 ; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
185 ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
186 ; GFX10-NEXT: ; %bb.2: ; %bb2
187 ; GFX10-NEXT: s_endpgm
194 bb3: ; preds = %bb3, %bb
; %tmp is the loop counter: 0 when entered from %bb, %tmp7 on the back edge.
195 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
196 %tmp4 = urem i32 %tmp, %arg1
; Store the remainder to %arg[counter].
197 %tmp5 = zext i32 %tmp to i64
198 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
199 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
; Advance the counter and leave the loop once it reaches 1024.
200 %tmp7 = add nuw nsw i32 %tmp, 1
201 %tmp8 = icmp eq i32 %tmp7, 1024
202 br i1 %tmp8, label %bb2, label %bb3
; Signed 32-bit division by a loop-invariant denominator.
; The loop counter runs from 0 up to 1024 and each iteration stores
; counter sdiv %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the denominator sign handling
; (s_ashr_i32 by 31, s_add_i32, s_xor_b32) and the reciprocal setup
; (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_f32, v_cvt_u32_f32 and the
; v_mul_lo/v_mul_hi/add refinement) appear before the .LBB2_1 loop label.
; The loop body keeps the multiply, the compare/select correction steps and
; the final v_xor/v_subrev with the saved sign register.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
205 define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
206 ; GFX9-LABEL: sdiv32_invariant_denom:
207 ; GFX9: ; %bb.0: ; %bb
208 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
209 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX9-NEXT: s_ashr_i32 s2, s3, 31
211 ; GFX9-NEXT: s_add_i32 s3, s3, s2
212 ; GFX9-NEXT: s_xor_b32 s3, s3, s2
213 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
214 ; GFX9-NEXT: s_sub_i32 s4, 0, s3
215 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
216 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
217 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
218 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
219 ; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0
220 ; GFX9-NEXT: s_mov_b32 s4, 0
221 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
222 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
223 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
224 ; GFX9-NEXT: .LBB2_1: ; %bb3
225 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
226 ; GFX9-NEXT: v_mul_hi_u32 v2, s4, v0
227 ; GFX9-NEXT: v_mul_lo_u32 v3, v2, s3
228 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
229 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
230 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
231 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
232 ; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
233 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
234 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
235 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
236 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
237 ; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2
238 ; GFX9-NEXT: s_add_i32 s4, s4, 1
239 ; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2
240 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
241 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
242 ; GFX9-NEXT: s_add_u32 s0, s0, 4
243 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
244 ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
245 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1
246 ; GFX9-NEXT: ; %bb.2: ; %bb2
247 ; GFX9-NEXT: s_endpgm
249 ; GFX10-LABEL: sdiv32_invariant_denom:
250 ; GFX10: ; %bb.0: ; %bb
251 ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c
252 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX10-NEXT: s_ashr_i32 s2, s3, 31
254 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
255 ; GFX10-NEXT: s_add_i32 s3, s3, s2
256 ; GFX10-NEXT: s_xor_b32 s3, s3, s2
257 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3
258 ; GFX10-NEXT: s_sub_i32 s4, 0, s3
259 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
260 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
261 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
262 ; GFX10-NEXT: v_mul_lo_u32 v1, s4, v0
263 ; GFX10-NEXT: s_mov_b32 s4, 0
264 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
265 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
266 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
267 ; GFX10-NEXT: .LBB2_1: ; %bb3
268 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
269 ; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0
270 ; GFX10-NEXT: v_mul_lo_u32 v3, v2, s3
271 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2
272 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3
273 ; GFX10-NEXT: s_add_i32 s4, s4, 1
274 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s3, v3
275 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
276 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
277 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
278 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2
279 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
280 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
281 ; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2
282 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2
283 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
285 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
286 ; GFX10-NEXT: s_add_u32 s0, s0, 4
287 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
288 ; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
289 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1
290 ; GFX10-NEXT: ; %bb.2: ; %bb2
291 ; GFX10-NEXT: s_endpgm
298 bb3: ; preds = %bb3, %bb
; %tmp is the loop counter: 0 when entered from %bb, %tmp7 on the back edge.
299 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
300 %tmp4 = sdiv i32 %tmp, %arg1
; Store the quotient to %arg[counter].
301 %tmp5 = zext i32 %tmp to i64
302 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
303 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
; Advance the counter and leave the loop once it reaches 1024.
304 %tmp7 = add nuw nsw i32 %tmp, 1
305 %tmp8 = icmp eq i32 %tmp7, 1024
306 br i1 %tmp8, label %bb2, label %bb3
; Signed 32-bit remainder by a loop-invariant denominator.
; The loop counter runs from 0 up to 1024 and each iteration stores
; counter srem %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the denominator sign handling
; (s_ashr_i32 by 31, s_add_i32, s_xor_b32) and the reciprocal setup
; (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_f32, v_cvt_u32_f32 and the
; v_mul_lo/v_mul_hi/add refinement) appear before the .LBB3_1 loop label.
; The loop body keeps the multiply, subtract and two compare/select steps.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
309 define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
310 ; GFX9-LABEL: srem32_invariant_denom:
311 ; GFX9: ; %bb.0: ; %bb
312 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
313 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
314 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
315 ; GFX9-NEXT: s_add_i32 s2, s2, s3
316 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
317 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
318 ; GFX9-NEXT: s_sub_i32 s3, 0, s2
319 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
320 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
321 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
322 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
323 ; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0
324 ; GFX9-NEXT: s_mov_b32 s3, 0
325 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
326 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
327 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
328 ; GFX9-NEXT: .LBB3_1: ; %bb3
329 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
330 ; GFX9-NEXT: v_mul_hi_u32 v2, s3, v0
331 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
332 ; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
333 ; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2
334 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
335 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
336 ; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2
337 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
338 ; GFX9-NEXT: s_add_i32 s3, s3, 1
339 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
340 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
341 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
342 ; GFX9-NEXT: s_add_u32 s0, s0, 4
343 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
344 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
345 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1
346 ; GFX9-NEXT: ; %bb.2: ; %bb2
347 ; GFX9-NEXT: s_endpgm
349 ; GFX10-LABEL: srem32_invariant_denom:
350 ; GFX10: ; %bb.0: ; %bb
351 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
352 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
353 ; GFX10-NEXT: s_ashr_i32 s3, s2, 31
354 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
355 ; GFX10-NEXT: s_add_i32 s2, s2, s3
356 ; GFX10-NEXT: s_xor_b32 s2, s2, s3
357 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
358 ; GFX10-NEXT: s_sub_i32 s3, 0, s2
359 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
360 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
361 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
362 ; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0
363 ; GFX10-NEXT: s_mov_b32 s3, 0
364 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
365 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
366 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
367 ; GFX10-NEXT: .LBB3_1: ; %bb3
368 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
369 ; GFX10-NEXT: v_mul_hi_u32 v2, s3, v0
370 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2
371 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2
372 ; GFX10-NEXT: s_add_i32 s3, s3, 1
373 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2
374 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
375 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
376 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2
377 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
378 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
379 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
380 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
381 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
382 ; GFX10-NEXT: s_add_u32 s0, s0, 4
383 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
384 ; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400
385 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1
386 ; GFX10-NEXT: ; %bb.2: ; %bb2
387 ; GFX10-NEXT: s_endpgm
394 bb3: ; preds = %bb3, %bb
; %tmp is the loop counter: 0 when entered from %bb, %tmp7 on the back edge.
395 %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
396 %tmp4 = srem i32 %tmp, %arg1
; Store the remainder to %arg[counter].
397 %tmp5 = zext i32 %tmp to i64
398 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
399 store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
; Advance the counter and leave the loop once it reaches 1024.
400 %tmp7 = add nuw nsw i32 %tmp, 1
401 %tmp8 = icmp eq i32 %tmp7, 1024
402 br i1 %tmp8, label %bb2, label %bb3
; Unsigned 16-bit division by a loop-invariant denominator.
; The i16 loop counter runs from 0 up to 1024 and each iteration stores
; counter udiv %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the denominator is masked to
; 16 bits (s_and_b32 with 0xffff), converted with v_cvt_f32_u32 and passed to
; v_rcp_iflag_f32 before the .LBB4_1 loop label. The loop body converts only
; the numerator and then does the float multiply, truncate, v_mad_f32 and
; compare/add-carry steps.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
405 define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
406 ; GFX9-LABEL: udiv16_invariant_denom:
407 ; GFX9: ; %bb.0: ; %bb
408 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
409 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
410 ; GFX9-NEXT: s_movk_i32 s4, 0x400
411 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
412 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
414 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
415 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
416 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2
417 ; GFX9-NEXT: .LBB4_1: ; %bb3
418 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
419 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
420 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
421 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
422 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
423 ; GFX9-NEXT: v_mov_b32_e32 v7, s3
424 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s2, v5
425 ; GFX9-NEXT: v_mul_f32_e32 v0, v8, v3
426 ; GFX9-NEXT: v_trunc_f32_e32 v0, v0
427 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
428 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0
429 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
430 ; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8
431 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4
432 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2
433 ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
434 ; GFX9-NEXT: global_store_short v[5:6], v0, off
435 ; GFX9-NEXT: s_cbranch_vccz .LBB4_1
436 ; GFX9-NEXT: ; %bb.2: ; %bb2
437 ; GFX9-NEXT: s_endpgm
439 ; GFX10-LABEL: udiv16_invariant_denom:
440 ; GFX10: ; %bb.0: ; %bb
441 ; GFX10-NEXT: s_clause 0x1
442 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
443 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
444 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
445 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
446 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
447 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s4
448 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0
449 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2
450 ; GFX10-NEXT: .LBB4_1: ; %bb3
451 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
452 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
453 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
454 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0
455 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
456 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
457 ; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3
458 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
459 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
460 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
461 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
462 ; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7
463 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
464 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v2
465 ; GFX10-NEXT: v_add_co_ci_u32_e64 v0, s0, 0, v0, s0
466 ; GFX10-NEXT: global_store_short v[5:6], v0, off
467 ; GFX10-NEXT: s_cbranch_vccz .LBB4_1
468 ; GFX10-NEXT: ; %bb.2: ; %bb2
469 ; GFX10-NEXT: s_endpgm
476 bb3: ; preds = %bb3, %bb
; %tmp is the i16 loop counter: 0 when entered from %bb, %tmp7 on the back edge.
477 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
478 %tmp4 = udiv i16 %tmp, %arg1
; Store the quotient to %arg[counter] (2-byte elements).
479 %tmp5 = zext i16 %tmp to i64
480 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
481 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
; Advance the counter and leave the loop once it reaches 1024.
482 %tmp7 = add nuw nsw i16 %tmp, 1
483 %tmp8 = icmp eq i16 %tmp7, 1024
484 br i1 %tmp8, label %bb2, label %bb3
; Unsigned 16-bit remainder by a loop-invariant denominator.
; The i16 loop counter runs from 0 up to 1024 and each iteration stores
; counter urem %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the denominator is masked to
; 16 bits (s_and_b32 with 0xffff), converted with v_cvt_f32_u32 and passed to
; v_rcp_iflag_f32 before the .LBB5_1 loop label. The loop body computes the
; quotient with float multiply/truncate/v_mad_f32/compare and then forms the
; remainder with v_mul_lo_u32 and a subtract.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
487 define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
488 ; GFX9-LABEL: urem16_invariant_denom:
489 ; GFX9: ; %bb.0: ; %bb
490 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
491 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
492 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
493 ; GFX9-NEXT: s_movk_i32 s7, 0x400
494 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
495 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
496 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s2
497 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
498 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2
499 ; GFX9-NEXT: .LBB5_1: ; %bb3
500 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
501 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
502 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
503 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
504 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
505 ; GFX9-NEXT: v_mov_b32_e32 v7, s5
506 ; GFX9-NEXT: v_mul_f32_e32 v9, v8, v3
507 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9
508 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9
509 ; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8
510 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2
511 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4
512 ; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
513 ; GFX9-NEXT: v_mul_lo_u32 v8, v8, s6
514 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
515 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
516 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8
517 ; GFX9-NEXT: global_store_short v[5:6], v0, off
518 ; GFX9-NEXT: s_cbranch_vccz .LBB5_1
519 ; GFX9-NEXT: ; %bb.2: ; %bb2
520 ; GFX9-NEXT: s_endpgm
522 ; GFX10-LABEL: urem16_invariant_denom:
523 ; GFX10: ; %bb.0: ; %bb
524 ; GFX10-NEXT: s_clause 0x1
525 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
526 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
527 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
528 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
529 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
530 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s4
531 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1
532 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2
533 ; GFX10-NEXT: .LBB5_1: ; %bb3
534 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
535 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
536 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
537 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0
538 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
539 ; GFX10-NEXT: v_mul_f32_e32 v8, v7, v3
540 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
541 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
542 ; GFX10-NEXT: v_trunc_f32_e32 v8, v8
543 ; GFX10-NEXT: v_mad_f32 v7, -v8, v2, v7
544 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8
545 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2
546 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
547 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
548 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1
549 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7
550 ; GFX10-NEXT: global_store_short v[5:6], v0, off
551 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1
552 ; GFX10-NEXT: ; %bb.2: ; %bb2
553 ; GFX10-NEXT: s_endpgm
560 bb3: ; preds = %bb3, %bb
; %tmp is the i16 loop counter: 0 when entered from %bb, %tmp7 on the back edge.
561 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
562 %tmp4 = urem i16 %tmp, %arg1
; Store the remainder to %arg[counter] (2-byte elements).
563 %tmp5 = zext i16 %tmp to i64
564 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
565 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
; Advance the counter and leave the loop once it reaches 1024.
566 %tmp7 = add nuw nsw i16 %tmp, 1
567 %tmp8 = icmp eq i16 %tmp7, 1024
568 br i1 %tmp8, label %bb2, label %bb3
; Signed 16-bit division by a loop-invariant denominator.
; The i16 loop counter runs from 0 up to 1024 and each iteration stores
; counter sdiv %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the denominator is sign-extended
; (s_sext_i32_i16), converted with v_cvt_f32_i32 and passed to
; v_rcp_iflag_f32 before the .LBB6_1 loop label. The loop body sign-extends
; and converts only the numerator (v_bfe_i32, v_cvt_f32_i32) and then does
; the float multiply, truncate, v_mad_f32 and compare/select correction.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
571 define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
572 ; GFX9-LABEL: sdiv16_invariant_denom:
573 ; GFX9: ; %bb.0: ; %bb
574 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
575 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
576 ; GFX9-NEXT: s_movk_i32 s5, 0x400
577 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
578 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX9-NEXT: s_sext_i32_i16 s4, s2
580 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4
581 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
582 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2
583 ; GFX9-NEXT: .LBB6_1: ; %bb3
584 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
585 ; GFX9-NEXT: v_bfe_i32 v5, v4, 0, 16
586 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
587 ; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v5
588 ; GFX9-NEXT: v_xor_b32_e32 v8, s4, v5
589 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
590 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
591 ; GFX9-NEXT: v_mov_b32_e32 v7, s3
592 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s2, v5
593 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
594 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v3
595 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7
596 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v8
597 ; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7
598 ; GFX9-NEXT: v_mad_f32 v7, -v7, v2, v9
599 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
600 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
601 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v2|
602 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
603 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1]
604 ; GFX9-NEXT: v_add_u32_e32 v0, v8, v0
605 ; GFX9-NEXT: global_store_short v[5:6], v0, off
606 ; GFX9-NEXT: s_cbranch_vccz .LBB6_1
607 ; GFX9-NEXT: ; %bb.2: ; %bb2
608 ; GFX9-NEXT: s_endpgm
610 ; GFX10-LABEL: sdiv16_invariant_denom:
611 ; GFX10: ; %bb.0: ; %bb
612 ; GFX10-NEXT: s_clause 0x1
613 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
614 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
615 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
616 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
617 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
618 ; GFX10-NEXT: s_sext_i32_i16 s4, s4
619 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s4
620 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2
621 ; GFX10-NEXT: .LBB6_1: ; %bb3
622 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
623 ; GFX10-NEXT: v_bfe_i32 v5, v4, 0, 16
624 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
625 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
626 ; GFX10-NEXT: v_cvt_f32_i32_e32 v7, v5
627 ; GFX10-NEXT: v_xor_b32_e32 v8, s4, v5
628 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
629 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
630 ; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3
631 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 30, v8
632 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
633 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0
634 ; GFX10-NEXT: v_or_b32_e32 v8, 1, v8
635 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
636 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
637 ; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7
638 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
639 ; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v2|
640 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1
641 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v7
642 ; GFX10-NEXT: global_store_short v[5:6], v0, off
643 ; GFX10-NEXT: s_cbranch_vccz .LBB6_1
644 ; GFX10-NEXT: ; %bb.2: ; %bb2
645 ; GFX10-NEXT: s_endpgm
652 bb3: ; preds = %bb3, %bb
; %tmp is the i16 loop counter: 0 when entered from %bb, %tmp7 on the back edge.
653 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
654 %tmp4 = sdiv i16 %tmp, %arg1
; Store the quotient to %arg[counter] (2-byte elements).
655 %tmp5 = zext i16 %tmp to i64
656 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
657 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
; Advance the counter and leave the loop once it reaches 1024.
658 %tmp7 = add nuw nsw i16 %tmp, 1
659 %tmp8 = icmp eq i16 %tmp7, 1024
660 br i1 %tmp8, label %bb2, label %bb3
; Signed 16-bit remainder by a loop-invariant denominator.
; The i16 loop counter runs from 0 up to 1024 and each iteration stores
; counter srem %arg1 to %arg[counter]; %arg1 is a kernel argument that is
; never modified inside the loop.
; In the expected output for both targets the denominator is sign-extended
; (s_sext_i32_i16), converted with v_cvt_f32_i32 and passed to
; v_rcp_iflag_f32 before the .LBB7_1 loop label. The loop body computes the
; quotient with float multiply/truncate/v_mad_f32/compare-select and then
; forms the remainder with v_mul_lo_u32 and a subtract.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
663 define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
664 ; GFX9-LABEL: srem16_invariant_denom:
665 ; GFX9: ; %bb.0: ; %bb
666 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
667 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
668 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
669 ; GFX9-NEXT: s_movk_i32 s7, 0x400
670 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
671 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
672 ; GFX9-NEXT: s_sext_i32_i16 s6, s2
673 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6
674 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2
675 ; GFX9-NEXT: .LBB7_1: ; %bb3
676 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
677 ; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16
678 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7
679 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
680 ; GFX9-NEXT: v_xor_b32_e32 v9, s6, v7
681 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
682 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v9
683 ; GFX9-NEXT: v_mul_f32_e32 v9, v10, v3
684 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9
685 ; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9
686 ; GFX9-NEXT: v_mad_f32 v9, -v9, v2, v10
687 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
688 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v2|
689 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[2:3]
690 ; GFX9-NEXT: v_add_u32_e32 v0, v11, v0
691 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6
692 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
693 ; GFX9-NEXT: v_mov_b32_e32 v8, s5
694 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4
695 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
696 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
697 ; GFX9-NEXT: v_sub_u32_e32 v0, v7, v0
698 ; GFX9-NEXT: global_store_short v[5:6], v0, off
699 ; GFX9-NEXT: s_cbranch_vccz .LBB7_1
700 ; GFX9-NEXT: ; %bb.2: ; %bb2
701 ; GFX9-NEXT: s_endpgm
703 ; GFX10-LABEL: srem16_invariant_denom:
704 ; GFX10: ; %bb.0: ; %bb
705 ; GFX10-NEXT: s_clause 0x1
706 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
707 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
708 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
709 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
710 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
711 ; GFX10-NEXT: s_sext_i32_i16 s1, s4
712 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s1
713 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2
714 ; GFX10-NEXT: .LBB7_1: ; %bb3
715 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
716 ; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16
717 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
718 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
719 ; GFX10-NEXT: v_cvt_f32_i32_e32 v5, v7
720 ; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7
721 ; GFX10-NEXT: v_mul_f32_e32 v8, v5, v3
722 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 30, v6
723 ; GFX10-NEXT: v_trunc_f32_e32 v8, v8
724 ; GFX10-NEXT: v_or_b32_e32 v6, 1, v6
725 ; GFX10-NEXT: v_mad_f32 v5, -v8, v2, v5
726 ; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v8
727 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v2|
728 ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo
729 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
730 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
731 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v9
732 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
733 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
734 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, s1
735 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v7, v0
736 ; GFX10-NEXT: global_store_short v[5:6], v0, off
737 ; GFX10-NEXT: s_cbranch_vccz .LBB7_1
738 ; GFX10-NEXT: ; %bb.2: ; %bb2
739 ; GFX10-NEXT: s_endpgm
746 bb3: ; preds = %bb3, %bb
; %tmp is the i16 loop counter: 0 when entered from %bb, %tmp7 on the back edge.
747 %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
; Operation under test: the numerator changes every iteration, %arg1 does not.
748 %tmp4 = srem i16 %tmp, %arg1
; Store the remainder to %arg[counter] (2-byte elements).
749 %tmp5 = zext i16 %tmp to i64
750 %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
751 store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
; Advance the counter and leave the loop once it reaches 1024.
752 %tmp7 = add nuw nsw i16 %tmp, 1
753 %tmp8 = icmp eq i16 %tmp7, 1024
754 br i1 %tmp8, label %bb2, label %bb3