; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
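
; The two OpenCL-derived kernels below, kernel_round1 and its reduced variant
; kernel_round1_short, call the get_global_id, get_local_id, barrier, min, and
; atomic builtins from inside nested divergent loops; the CHECK lines cover the
; wave32 code generated for gfx1031.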

; ModuleID = 'kernel_round1_passing.bc'
source_filename = "/tmp/comgr-295d04/input/CompileSource"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
target triple = "amdgcn-amd-amdhsa"
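
; LDS (addrspace(3)) buffers shared within a workgroup and used by both kernels
; below.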
@kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1
@kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4
@kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4

; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0

; Function Attrs: convergent nounwind
declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1

; Function Attrs: convergent nounwind
declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1

; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0

; Function Attrs: convergent nounwind
declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1

; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0

; Function Attrs: convergent nounwind
declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1

; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0

; Function Attrs: convergent norecurse nounwind
define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
; CHECK-LABEL: kernel_round1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s10, s10, s15
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s11, s11, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v41, v0
; CHECK-NEXT: s_add_u32 s42, s34, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s35, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v45, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v43, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
; CHECK-NEXT: s_getpc_b64 s[52:53]
; CHECK-NEXT: s_add_u32 s52, s52, _Z7barrierj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s53, s53, _Z7barrierj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[52:53]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: global_load_dword v0, v0, s[48:49]
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
; CHECK-NEXT: v_mov_b32_e32 v1, 12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: s_mov_b32 s48, exec_lo
; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.1: ; %.preheader5
; CHECK-NEXT: v_mul_lo_u32 v0, v40, 14
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0
; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44
; CHECK-NEXT: s_add_i32 s5, s5, 1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42
; CHECK-NEXT: ds_write_b8 v1, v45
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_2
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42
; CHECK-NEXT: s_mov_b32 s49, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
; CHECK-NEXT: v_mov_b32_e32 v47, 0
; CHECK-NEXT: s_getpc_b64 s[42:43]
; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s55, 0
; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44
; CHECK-NEXT: s_lshl_b32 s4, s55, 5
; CHECK-NEXT: s_add_i32 s54, s55, 1
; CHECK-NEXT: s_add_i32 s5, s55, 5
; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: v_mov_b32_e32 v58, s54
; CHECK-NEXT: s_mov_b32 s56, exec_lo
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v56, 0xff, v0
; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_17
; CHECK-NEXT: ; %bb.6: ; %.preheader2
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_mov_b32 s57, 0
; CHECK-NEXT: s_mov_b32 s58, 0
; CHECK-NEXT: s_branch .LBB0_8
; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
; CHECK-NEXT: s_add_i32 s58, s58, 4
; CHECK-NEXT: s_add_i32 s4, s55, s58
; CHECK-NEXT: v_add_nc_u32_e32 v0, s58, v57
; CHECK-NEXT: s_add_i32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s4, s4, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
; CHECK-NEXT: v_mov_b32_e32 v58, s4
; CHECK-NEXT: s_or_b32 s57, vcc_lo, s57
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s57
; CHECK-NEXT: s_cbranch_execz .LBB0_16
; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v59, s58, v46
; CHECK-NEXT: v_add_nc_u32_e32 v58, s58, v57
; CHECK-NEXT: s_mov_b32 s59, exec_lo
; CHECK-NEXT: ds_read_u8 v0, v59
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
; CHECK-NEXT: s_mov_b32 s59, exec_lo
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0
; CHECK-NEXT: s_cbranch_execz .LBB0_12
; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
; CHECK-NEXT: s_mov_b32 s59, exec_lo
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0
; CHECK-NEXT: s_cbranch_execz .LBB0_14
; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
; CHECK-NEXT: s_mov_b32 s59, exec_lo
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0
; CHECK-NEXT: s_cbranch_execz .LBB0_7
; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_16: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
; CHECK-NEXT: v_mov_b32_e32 v57, v0
; CHECK-NEXT: .LBB0_17: ; %Flow44
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
; CHECK-NEXT: s_mov_b32 s55, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_23
; CHECK-NEXT: ; %bb.18: ; %.preheader
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_mov_b32 s56, 0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB0_20
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58
; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42
; CHECK-NEXT: s_or_b32 s56, vcc_lo, s56
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s56
; CHECK-NEXT: s_cbranch_execz .LBB0_22
; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
; CHECK-NEXT: s_mov_b32 s57, exec_lo
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0
; CHECK-NEXT: s_cbranch_execz .LBB0_19
; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: s_branch .LBB0_19
; CHECK-NEXT: .LBB0_22: ; %Flow41
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
; CHECK-NEXT: .LBB0_23: ; %Flow42
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
; CHECK-NEXT: s_mov_b32 s55, s54
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
; CHECK-NEXT: s_or_b32 s49, s4, s49
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
; CHECK-NEXT: .LBB0_25: ; %Flow49
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_swappc_b64 s[30:31], s[52:53]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v40
; CHECK-NEXT: s_cbranch_execz .LBB0_33
; CHECK-NEXT: ; %bb.26:
; CHECK-NEXT: s_add_u32 s52, s44, 8
; CHECK-NEXT: s_addc_u32 s53, s45, 0
; CHECK-NEXT: s_getpc_b64 s[42:43]
; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_addPU3AS1Vjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_addPU3AS1Vjj@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s54, 0
; CHECK-NEXT: s_getpc_b64 s[44:45]
; CHECK-NEXT: s_add_u32 s44, s44, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s45, s45, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
; CHECK-NEXT: s_getpc_b64 s[48:49]
; CHECK-NEXT: s_add_u32 s48, s48, _Z14get_local_sizej@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s49, s49, _Z14get_local_sizej@rel32@hi+12
; CHECK-NEXT: s_branch .LBB0_28
; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_swappc_b64 s[30:31], s[48:49]
; CHECK-NEXT: v_add_co_u32 v40, vcc_lo, v0, v40
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v40
; CHECK-NEXT: s_or_b32 s54, vcc_lo, s54
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s54
; CHECK-NEXT: s_cbranch_execz .LBB0_33
; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v40
; CHECK-NEXT: s_mov_b32 s55, exec_lo
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
; CHECK-NEXT: v_bfe_u32 v62, v0, 5, 5
; CHECK-NEXT: v_and_b32_e32 v72, 31, v0
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
; CHECK-NEXT: v_add_co_u32 v2, s4, s52, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s53, 0, s4
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
; CHECK-NEXT: v_xor_b32_e32 v57, v11, v7
; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6
; CHECK-NEXT: v_or_b32_e32 v5, v46, v57
; CHECK-NEXT: v_or_b32_e32 v4, v45, v56
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_27
; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: v_and_b32_e32 v3, 0xf000, v0
; CHECK-NEXT: v_and_b32_e32 v4, 0xf00, v1
; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0
; CHECK-NEXT: v_and_b32_e32 v1, 15, v1
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0
; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1
; CHECK-NEXT: v_and_b32_e32 v74, 28, v1
; CHECK-NEXT: v_add_co_u32 v42, s4, s50, v0
; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s51, 0, s4
; CHECK-NEXT: v_mov_b32_e32 v2, v44
; CHECK-NEXT: v_mov_b32_e32 v0, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: v_mov_b32_e32 v1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0
; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_31
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: v_xor_b32_e32 v5, v60, v58
; CHECK-NEXT: v_lshrrev_b64 v[3:4], 16, v[56:57]
; CHECK-NEXT: v_mul_u32_u24_e32 v11, 0x180, v73
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; CHECK-NEXT: v_lshrrev_b64 v[1:2], 16, v[45:46]
; CHECK-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72
; CHECK-NEXT: v_lshlrev_b32_e32 v10, 12, v63
; CHECK-NEXT: v_xor_b32_e32 v6, v61, v59
; CHECK-NEXT: v_lshlrev_b32_e32 v9, 16, v56
; CHECK-NEXT: v_or_b32_e32 v4, v7, v4
; CHECK-NEXT: v_add_co_u32 v7, s5, s46, v11
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s47, 0, s5
; CHECK-NEXT: v_or3_b32 v10, v8, v10, v62
; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, v0
; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v11, vcc_lo
; CHECK-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6]
; CHECK-NEXT: v_or_b32_e32 v2, v9, v2
; CHECK-NEXT: global_store_dword v[7:8], v10, off offset:4
; CHECK-NEXT: global_store_dwordx4 v[7:8], v[1:4], off offset:8
; CHECK-NEXT: global_store_dwordx2 v[7:8], v[5:6], off offset:24
; CHECK-NEXT: ; implicit-def: $vgpr42
; CHECK-NEXT: ; implicit-def: $vgpr43
; CHECK-NEXT: ; implicit-def: $vgpr44
; CHECK-NEXT: .LBB0_31: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_27
; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: v_mov_b32_e32 v0, v42
; CHECK-NEXT: v_mov_b32_e32 v1, v43
; CHECK-NEXT: v_mov_b32_e32 v2, v44
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
; CHECK-NEXT: s_branch .LBB0_27
; CHECK-NEXT: .LBB0_33:
; CHECK-NEXT: s_endpgm
%6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
%7 = trunc i64 %6 to i32
%8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
%9 = trunc i64 %8 to i32
%10 = mul i32 %9, 14
%11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10
store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
tail call void @_Z7barrierj(i32 noundef 1) #5
%12 = lshr i64 %6, 3
%13 = shl i32 %7, 2
%14 = and i32 %13, 28
%15 = and i64 %12, 536870911
%16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15
%17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11
%18 = lshr i32 %17, %14
%19 = and i32 %18, 15
%20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4
%21 = icmp eq i32 %20, 0
br i1 %21, label %119, label %27

%23 = add i32 %20, -1
%24 = icmp eq i32 %23, 0
br i1 %24, label %119, label %25

27: ; preds = %5, %27
%28 = phi i32 [ %30, %27 ], [ 0, %5 ]
%29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28
store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15
%30 = add nuw i32 %28, 1
%31 = icmp eq i32 %30, %20
br i1 %31, label %22, label %27

32: ; preds = %114, %48
%33 = phi i32 [ %50, %48 ], [ %115, %114 ]
%34 = icmp ult i32 %44, %23
%35 = icmp ult i32 %33, 60
%36 = select i1 %34, i1 %35, i1 false
br i1 %36, label %37, label %119

37: ; preds = %32, %25
%38 = phi i32 [ 0, %25 ], [ %44, %32 ]
%39 = phi i32 [ 0, %25 ], [ %33, %32 ]
%40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38
%41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15
%42 = shl i32 %38, 5
%43 = or i32 %42, %26
%44 = add nuw i32 %38, 1
%45 = or i32 %43, %44
%46 = add i32 %38, 5
%47 = icmp ult i32 %46, %20
br i1 %47, label %53, label %48

48: ; preds = %98, %37
%49 = phi i32 [ %45, %37 ], [ %100, %98 ]
%50 = phi i32 [ %39, %37 ], [ %99, %98 ]
%51 = phi i32 [ %44, %37 ], [ %54, %98 ]
%52 = icmp ult i32 %51, %20
br i1 %52, label %103, label %32

53: ; preds = %37, %98
%54 = phi i32 [ %101, %98 ], [ %46, %37 ]
%55 = phi i32 [ %54, %98 ], [ %44, %37 ]
%56 = phi i32 [ %99, %98 ], [ %39, %37 ]
%57 = phi i32 [ %100, %98 ], [ %45, %37 ]
%58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55
%59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15
%60 = icmp eq i8 %41, %59
br i1 %60, label %61, label %65

%63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63
store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11

65: ; preds = %61, %53
%66 = phi i32 [ %62, %61 ], [ %56, %53 ]
%68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67
%69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15
%70 = icmp eq i8 %41, %69
br i1 %70, label %71, label %76

%74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74
store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11

76: ; preds = %71, %65
%77 = phi i32 [ %73, %71 ], [ %66, %65 ]
%79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78
%80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15
%81 = icmp eq i8 %41, %80
br i1 %81, label %82, label %87

%85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85
store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11

87: ; preds = %82, %76
%88 = phi i32 [ %84, %82 ], [ %77, %76 ]
%90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89
%91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15
%92 = icmp eq i8 %41, %91
br i1 %92, label %93, label %98

%96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96
store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11

98: ; preds = %93, %87
%99 = phi i32 [ %95, %93 ], [ %88, %87 ]
%100 = add i32 %57, 4
%101 = add i32 %54, 4
%102 = icmp ult i32 %101, %20
br i1 %102, label %53, label %48

103: ; preds = %48, %114
%104 = phi i32 [ %117, %114 ], [ %51, %48 ]
%105 = phi i32 [ %115, %114 ], [ %50, %48 ]
%106 = phi i32 [ %116, %114 ], [ %49, %48 ]
%107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104
%108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15
%109 = icmp eq i8 %41, %108
br i1 %109, label %110, label %114

%111 = add i32 %105, 1
%112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112
store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11

114: ; preds = %110, %103
%115 = phi i32 [ %111, %110 ], [ %105, %103 ]
%116 = add i32 %106, 1
%117 = add nuw i32 %104, 1
%118 = icmp ult i32 %117, %20
br i1 %118, label %103, label %32

119: ; preds = %32, %22, %5
tail call void @_Z7barrierj(i32 noundef 1) #5
%120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
%121 = icmp ugt i32 %120, %9
br i1 %121, label %122, label %206

%123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8

124: ; preds = %201, %122
%125 = phi i32 [ %9, %122 ], [ %204, %201 ]
%126 = phi i64 [ %8, %122 ], [ %203, %201 ]
%127 = and i64 %126, 4294967295
%128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125
%129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11
%130 = lshr i32 %129, 10
%131 = lshr i32 %129, 5
%132 = and i32 %131, 31
%133 = and i32 %129, 31
%134 = mul nuw nsw i32 %130, 384
%135 = zext i32 %134 to i64
%136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135
%137 = shl nuw nsw i32 %132, 5
%138 = zext i32 %137 to i64
%139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138
%140 = shl nuw nsw i32 %133, 5
%141 = zext i32 %140 to i64
%142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141
%143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1
%144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16
%145 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 1
%146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16
%147 = xor i64 %146, %144
%148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16
%149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16
%150 = xor i64 %149, %148
%151 = icmp ne i64 %147, 0
%152 = icmp ne i64 %150, 0
%153 = select i1 %151, i1 true, i1 %152
br i1 %153, label %154, label %201

%155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2
%156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16
%157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2
%158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16
%159 = and i64 %147, 983040
%160 = shl i64 %147, 4
%161 = and i64 %160, 61440
%162 = or i64 %161, %159
%163 = lshr i64 %147, 12
%164 = and i64 %163, 3840
%165 = or i64 %162, %164
%166 = and i64 %160, 240
%167 = or i64 %165, %166
%168 = and i64 %163, 15
%169 = or i64 %167, %168
%170 = trunc i64 %169 to i32
%171 = lshr i64 %169, 3
%172 = shl nuw nsw i32 %170, 2
%173 = and i32 %172, 28
%174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171
%175 = shl nuw nsw i32 1, %173
%176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
%177 = lshr i32 %176, %173
%178 = and i32 %177, 15
%179 = icmp ugt i32 %178, 11
br i1 %179, label %180, label %182

%181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5

%183 = xor i64 %158, %156
%184 = lshr i64 %183, 16
%185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48)
%186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48)
%187 = shl nuw nsw i32 %133, 6
%188 = shl i32 %130, 12
%189 = or i32 %187, %188
%190 = or i32 %189, %132
%191 = mul nuw nsw i64 %169, 384
%192 = and i64 %191, 4294967168
%193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
%194 = shl nuw nsw i32 %178, 5
%195 = or i32 %194, 8
%196 = zext i32 %195 to i64
%197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
%198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11
store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16
%199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8
store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16
%200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16
store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16

201: ; preds = %182, %180, %124
%202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4
%203 = add i64 %202, %127
%204 = trunc i64 %203 to i32
%205 = icmp ugt i32 %120, %204
br i1 %205, label %124, label %206

206: ; preds = %201, %119
ret void
}

; kernel_round1_short is kernel_round1 with most of the if-else blocks removed.

define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
; CHECK-LABEL: kernel_round1_short:
; CHECK: ; %bb.0: ; %.5
; CHECK-NEXT: s_add_u32 s10, s10, s15
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s11, s11, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx2 s[46:47], s[6:7], 0x10
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_add_u32 s42, s36, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s37, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v43, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
; CHECK-NEXT: s_getpc_b64 s[44:45]
; CHECK-NEXT: s_add_u32 s44, s44, _Z7barrierj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s45, s45, _Z7barrierj@rel32@hi+12
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: global_load_dword v0, v0, s[46:47]
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
; CHECK-NEXT: v_mov_b32_e32 v1, 12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v41, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
; CHECK-NEXT: s_getpc_b64 s[42:43]
; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s46, 0
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
; CHECK-NEXT: .LBB1_1: ; %.37
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s47, s4, 1
; CHECK-NEXT: s_add_i32 s6, s4, 5
; CHECK-NEXT: v_or3_b32 v47, s5, v42, s47
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_read_u8 v46, v0
; CHECK-NEXT: v_mov_b32_e32 v56, s47
; CHECK-NEXT: s_mov_b32 s5, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
; CHECK-NEXT: ; %bb.2: ; %.53.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB1_3: ; %.53
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
; CHECK-NEXT: v_mov_b32_e32 v56, s8
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: v_mov_b32_e32 v47, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_mov_b32 s48, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_mov_b32 s49, 0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB1_8
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s50
; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
; CHECK-NEXT: s_or_b32 s49, vcc_lo, s49
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s50, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s36, 40
; CHECK-NEXT: s_addc_u32 s9, s37, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v47
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
; CHECK-NEXT: .LBB1_11: ; %Flow2
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
; CHECK-NEXT: ; %bb.12: ; %.32
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s47, v45
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
; CHECK-NEXT: s_or_b32 s46, s4, s46
; CHECK-NEXT: s_mov_b32 s4, s47
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s46
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
; CHECK-NEXT: ; %bb.13: ; %.119
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_add_u32 s8, s36, 40
; CHECK-NEXT: s_addc_u32 s9, s37, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
; CHECK-NEXT: s_endpgm
%.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
%.7 = trunc i64 %.6 to i32
%.8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
%.9 = trunc i64 %.8 to i32
%.10 = mul i32 %.9, 14
%.11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %.10
store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
tail call void @_Z7barrierj(i32 noundef 1) #5
%.12 = lshr i64 %.6, 3
%.13 = shl i32 %.7, 2
%.14 = and i32 %.13, 28
%.15 = and i64 %.12, 536870911
%.16 = getelementptr inbounds i32, ptr addrspace(1) %.2, i64 %.15
%.17 = load i32, ptr addrspace(1) %.16, align 4, !tbaa !11
%.18 = lshr i32 %.17, %.14
%.19 = and i32 %.18, 15
%.20 = tail call i32 @_Z3minjj(i32 noundef %.19, i32 noundef 12) #4
%.21 = icmp eq i32 %.20, 0
%.23 = add i32 %.20, -1
%.24 = icmp eq i32 %.23, 0
store i8 0, ptr addrspace(3) %.11, align 1, !tbaa !15

.32: ; preds = %.114, %.48
%.33 = phi i32 [ %.50, %.48 ], [ %.115, %.114 ]
%.34 = icmp ult i32 %.44, %.23
%.35 = icmp ult i32 %.33, 60
%.36 = select i1 %.34, i1 %.35, i1 false
br i1 %.36, label %.37, label %.119

.37: ; preds = %.32, %.25
%.38 = phi i32 [ 0, %.5 ], [ %.44, %.32 ]
%.39 = phi i32 [ 0, %.5 ], [ %.33, %.32 ]
%.26 = shl i32 %.7, 10
%.40 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.38
%.41 = load i8, ptr addrspace(3) %.40, align 1, !tbaa !15
%.42 = shl i32 %.38, 5
%.43 = or i32 %.42, %.26
%.44 = add nuw i32 %.38, 1
%.45 = or i32 %.43, %.44
%.46 = add i32 %.38, 5
%.47 = icmp ult i32 %.46, %.20
br i1 %.47, label %.53, label %.48

.48: ; preds = %.98, %.37
%.49 = phi i32 [ %.45, %.37 ], [ %.100, %.98 ]
%.50 = phi i32 [ %.39, %.37 ], [ %.99, %.98 ]
%.51 = phi i32 [ %.44, %.37 ], [ %.54, %.98 ]
%.52 = icmp ult i32 %.51, %.20
br i1 %.52, label %.103, label %.32

.53: ; preds = %.37, %.98
%.54 = phi i32 [ %.101, %.98 ], [ %.46, %.37 ]
%.55 = phi i32 [ %.54, %.98 ], [ %.44, %.37 ]
%.56 = phi i32 [ %.99, %.98 ], [ %.39, %.37 ]
%.57 = phi i32 [ %.100, %.98 ], [ %.45, %.37 ]
%.58 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.55
%.59 = load i8, ptr addrspace(3) %.58, align 1, !tbaa !15
%.60 = icmp eq i8 %.41, %.59

.98: ; preds = %.93, %.87
%.99 = add i32 %.56, 1
%.100 = add i32 %.57, 4
%.101 = add i32 %.54, 4
%.102 = icmp ult i32 %.101, %.20
br i1 %.102, label %.53, label %.48

.103: ; preds = %.48, %.114
%.104 = phi i32 [ %.117, %.114 ], [ %.51, %.48 ]
%.105 = phi i32 [ %.115, %.114 ], [ %.50, %.48 ]
%.106 = phi i32 [ %.116, %.114 ], [ %.49, %.48 ]
%.107 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.104
%.108 = load i8, ptr addrspace(3) %.107, align 1, !tbaa !15
%.109 = icmp eq i8 %.41, %.108
br i1 %.109, label %.110, label %.114

.110: ; preds = %.103
%.111 = add i32 %.105, 1
%.112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%.113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %.112
store i32 %.106, ptr addrspace(3) %.113, align 4, !tbaa !11

.114: ; preds = %.110, %.103
%.115 = phi i32 [ %.111, %.110 ], [ %.105, %.103 ]
%.116 = add i32 %.106, 1
%.117 = add nuw i32 %.104, 1
%.118 = icmp ult i32 %.117, %.20
br i1 %.118, label %.103, label %.32

.119: ; preds = %.32, %.22, %.5
tail call void @_Z7barrierj(i32 noundef 1) #5
%.120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
%.121 = icmp ugt i32 %.120, %.9

.206: ; preds = %.201, %.119
ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.fshl.i64(i64, i64, i64) #3

attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" }
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { convergent nounwind willreturn memory(none) }
attributes #5 = { convergent nounwind }

!llvm.module.flags = !{!0, !1, !2}
!opencl.ocl.version = !{!3}

!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 1, i32 2}
!4 = !{!"clang version 17.0.0 (ssh://chfang@git.amd.com:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"}
!5 = !{i32 1, i32 1, i32 1, i32 1, i32 1}
!6 = !{!"none", !"none", !"none", !"none", !"none"}
!7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"}
!8 = !{!"", !"", !"", !"", !""}
!9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"}
!10 = !{i32 64, i32 1, i32 1}
!11 = !{!12, !12, i64 0}
!12 = !{!"int", !13, i64 0}
!13 = !{!"omnipotent char", !14, i64 0}
!14 = !{!"Simple C/C++ TBAA"}
!15 = !{!13, !13, i64 0}
!16 = !{!17, !17, i64 0}
!17 = !{!"long", !13, i64 0}