1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
4 ; ModuleID = 'kernel_round1_passing.bc'
5 source_filename = "/tmp/comgr-295d04/input/CompileSource"
6 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
7 target triple = "amdgcn-amd-amdhsa"
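; The addrspace(3) globals below are LDS buffers shared by both kernels: a per-work-item byte scratch area, a list of collision entries, and the counter used to append to that list.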
9 @kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1
10 @kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4
11 @kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4
13 ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
14 declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0
16 ; Function Attrs: convergent nounwind
17 declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
19 ; Function Attrs: convergent nounwind
20 declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
22 ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
23 declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0
25 ; Function Attrs: convergent nounwind
26 declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1
28 ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
29 declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0
31 ; Function Attrs: convergent nounwind
32 declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1
34 ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
35 declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0
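; The work-item query and atomic builtins above are only declared, so each call to them in the kernels below is emitted as an s_getpc_b64/s_swappc_b64 call sequence in the checked assembly.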
37 ; Function Attrs: convergent norecurse nounwind
38 define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
39 ; CHECK-LABEL: kernel_round1:
41 ; CHECK-NEXT: s_add_u32 s12, s12, s17
42 ; CHECK-NEXT: s_mov_b32 s32, 0
43 ; CHECK-NEXT: s_addc_u32 s13, s13, 0
44 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
45 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
46 ; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0
47 ; CHECK-NEXT: s_add_u32 s0, s0, s17
48 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
49 ; CHECK-NEXT: s_addc_u32 s1, s1, 0
50 ; CHECK-NEXT: v_mov_b32_e32 v40, v0
51 ; CHECK-NEXT: s_add_u32 s44, s34, 40
52 ; CHECK-NEXT: v_mov_b32_e32 v31, v0
53 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
54 ; CHECK-NEXT: s_mov_b32 s33, s16
55 ; CHECK-NEXT: s_addc_u32 s45, s35, 0
56 ; CHECK-NEXT: s_mov_b32 s43, s14
57 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
58 ; CHECK-NEXT: s_mov_b32 s12, s14
59 ; CHECK-NEXT: s_mov_b32 s13, s15
60 ; CHECK-NEXT: s_mov_b32 s14, s33
61 ; CHECK-NEXT: s_mov_b32 s42, s15
62 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
63 ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
64 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
65 ; CHECK-NEXT: s_getpc_b64 s[16:17]
66 ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4
67 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12
68 ; CHECK-NEXT: v_mov_b32_e32 v45, 0
69 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
70 ; CHECK-NEXT: v_mov_b32_e32 v43, v0
71 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
72 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
73 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
74 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
75 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
76 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
77 ; CHECK-NEXT: s_mov_b32 s12, s43
78 ; CHECK-NEXT: s_mov_b32 s13, s42
79 ; CHECK-NEXT: s_mov_b32 s14, s33
80 ; CHECK-NEXT: s_getpc_b64 s[16:17]
81 ; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4
82 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12
83 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
84 ; CHECK-NEXT: v_mov_b32_e32 v41, v0
85 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
86 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
87 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
88 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
89 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
90 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
91 ; CHECK-NEXT: s_mov_b32 s12, s43
92 ; CHECK-NEXT: s_mov_b32 s13, s42
93 ; CHECK-NEXT: s_mov_b32 s14, s33
94 ; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
95 ; CHECK-NEXT: s_getpc_b64 s[16:17]
96 ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4
97 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12
98 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
99 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
100 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
101 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
102 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
103 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
104 ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
105 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
106 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
107 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
108 ; CHECK-NEXT: s_mov_b32 s12, s43
109 ; CHECK-NEXT: global_load_dword v0, v0, s[52:53]
110 ; CHECK-NEXT: s_mov_b32 s13, s42
111 ; CHECK-NEXT: s_mov_b32 s14, s33
112 ; CHECK-NEXT: s_getpc_b64 s[16:17]
113 ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4
114 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12
115 ; CHECK-NEXT: s_waitcnt vmcnt(0)
116 ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
117 ; CHECK-NEXT: v_mov_b32_e32 v1, 12
118 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
119 ; CHECK-NEXT: v_mov_b32_e32 v42, v0
120 ; CHECK-NEXT: s_mov_b32 s44, exec_lo
121 ; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
122 ; CHECK-NEXT: s_cbranch_execz .LBB0_25
123 ; CHECK-NEXT: ; %bb.1: ; %.preheader5
124 ; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14
125 ; CHECK-NEXT: s_mov_b32 s4, 0
126 ; CHECK-NEXT: s_mov_b32 s5, 0
127 ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0
128 ; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
129 ; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44
130 ; CHECK-NEXT: s_add_i32 s5, s5, 1
131 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42
132 ; CHECK-NEXT: ds_write_b8 v1, v45
133 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
134 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
135 ; CHECK-NEXT: s_cbranch_execnz .LBB0_2
136 ; CHECK-NEXT: ; %bb.3:
137 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
138 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42
139 ; CHECK-NEXT: s_mov_b32 s45, 0
140 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
141 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
142 ; CHECK-NEXT: s_cbranch_execz .LBB0_25
143 ; CHECK-NEXT: ; %bb.4:
144 ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
145 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
146 ; CHECK-NEXT: v_mov_b32_e32 v47, 0
147 ; CHECK-NEXT: s_mov_b32 s47, 0
148 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
149 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
150 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
151 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s47, v44
152 ; CHECK-NEXT: s_lshl_b32 s4, s47, 5
153 ; CHECK-NEXT: s_add_i32 s46, s47, 1
154 ; CHECK-NEXT: s_add_i32 s5, s47, 5
155 ; CHECK-NEXT: v_or3_b32 v57, s4, v43, s46
156 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
157 ; CHECK-NEXT: ds_read_u8 v56, v0
158 ; CHECK-NEXT: v_mov_b32_e32 v58, s46
159 ; CHECK-NEXT: s_mov_b32 s52, exec_lo
160 ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
161 ; CHECK-NEXT: s_cbranch_execz .LBB0_17
162 ; CHECK-NEXT: ; %bb.6: ; %.preheader2
163 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
164 ; CHECK-NEXT: s_mov_b32 s53, 0
165 ; CHECK-NEXT: s_mov_b32 s56, 0
166 ; CHECK-NEXT: s_branch .LBB0_8
167 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
168 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
169 ; CHECK-NEXT: s_add_i32 s56, s56, 4
170 ; CHECK-NEXT: s_add_i32 s4, s47, s56
171 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s56, v57
172 ; CHECK-NEXT: s_add_i32 s5, s4, 5
173 ; CHECK-NEXT: s_add_i32 s4, s4, 1
174 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
175 ; CHECK-NEXT: v_mov_b32_e32 v58, s4
176 ; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53
177 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
178 ; CHECK-NEXT: s_cbranch_execz .LBB0_16
179 ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
180 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
181 ; CHECK-NEXT: v_add_nc_u32_e32 v59, s56, v46
182 ; CHECK-NEXT: v_add_nc_u32_e32 v58, s56, v57
183 ; CHECK-NEXT: ds_read_u8 v0, v59
184 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
185 ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
186 ; CHECK-NEXT: s_and_saveexec_b32 s57, s4
187 ; CHECK-NEXT: s_cbranch_execz .LBB0_10
188 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
189 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
190 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
191 ; CHECK-NEXT: s_add_u32 s8, s34, 40
192 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
193 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
194 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
195 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
196 ; CHECK-NEXT: s_mov_b32 s12, s43
197 ; CHECK-NEXT: s_mov_b32 s13, s42
198 ; CHECK-NEXT: s_mov_b32 s14, s33
199 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
200 ; CHECK-NEXT: s_getpc_b64 s[16:17]
201 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4
202 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12
203 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
204 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
205 ; CHECK-NEXT: ds_write_b32 v0, v58
206 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
207 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
208 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
209 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
210 ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
211 ; CHECK-NEXT: s_and_saveexec_b32 s57, s4
212 ; CHECK-NEXT: s_cbranch_execz .LBB0_12
213 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
214 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
215 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
216 ; CHECK-NEXT: s_add_u32 s8, s34, 40
217 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
218 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
219 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
220 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
221 ; CHECK-NEXT: s_mov_b32 s12, s43
222 ; CHECK-NEXT: s_mov_b32 s13, s42
223 ; CHECK-NEXT: s_mov_b32 s14, s33
224 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58
225 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
226 ; CHECK-NEXT: s_getpc_b64 s[16:17]
227 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4
228 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12
229 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
230 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
231 ; CHECK-NEXT: ds_write_b32 v0, v60
232 ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
233 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
234 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
235 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
236 ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
237 ; CHECK-NEXT: s_and_saveexec_b32 s57, s4
238 ; CHECK-NEXT: s_cbranch_execz .LBB0_14
239 ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
240 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
241 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
242 ; CHECK-NEXT: s_add_u32 s8, s34, 40
243 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
244 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
245 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
246 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
247 ; CHECK-NEXT: s_mov_b32 s12, s43
248 ; CHECK-NEXT: s_mov_b32 s13, s42
249 ; CHECK-NEXT: s_mov_b32 s14, s33
250 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58
251 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
252 ; CHECK-NEXT: s_getpc_b64 s[16:17]
253 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4
254 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12
255 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
256 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
257 ; CHECK-NEXT: ds_write_b32 v0, v60
258 ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
259 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
260 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
261 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
262 ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
263 ; CHECK-NEXT: s_and_saveexec_b32 s57, s4
264 ; CHECK-NEXT: s_cbranch_execz .LBB0_7
265 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
266 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
267 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
268 ; CHECK-NEXT: s_add_u32 s8, s34, 40
269 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
270 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
271 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
272 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
273 ; CHECK-NEXT: s_mov_b32 s12, s43
274 ; CHECK-NEXT: s_mov_b32 s13, s42
275 ; CHECK-NEXT: s_mov_b32 s14, s33
276 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58
277 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
278 ; CHECK-NEXT: s_getpc_b64 s[16:17]
279 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4
280 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12
281 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
282 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
283 ; CHECK-NEXT: ds_write_b32 v0, v58
284 ; CHECK-NEXT: s_branch .LBB0_7
285 ; CHECK-NEXT: .LBB0_16: ; %Flow45
286 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
287 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
288 ; CHECK-NEXT: v_mov_b32_e32 v57, v0
289 ; CHECK-NEXT: .LBB0_17: ; %Flow46
290 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
291 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
292 ; CHECK-NEXT: s_mov_b32 s47, exec_lo
293 ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42
294 ; CHECK-NEXT: s_cbranch_execz .LBB0_23
295 ; CHECK-NEXT: ; %bb.18: ; %.preheader
296 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
297 ; CHECK-NEXT: s_mov_b32 s52, 0
298 ; CHECK-NEXT: s_inst_prefetch 0x1
299 ; CHECK-NEXT: s_branch .LBB0_20
300 ; CHECK-NEXT: .p2align 6
301 ; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
302 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
303 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58
304 ; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
305 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42
306 ; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52
307 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
308 ; CHECK-NEXT: s_cbranch_execz .LBB0_22
309 ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
310 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
311 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
312 ; CHECK-NEXT: ds_read_u8 v0, v0
313 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
314 ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
315 ; CHECK-NEXT: s_and_saveexec_b32 s53, s4
316 ; CHECK-NEXT: s_cbranch_execz .LBB0_19
317 ; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
318 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
319 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
320 ; CHECK-NEXT: s_add_u32 s8, s34, 40
321 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
322 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
323 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
324 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
325 ; CHECK-NEXT: s_mov_b32 s12, s43
326 ; CHECK-NEXT: s_mov_b32 s13, s42
327 ; CHECK-NEXT: s_mov_b32 s14, s33
328 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
329 ; CHECK-NEXT: s_getpc_b64 s[16:17]
330 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4
331 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12
332 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
333 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
334 ; CHECK-NEXT: ds_write_b32 v0, v57
335 ; CHECK-NEXT: s_branch .LBB0_19
336 ; CHECK-NEXT: .LBB0_22: ; %Flow43
337 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
338 ; CHECK-NEXT: s_inst_prefetch 0x2
339 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
340 ; CHECK-NEXT: .LBB0_23: ; %Flow44
341 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
342 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47
343 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
344 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s46, v45
345 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
346 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
347 ; CHECK-NEXT: s_mov_b32 s47, s46
348 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
349 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
350 ; CHECK-NEXT: s_or_b32 s45, s4, s45
351 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45
352 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5
353 ; CHECK-NEXT: .LBB0_25: ; %Flow51
354 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44
355 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
356 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
357 ; CHECK-NEXT: s_add_u32 s8, s34, 40
358 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
359 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
360 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
361 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
362 ; CHECK-NEXT: s_mov_b32 s12, s43
363 ; CHECK-NEXT: s_mov_b32 s13, s42
364 ; CHECK-NEXT: s_mov_b32 s14, s33
365 ; CHECK-NEXT: s_getpc_b64 s[16:17]
366 ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4
367 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12
368 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
369 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
370 ; CHECK-NEXT: s_mov_b32 s4, exec_lo
371 ; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360
372 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
373 ; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41
374 ; CHECK-NEXT: s_cbranch_execz .LBB0_33
375 ; CHECK-NEXT: ; %bb.26:
376 ; CHECK-NEXT: s_mov_b32 s44, 0
377 ; CHECK-NEXT: s_branch .LBB0_28
378 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
379 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
380 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
381 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
382 ; CHECK-NEXT: s_add_u32 s8, s34, 40
383 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
384 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
385 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
386 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
387 ; CHECK-NEXT: s_mov_b32 s12, s43
388 ; CHECK-NEXT: s_mov_b32 s13, s42
389 ; CHECK-NEXT: s_mov_b32 s14, s33
390 ; CHECK-NEXT: s_getpc_b64 s[16:17]
391 ; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4
392 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12
393 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
394 ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
395 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
396 ; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
397 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
398 ; CHECK-NEXT: s_cbranch_execz .LBB0_33
399 ; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1
400 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
401 ; CHECK-NEXT: s_mov_b32 s45, exec_lo
402 ; CHECK-NEXT: ds_read_b32 v0, v0
403 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
404 ; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
405 ; CHECK-NEXT: v_bfe_u32 v62, v0, 5, 5
406 ; CHECK-NEXT: v_and_b32_e32 v72, 31, v0
407 ; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
408 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
409 ; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
410 ; CHECK-NEXT: v_add_co_u32 v2, s4, s48, v1
411 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s49, 0, s4
412 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
413 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
414 ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
415 ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
416 ; CHECK-NEXT: s_clause 0x1
417 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
418 ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
419 ; CHECK-NEXT: s_waitcnt vmcnt(0)
420 ; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
421 ; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
422 ; CHECK-NEXT: v_xor_b32_e32 v57, v11, v7
423 ; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6
424 ; CHECK-NEXT: v_or_b32_e32 v5, v46, v57
425 ; CHECK-NEXT: v_or_b32_e32 v4, v45, v56
426 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
427 ; CHECK-NEXT: s_cbranch_execz .LBB0_27
428 ; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
429 ; CHECK-NEXT: s_clause 0x1
430 ; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
431 ; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
432 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
433 ; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
434 ; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
435 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
436 ; CHECK-NEXT: s_add_u32 s8, s34, 40
437 ; CHECK-NEXT: v_and_b32_e32 v3, 0xf000, v0
438 ; CHECK-NEXT: v_and_b32_e32 v4, 0xf00, v1
439 ; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0
440 ; CHECK-NEXT: v_and_b32_e32 v1, 15, v1
441 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
442 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
443 ; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4
444 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
445 ; CHECK-NEXT: s_mov_b32 s12, s43
446 ; CHECK-NEXT: s_mov_b32 s13, s42
447 ; CHECK-NEXT: s_mov_b32 s14, s33
448 ; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1
449 ; CHECK-NEXT: s_getpc_b64 s[16:17]
450 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_addPU3AS1Vjj@rel32@lo+4
451 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_addPU3AS1Vjj@rel32@hi+12
452 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73
453 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73
454 ; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0
455 ; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1
456 ; CHECK-NEXT: v_and_b32_e32 v74, 28, v1
457 ; CHECK-NEXT: v_add_co_u32 v42, s4, s54, v0
458 ; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s55, 0, s4
459 ; CHECK-NEXT: v_mov_b32_e32 v2, v44
460 ; CHECK-NEXT: v_mov_b32_e32 v0, v42
461 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
462 ; CHECK-NEXT: v_mov_b32_e32 v1, v43
463 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
464 ; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4
465 ; CHECK-NEXT: s_mov_b32 s4, exec_lo
466 ; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0
467 ; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
468 ; CHECK-NEXT: s_cbranch_execz .LBB0_31
469 ; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
470 ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58
471 ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57]
472 ; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51]
473 ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0
474 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4
475 ; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72
476 ; CHECK-NEXT: v_lshlrev_b32_e32 v9, 12, v63
477 ; CHECK-NEXT: v_xor_b32_e32 v5, v61, v59
478 ; CHECK-NEXT: v_lshlrev_b32_e32 v11, 16, v56
479 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v3
480 ; CHECK-NEXT: v_lshrrev_b64 v[0:1], 16, v[45:46]
481 ; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, v10
482 ; CHECK-NEXT: v_or3_b32 v8, v8, v9, v62
483 ; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
484 ; CHECK-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
485 ; CHECK-NEXT: v_or_b32_e32 v1, v11, v1
486 ; CHECK-NEXT: ; implicit-def: $vgpr42
487 ; CHECK-NEXT: ; implicit-def: $vgpr43
488 ; CHECK-NEXT: ; implicit-def: $vgpr44
489 ; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4
490 ; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8
491 ; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24
492 ; CHECK-NEXT: .LBB0_31: ; %Flow
493 ; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
494 ; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4
495 ; CHECK-NEXT: s_cbranch_execz .LBB0_27
496 ; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
497 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
498 ; CHECK-NEXT: v_mov_b32_e32 v0, v42
499 ; CHECK-NEXT: v_mov_b32_e32 v1, v43
500 ; CHECK-NEXT: v_mov_b32_e32 v2, v44
501 ; CHECK-NEXT: s_add_u32 s8, s34, 40
502 ; CHECK-NEXT: s_addc_u32 s9, s35, 0
503 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
504 ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
505 ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
506 ; CHECK-NEXT: s_mov_b32 s12, s43
507 ; CHECK-NEXT: s_mov_b32 s13, s42
508 ; CHECK-NEXT: s_mov_b32 s14, s33
509 ; CHECK-NEXT: s_getpc_b64 s[16:17]
510 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
511 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
512 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
513 ; CHECK-NEXT: s_branch .LBB0_27
514 ; CHECK-NEXT: .LBB0_33:
515 ; CHECK-NEXT: s_endpgm
516 %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
517 %7 = trunc i64 %6 to i32
518 %8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
519 %9 = trunc i64 %8 to i32
521 %11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10
522 store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
523 tail call void @_Z7barrierj(i32 noundef 1) #5
526 %14 = and i32 %13, 28
527 %15 = and i64 %12, 536870911
528 %16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15
529 %17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11
530 %18 = lshr i32 %17, %14
531 %19 = and i32 %18, 15
532 %20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4
533 %21 = icmp eq i32 %20, 0
534 br i1 %21, label %119, label %27
537 %23 = add i32 %20, -1
538 %24 = icmp eq i32 %23, 0
539 br i1 %24, label %119, label %25
545 27: ; preds = %5, %27
546 %28 = phi i32 [ %30, %27 ], [ 0, %5 ]
547 %29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28
548 store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15
549 %30 = add nuw i32 %28, 1
550 %31 = icmp eq i32 %30, %20
551 br i1 %31, label %22, label %27
553 32: ; preds = %114, %48
554 %33 = phi i32 [ %50, %48 ], [ %115, %114 ]
555 %34 = icmp ult i32 %44, %23
556 %35 = icmp ult i32 %33, 60
557 %36 = select i1 %34, i1 %35, i1 false
558 br i1 %36, label %37, label %119
560 37: ; preds = %32, %25
561 %38 = phi i32 [ 0, %25 ], [ %44, %32 ]
562 %39 = phi i32 [ 0, %25 ], [ %33, %32 ]
563 %40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38
564 %41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15
566 %43 = or i32 %42, %26
567 %44 = add nuw i32 %38, 1
568 %45 = or i32 %43, %44
570 %47 = icmp ult i32 %46, %20
571 br i1 %47, label %53, label %48
573 48: ; preds = %98, %37
574 %49 = phi i32 [ %45, %37 ], [ %100, %98 ]
575 %50 = phi i32 [ %39, %37 ], [ %99, %98 ]
576 %51 = phi i32 [ %44, %37 ], [ %54, %98 ]
577 %52 = icmp ult i32 %51, %20
578 br i1 %52, label %103, label %32
580 53: ; preds = %37, %98
581 %54 = phi i32 [ %101, %98 ], [ %46, %37 ]
582 %55 = phi i32 [ %54, %98 ], [ %44, %37 ]
583 %56 = phi i32 [ %99, %98 ], [ %39, %37 ]
584 %57 = phi i32 [ %100, %98 ], [ %45, %37 ]
585 %58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55
586 %59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15
587 %60 = icmp eq i8 %41, %59
588 br i1 %60, label %61, label %65
592 %63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
593 %64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63
594 store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11
597 65: ; preds = %61, %53
598 %66 = phi i32 [ %62, %61 ], [ %56, %53 ]
600 %68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67
601 %69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15
602 %70 = icmp eq i8 %41, %69
603 br i1 %70, label %71, label %76
608 %74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
609 %75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74
610 store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11
613 76: ; preds = %71, %65
614 %77 = phi i32 [ %73, %71 ], [ %66, %65 ]
616 %79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78
617 %80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15
618 %81 = icmp eq i8 %41, %80
619 br i1 %81, label %82, label %87
624 %85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
625 %86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85
626 store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11
629 87: ; preds = %82, %76
630 %88 = phi i32 [ %84, %82 ], [ %77, %76 ]
632 %90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89
633 %91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15
634 %92 = icmp eq i8 %41, %91
635 br i1 %92, label %93, label %98
640 %96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
641 %97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96
642 store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11
645 98: ; preds = %93, %87
646 %99 = phi i32 [ %95, %93 ], [ %88, %87 ]
647 %100 = add i32 %57, 4
648 %101 = add i32 %54, 4
649 %102 = icmp ult i32 %101, %20
650 br i1 %102, label %53, label %48
652 103: ; preds = %48, %114
653 %104 = phi i32 [ %117, %114 ], [ %51, %48 ]
654 %105 = phi i32 [ %115, %114 ], [ %50, %48 ]
655 %106 = phi i32 [ %116, %114 ], [ %49, %48 ]
656 %107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104
657 %108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15
658 %109 = icmp eq i8 %41, %108
659 br i1 %109, label %110, label %114
662 %111 = add i32 %105, 1
663 %112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
664 %113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112
665 store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11
668 114: ; preds = %110, %103
669 %115 = phi i32 [ %111, %110 ], [ %105, %103 ]
670 %116 = add i32 %106, 1
671 %117 = add nuw i32 %104, 1
672 %118 = icmp ult i32 %117, %20
673 br i1 %118, label %103, label %32
675 119: ; preds = %32, %22, %5
676 tail call void @_Z7barrierj(i32 noundef 1) #5
677 %120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
678 %121 = icmp ugt i32 %120, %9
679 br i1 %121, label %122, label %206
682 %123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8
685 124: ; preds = %201, %122
686 %125 = phi i32 [ %9, %122 ], [ %204, %201 ]
687 %126 = phi i64 [ %8, %122 ], [ %203, %201 ]
688 %127 = and i64 %126, 4294967295
689 %128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125
690 %129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11
691 %130 = lshr i32 %129, 10
692 %131 = lshr i32 %129, 5
693 %132 = and i32 %131, 31
694 %133 = and i32 %129, 31
695 %134 = mul nuw nsw i32 %130, 384
696 %135 = zext i32 %134 to i64
697 %136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135
698 %137 = shl nuw nsw i32 %132, 5
699 %138 = zext i32 %137 to i64
700 %139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138
701 %140 = shl nuw nsw i32 %133, 5
702 %141 = zext i32 %140 to i64
703 %142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141
704 %143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1
705 %144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16
706 %145 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 1
707 %146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16
708 %147 = xor i64 %146, %144
709 %148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16
710 %149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16
711 %150 = xor i64 %149, %148
712 %151 = icmp ne i64 %147, 0
713 %152 = icmp ne i64 %150, 0
714 %153 = select i1 %151, i1 true, i1 %152
715 br i1 %153, label %154, label %201
718 %155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2
719 %156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16
720 %157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2
721 %158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16
722 %159 = and i64 %147, 983040
723 %160 = shl i64 %147, 4
724 %161 = and i64 %160, 61440
725 %162 = or i64 %161, %159
726 %163 = lshr i64 %147, 12
727 %164 = and i64 %163, 3840
728 %165 = or i64 %162, %164
729 %166 = and i64 %160, 240
730 %167 = or i64 %165, %166
731 %168 = and i64 %163, 15
732 %169 = or i64 %167, %168
733 %170 = trunc i64 %169 to i32
734 %171 = lshr i64 %169, 3
735 %172 = shl nuw nsw i32 %170, 2
736 %173 = and i32 %172, 28
737 %174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171
738 %175 = shl nuw nsw i32 1, %173
739 %176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
740 %177 = lshr i32 %176, %173
741 %178 = and i32 %177, 15
742 %179 = icmp ugt i32 %178, 11
743 br i1 %179, label %180, label %182
746 %181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
750 %183 = xor i64 %158, %156
751 %184 = lshr i64 %183, 16
752 %185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48)
753 %186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48)
754 %187 = shl nuw nsw i32 %133, 6
755 %188 = shl i32 %130, 12
756 %189 = or i32 %187, %188
757 %190 = or i32 %189, %132
758 %191 = mul nuw nsw i64 %169, 384
759 %192 = and i64 %191, 4294967168
760 %193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
761 %194 = shl nuw nsw i32 %178, 5
762 %195 = or disjoint i32 %194, 8
763 %196 = zext i32 %195 to i64
764 %197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
765 %198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
766 store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11
767 store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16
768 %199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8
769 store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16
770 %200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16
771 store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16
774 201: ; preds = %182, %180, %124
775 %202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4
776 %203 = add i64 %202, %127
777 %204 = trunc i64 %203 to i32
778 %205 = icmp ugt i32 %120, %204
779 br i1 %205, label %124, label %206
781 206: ; preds = %201, %119
785 ; kernel_round1_short below is a reduced copy of kernel_round1; most of the if-else blocks have been removed.
787 define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
788 ; CHECK-LABEL: kernel_round1_short:
789 ; CHECK: ; %bb.0: ; %.5
790 ; CHECK-NEXT: s_add_u32 s12, s12, s17
791 ; CHECK-NEXT: s_mov_b32 s32, 0
792 ; CHECK-NEXT: s_addc_u32 s13, s13, 0
793 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
794 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
795 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[8:9], 0x10
796 ; CHECK-NEXT: s_add_u32 s0, s0, s17
797 ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9]
798 ; CHECK-NEXT: s_addc_u32 s1, s1, 0
799 ; CHECK-NEXT: v_mov_b32_e32 v40, v0
800 ; CHECK-NEXT: s_add_u32 s44, s38, 40
801 ; CHECK-NEXT: v_mov_b32_e32 v31, v0
802 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
803 ; CHECK-NEXT: s_mov_b32 s33, s16
804 ; CHECK-NEXT: s_addc_u32 s45, s39, 0
805 ; CHECK-NEXT: s_mov_b32 s43, s14
806 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
807 ; CHECK-NEXT: s_mov_b32 s12, s14
808 ; CHECK-NEXT: s_mov_b32 s13, s15
809 ; CHECK-NEXT: s_mov_b32 s14, s33
810 ; CHECK-NEXT: s_mov_b32 s42, s15
811 ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
812 ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
813 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
814 ; CHECK-NEXT: s_getpc_b64 s[16:17]
815 ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4
816 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12
817 ; CHECK-NEXT: v_mov_b32_e32 v43, 0
818 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
819 ; CHECK-NEXT: v_mov_b32_e32 v42, v0
820 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
821 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
822 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
823 ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
824 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
825 ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
826 ; CHECK-NEXT: s_mov_b32 s12, s43
827 ; CHECK-NEXT: s_mov_b32 s13, s42
828 ; CHECK-NEXT: s_mov_b32 s14, s33
829 ; CHECK-NEXT: s_getpc_b64 s[16:17]
830 ; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4
831 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12
832 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
833 ; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
834 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
835 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
836 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
837 ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
838 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
839 ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
840 ; CHECK-NEXT: s_mov_b32 s12, s43
841 ; CHECK-NEXT: s_mov_b32 s13, s42
842 ; CHECK-NEXT: s_mov_b32 s14, s33
843 ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
844 ; CHECK-NEXT: s_getpc_b64 s[16:17]
845 ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4
846 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12
847 ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
848 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
849 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
850 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
851 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
852 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
853 ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
854 ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
855 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
856 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
857 ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
858 ; CHECK-NEXT: s_mov_b32 s12, s43
859 ; CHECK-NEXT: global_load_dword v0, v0, s[46:47]
860 ; CHECK-NEXT: s_mov_b32 s13, s42
861 ; CHECK-NEXT: s_mov_b32 s14, s33
862 ; CHECK-NEXT: s_getpc_b64 s[16:17]
863 ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4
864 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12
865 ; CHECK-NEXT: s_waitcnt vmcnt(0)
866 ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
867 ; CHECK-NEXT: v_mov_b32_e32 v1, 12
868 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
869 ; CHECK-NEXT: v_mov_b32_e32 v41, v0
870 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
871 ; CHECK-NEXT: s_mov_b32 s44, 0
872 ; CHECK-NEXT: s_mov_b32 s4, 0
873 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
874 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
875 ; CHECK-NEXT: .LBB1_1: ; %.37
876 ; CHECK-NEXT: ; =>This Loop Header: Depth=1
877 ; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
878 ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
879 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
880 ; CHECK-NEXT: s_lshl_b32 s5, s4, 5
881 ; CHECK-NEXT: s_add_i32 s45, s4, 1
882 ; CHECK-NEXT: s_add_i32 s6, s4, 5
883 ; CHECK-NEXT: v_or3_b32 v47, s5, v42, s45
884 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
885 ; CHECK-NEXT: ds_read_u8 v46, v0
886 ; CHECK-NEXT: v_mov_b32_e32 v56, s45
887 ; CHECK-NEXT: s_mov_b32 s5, exec_lo
888 ; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
889 ; CHECK-NEXT: s_cbranch_execz .LBB1_5
890 ; CHECK-NEXT: ; %bb.2: ; %.53.preheader
891 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
892 ; CHECK-NEXT: s_mov_b32 s6, 0
893 ; CHECK-NEXT: s_mov_b32 s7, 0
894 ; CHECK-NEXT: .LBB1_3: ; %.53
895 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
896 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
897 ; CHECK-NEXT: s_add_i32 s7, s7, 4
898 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
899 ; CHECK-NEXT: s_add_i32 s8, s4, s7
900 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
901 ; CHECK-NEXT: s_add_i32 s9, s8, 5
902 ; CHECK-NEXT: s_add_i32 s8, s8, 1
903 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
904 ; CHECK-NEXT: v_mov_b32_e32 v56, s8
905 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
906 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
907 ; CHECK-NEXT: s_cbranch_execnz .LBB1_3
908 ; CHECK-NEXT: ; %bb.4: ; %Flow3
909 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
910 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
911 ; CHECK-NEXT: v_mov_b32_e32 v47, v0
912 ; CHECK-NEXT: .LBB1_5: ; %Flow4
913 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
914 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
915 ; CHECK-NEXT: s_mov_b32 s46, exec_lo
916 ; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
917 ; CHECK-NEXT: s_cbranch_execz .LBB1_11
918 ; CHECK-NEXT: ; %bb.6: ; %.103.preheader
919 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
920 ; CHECK-NEXT: s_mov_b32 s47, 0
921 ; CHECK-NEXT: s_inst_prefetch 0x1
922 ; CHECK-NEXT: s_branch .LBB1_8
923 ; CHECK-NEXT: .p2align 6
924 ; CHECK-NEXT: .LBB1_7: ; %.114
925 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
926 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
927 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
928 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
929 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
930 ; CHECK-NEXT: s_or_b32 s47, vcc_lo, s47
931 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s47
932 ; CHECK-NEXT: s_cbranch_execz .LBB1_10
933 ; CHECK-NEXT: .LBB1_8: ; %.103
934 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
935 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
936 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
937 ; CHECK-NEXT: ds_read_u8 v0, v0
938 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
939 ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
940 ; CHECK-NEXT: s_and_saveexec_b32 s48, s4
941 ; CHECK-NEXT: s_cbranch_execz .LBB1_7
942 ; CHECK-NEXT: ; %bb.9: ; %.110
943 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
944 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
945 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
946 ; CHECK-NEXT: s_add_u32 s8, s38, 40
947 ; CHECK-NEXT: s_addc_u32 s9, s39, 0
948 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
949 ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
950 ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
951 ; CHECK-NEXT: s_mov_b32 s12, s43
952 ; CHECK-NEXT: s_mov_b32 s13, s42
953 ; CHECK-NEXT: s_mov_b32 s14, s33
954 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
955 ; CHECK-NEXT: s_getpc_b64 s[16:17]
956 ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4
957 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12
958 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
959 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
960 ; CHECK-NEXT: ds_write_b32 v0, v47
961 ; CHECK-NEXT: s_branch .LBB1_7
962 ; CHECK-NEXT: .LBB1_10: ; %Flow
963 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
964 ; CHECK-NEXT: s_inst_prefetch 0x2
965 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47
966 ; CHECK-NEXT: .LBB1_11: ; %Flow2
967 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
968 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46
969 ; CHECK-NEXT: ; %bb.12: ; %.32
970 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
971 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s45, v45
972 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
973 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
974 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
975 ; CHECK-NEXT: s_or_b32 s44, s4, s44
976 ; CHECK-NEXT: s_mov_b32 s4, s45
977 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
978 ; CHECK-NEXT: s_cbranch_execnz .LBB1_1
979 ; CHECK-NEXT: ; %bb.13: ; %.119
980 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44
981 ; CHECK-NEXT: v_mov_b32_e32 v31, v40
982 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
983 ; CHECK-NEXT: s_add_u32 s8, s38, 40
984 ; CHECK-NEXT: s_addc_u32 s9, s39, 0
985 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
986 ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
987 ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
988 ; CHECK-NEXT: s_mov_b32 s12, s43
989 ; CHECK-NEXT: s_mov_b32 s13, s42
990 ; CHECK-NEXT: s_mov_b32 s14, s33
991 ; CHECK-NEXT: s_getpc_b64 s[16:17]
992 ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4
993 ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12
994 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
995 ; CHECK-NEXT: s_endpgm
997 %.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
998 %.7 = trunc i64 %.6 to i32
999 %.8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
1000 %.9 = trunc i64 %.8 to i32
1001 %.10 = mul i32 %.9, 14
1002 %.11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %.10
1003 store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
1004 tail call void @_Z7barrierj(i32 noundef 1) #5
1005 %.12 = lshr i64 %.6, 3
1006 %.13 = shl i32 %.7, 2
1007 %.14 = and i32 %.13, 28
1008 %.15 = and i64 %.12, 536870911
1009 %.16 = getelementptr inbounds i32, ptr addrspace(1) %.2, i64 %.15
1010 %.17 = load i32, ptr addrspace(1) %.16, align 4, !tbaa !11
1011 %.18 = lshr i32 %.17, %.14
1012 %.19 = and i32 %.18, 15
1013 %.20 = tail call i32 @_Z3minjj(i32 noundef %.19, i32 noundef 12) #4
1014 %.21 = icmp eq i32 %.20, 0
1015 %.23 = add i32 %.20, -1
1016 %.24 = icmp eq i32 %.23, 0
1017 store i8 0, ptr addrspace(3) %.11, align 1, !tbaa !15
1020 .32: ; preds = %.114, %.48
1021 %.33 = phi i32 [ %.50, %.48 ], [ %.115, %.114 ]
1022 %.34 = icmp ult i32 %.44, %.23
1023 %.35 = icmp ult i32 %.33, 60
1024 %.36 = select i1 %.34, i1 %.35, i1 false
1025 br i1 %.36, label %.37, label %.119
1027 .37: ; preds = %.32, %.5
1028 %.38 = phi i32 [ 0, %.5 ], [ %.44, %.32 ]
1029 %.39 = phi i32 [ 0, %.5 ], [ %.33, %.32 ]
1030 %.26 = shl i32 %.7, 10
1031 %.40 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.38
1032 %.41 = load i8, ptr addrspace(3) %.40, align 1, !tbaa !15
1033 %.42 = shl i32 %.38, 5
1034 %.43 = or i32 %.42, %.26
1035 %.44 = add nuw i32 %.38, 1
1036 %.45 = or i32 %.43, %.44
1037 %.46 = add i32 %.38, 5
1038 %.47 = icmp ult i32 %.46, %.20
1039 br i1 %.47, label %.53, label %.48
1041 .48: ; preds = %.98, %.37
1042 %.49 = phi i32 [ %.45, %.37 ], [ %.100, %.98 ]
1043 %.50 = phi i32 [ %.39, %.37 ], [ %.99, %.98 ]
1044 %.51 = phi i32 [ %.44, %.37 ], [ %.54, %.98 ]
1045 %.52 = icmp ult i32 %.51, %.20
1046 br i1 %.52, label %.103, label %.32
1048 .53: ; preds = %.37, %.98
1049 %.54 = phi i32 [ %.101, %.98 ], [ %.46, %.37 ]
1050 %.55 = phi i32 [ %.54, %.98 ], [ %.44, %.37 ]
1051 %.56 = phi i32 [ %.99, %.98 ], [ %.39, %.37 ]
1052 %.57 = phi i32 [ %.100, %.98 ], [ %.45, %.37 ]
1053 %.58 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.55
1054 %.59 = load i8, ptr addrspace(3) %.58, align 1, !tbaa !15
1055 %.60 = icmp eq i8 %.41, %.59
1058 .98: ; preds = %.93, %.87
1059 %.99 = add i32 %.56, 1
1060 %.100 = add i32 %.57, 4
1061 %.101 = add i32 %.54, 4
1062 %.102 = icmp ult i32 %.101, %.20
1063 br i1 %.102, label %.53, label %.48
1065 .103: ; preds = %.48, %.114
1066 %.104 = phi i32 [ %.117, %.114 ], [ %.51, %.48 ]
1067 %.105 = phi i32 [ %.115, %.114 ], [ %.50, %.48 ]
1068 %.106 = phi i32 [ %.116, %.114 ], [ %.49, %.48 ]
1069 %.107 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.104
1070 %.108 = load i8, ptr addrspace(3) %.107, align 1, !tbaa !15
1071 %.109 = icmp eq i8 %.41, %.108
1072 br i1 %.109, label %.110, label %.114
1074 .110: ; preds = %.103
1075 %.111 = add i32 %.105, 1
1076 %.112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
1077 %.113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %.112
1078 store i32 %.106, ptr addrspace(3) %.113, align 4, !tbaa !11
1081 .114: ; preds = %.110, %.103
1082 %.115 = phi i32 [ %.111, %.110 ], [ %.105, %.103 ]
1083 %.116 = add i32 %.106, 1
1084 %.117 = add nuw i32 %.104, 1
1085 %.118 = icmp ult i32 %.117, %.20
1086 br i1 %.118, label %.103, label %.32
1088 .119: ; preds = %.32, %.22, %.5
1089 tail call void @_Z7barrierj(i32 noundef 1) #5
1090 %.120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
1091 %.121 = icmp ugt i32 %.120, %.9
1094 .206: ; preds = %.201, %.119
1098 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
1099 declare i64 @llvm.fshl.i64(i64, i64, i64) #3
1101 attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
1102 attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
1103 attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" }
1104 attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1105 attributes #4 = { convergent nounwind willreturn memory(none) }
1106 attributes #5 = { convergent nounwind }
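; Module flags, OpenCL version info, kernel argument metadata, and the TBAA nodes referenced by the loads and stores above.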
1108 !llvm.module.flags = !{!0, !1, !2}
1109 !opencl.ocl.version = !{!3}
1112 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
1113 !1 = !{i32 1, !"wchar_size", i32 4}
1114 !2 = !{i32 8, !"PIC Level", i32 2}
1115 !3 = !{i32 1, i32 2}
1116 !4 = !{!"clang version 17.0.0 (ssh://chfang@git.amd.com:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"}
1117 !5 = !{i32 1, i32 1, i32 1, i32 1, i32 1}
1118 !6 = !{!"none", !"none", !"none", !"none", !"none"}
1119 !7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"}
1120 !8 = !{!"", !"", !"", !"", !""}
1121 !9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"}
1122 !10 = !{i32 64, i32 1, i32 1}
1123 !11 = !{!12, !12, i64 0}
1124 !12 = !{!"int", !13, i64 0}
1125 !13 = !{!"omnipotent char", !14, i64 0}
1126 !14 = !{!"Simple C/C++ TBAA"}
1127 !15 = !{!13, !13, i64 0}
1128 !16 = !{!17, !17, i64 0}
1129 !17 = !{!"long", !13, i64 0}