1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN-ISEL %s
4 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CISI %s
5 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1010 %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s
9 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s
10 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
12 ; GCN-ISEL-LABEL: name: sadd64rr
13 ; GCN-ISEL-LABEL: body:
14 ; GCN-ISEL-LABEL: bb.0.entry:
15 ; GCN-ISEL: S_ADD_U64_PSEUDO
; sadd64rr: adds the two i64 kernel arguments %a and %b and stores the sum
; to %out. The instruction-selection check above expects the scalar 64-bit
; add pseudo; the per-target checks below show it emitted as an
; s_add_u32 / s_addc_u32 pair.
17 define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
18 ; CISI-LABEL: sadd64rr:
19 ; CISI: ; %bb.0: ; %entry
20 ; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
21 ; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
22 ; CISI-NEXT: s_mov_b32 s3, 0xf000
23 ; CISI-NEXT: s_mov_b32 s2, -1
24 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
25 ; CISI-NEXT: s_mov_b32 s0, s4
26 ; CISI-NEXT: s_add_u32 s4, s6, s8
27 ; CISI-NEXT: s_mov_b32 s1, s5
28 ; CISI-NEXT: s_addc_u32 s5, s7, s9
29 ; CISI-NEXT: v_mov_b32_e32 v0, s4
30 ; CISI-NEXT: v_mov_b32_e32 v1, s5
31 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
35 ; VI: ; %bb.0: ; %entry
36 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
37 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
38 ; VI-NEXT: s_waitcnt lgkmcnt(0)
39 ; VI-NEXT: v_mov_b32_e32 v0, s4
40 ; VI-NEXT: s_add_u32 s0, s6, s0
41 ; VI-NEXT: s_addc_u32 s1, s7, s1
42 ; VI-NEXT: v_mov_b32_e32 v3, s1
43 ; VI-NEXT: v_mov_b32_e32 v1, s5
44 ; VI-NEXT: v_mov_b32_e32 v2, s0
45 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
48 ; GFX9-LABEL: sadd64rr:
49 ; GFX9: ; %bb.0: ; %entry
50 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
51 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
52 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
53 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX9-NEXT: s_add_u32 s0, s6, s2
55 ; GFX9-NEXT: s_addc_u32 s1, s7, s3
56 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
57 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
58 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
61 ; GFX1010-LABEL: sadd64rr:
62 ; GFX1010: ; %bb.0: ; %entry
63 ; GFX1010-NEXT: s_clause 0x1
64 ; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
65 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
66 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
67 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX1010-NEXT: s_add_u32 s0, s6, s2
69 ; GFX1010-NEXT: s_addc_u32 s1, s7, s3
70 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0
71 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1
72 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
73 ; GFX1010-NEXT: s_endpgm
75 ; GFX1030W32-LABEL: sadd64rr:
76 ; GFX1030W32: ; %bb.0: ; %entry
77 ; GFX1030W32-NEXT: s_clause 0x1
78 ; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
79 ; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
80 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
81 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX1030W32-NEXT: s_add_u32 s0, s6, s0
83 ; GFX1030W32-NEXT: s_addc_u32 s1, s7, s1
84 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
85 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
86 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
87 ; GFX1030W32-NEXT: s_endpgm
89 ; GFX1030W64-LABEL: sadd64rr:
90 ; GFX1030W64: ; %bb.0: ; %entry
91 ; GFX1030W64-NEXT: s_clause 0x1
92 ; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
93 ; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
94 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
95 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX1030W64-NEXT: s_add_u32 s0, s6, s0
97 ; GFX1030W64-NEXT: s_addc_u32 s1, s7, s1
98 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
99 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
100 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
101 ; GFX1030W64-NEXT: s_endpgm
103 ; GFX11-LABEL: sadd64rr:
104 ; GFX11: ; %bb.0: ; %entry
105 ; GFX11-NEXT: s_clause 0x1
106 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
107 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
108 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX11-NEXT: s_add_u32 s0, s6, s0
110 ; GFX11-NEXT: s_addc_u32 s1, s7, s1
111 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
112 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
113 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
114 ; GFX11-NEXT: s_nop 0
115 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
116 ; GFX11-NEXT: s_endpgm
118 %add = add i64 %a, %b
119 store i64 %add, ptr addrspace(1) %out
123 ; GCN-ISEL-LABEL: name: sadd64ri
124 ; GCN-ISEL-LABEL: body:
125 ; GCN-ISEL-LABEL: bb.0.entry:
126 ; GCN-ISEL: S_ADD_U64_PSEUDO
; sadd64ri: adds the 64-bit literal 20015998343286 (0x0000123456789876) to
; the i64 kernel argument %a and stores the sum to %out. In the per-target
; checks below the literal appears split into its low half 0x56789876
; (s_add_u32) and high half 0x1234 (s_addc_u32).
128 define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
129 ; CISI-LABEL: sadd64ri:
130 ; CISI: ; %bb.0: ; %entry
131 ; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
132 ; CISI-NEXT: s_mov_b32 s7, 0xf000
133 ; CISI-NEXT: s_mov_b32 s6, -1
134 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
135 ; CISI-NEXT: s_mov_b32 s4, s0
136 ; CISI-NEXT: s_add_u32 s0, s2, 0x56789876
137 ; CISI-NEXT: s_mov_b32 s5, s1
138 ; CISI-NEXT: s_addc_u32 s1, s3, 0x1234
139 ; CISI-NEXT: v_mov_b32_e32 v0, s0
140 ; CISI-NEXT: v_mov_b32_e32 v1, s1
141 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
142 ; CISI-NEXT: s_endpgm
144 ; VI-LABEL: sadd64ri:
145 ; VI: ; %bb.0: ; %entry
146 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
147 ; VI-NEXT: s_waitcnt lgkmcnt(0)
148 ; VI-NEXT: v_mov_b32_e32 v0, s0
149 ; VI-NEXT: s_add_u32 s0, s2, 0x56789876
150 ; VI-NEXT: v_mov_b32_e32 v1, s1
151 ; VI-NEXT: s_addc_u32 s1, s3, 0x1234
152 ; VI-NEXT: v_mov_b32_e32 v3, s1
153 ; VI-NEXT: v_mov_b32_e32 v2, s0
154 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
157 ; GFX9-LABEL: sadd64ri:
158 ; GFX9: ; %bb.0: ; %entry
159 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
160 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
161 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876
163 ; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234
164 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
165 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
166 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
167 ; GFX9-NEXT: s_endpgm
169 ; GFX1010-LABEL: sadd64ri:
170 ; GFX1010: ; %bb.0: ; %entry
171 ; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
172 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
173 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876
175 ; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234
176 ; GFX1010-NEXT: v_mov_b32_e32 v0, s2
177 ; GFX1010-NEXT: v_mov_b32_e32 v1, s3
178 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
179 ; GFX1010-NEXT: s_endpgm
181 ; GFX1030W32-LABEL: sadd64ri:
182 ; GFX1030W32: ; %bb.0: ; %entry
183 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
184 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
185 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
186 ; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876
187 ; GFX1030W32-NEXT: s_addc_u32 s3, s3, 0x1234
188 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
189 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
190 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
191 ; GFX1030W32-NEXT: s_endpgm
193 ; GFX1030W64-LABEL: sadd64ri:
194 ; GFX1030W64: ; %bb.0: ; %entry
195 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
196 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
197 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
198 ; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876
199 ; GFX1030W64-NEXT: s_addc_u32 s3, s3, 0x1234
200 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
201 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
202 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
203 ; GFX1030W64-NEXT: s_endpgm
205 ; GFX11-LABEL: sadd64ri:
206 ; GFX11: ; %bb.0: ; %entry
207 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
208 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876
210 ; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234
211 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
212 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
213 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
214 ; GFX11-NEXT: s_nop 0
215 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
216 ; GFX11-NEXT: s_endpgm
218 %add = add i64 20015998343286, %a
219 store i64 %add, ptr addrspace(1) %out
223 ; GCN-ISEL-LABEL: name: vadd64rr
224 ; GCN-ISEL-LABEL: body:
225 ; GCN-ISEL-LABEL: bb.0.entry:
226 ; GCN-ISEL: V_ADD_U64_PSEUDO
; vadd64rr: adds the i64 kernel argument %a to the workitem id (x),
; sign-extended to i64, and stores the sum to %out. The instruction-selection
; check above expects the vector 64-bit add pseudo; the per-target checks
; below show a VALU add / add-with-carry pair.
228 define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
229 ; CISI-LABEL: vadd64rr:
230 ; CISI: ; %bb.0: ; %entry
231 ; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
232 ; CISI-NEXT: s_mov_b32 s7, 0xf000
233 ; CISI-NEXT: s_mov_b32 s6, -1
234 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
235 ; CISI-NEXT: v_mov_b32_e32 v1, s3
236 ; CISI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
237 ; CISI-NEXT: s_mov_b32 s4, s0
238 ; CISI-NEXT: s_mov_b32 s5, s1
239 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
240 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
241 ; CISI-NEXT: s_endpgm
243 ; VI-LABEL: vadd64rr:
244 ; VI: ; %bb.0: ; %entry
245 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
246 ; VI-NEXT: s_waitcnt lgkmcnt(0)
247 ; VI-NEXT: v_mov_b32_e32 v4, s3
248 ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
249 ; VI-NEXT: v_mov_b32_e32 v1, s0
250 ; VI-NEXT: v_mov_b32_e32 v2, s1
251 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
252 ; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
255 ; GFX9-LABEL: vadd64rr:
256 ; GFX9: ; %bb.0: ; %entry
257 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
258 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
259 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
260 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
261 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
262 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
263 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
264 ; GFX9-NEXT: s_endpgm
266 ; GFX1010-LABEL: vadd64rr:
267 ; GFX1010: ; %bb.0: ; %entry
268 ; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
269 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
270 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0
272 ; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
273 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
274 ; GFX1010-NEXT: s_endpgm
276 ; GFX1030W32-LABEL: vadd64rr:
277 ; GFX1030W32: ; %bb.0: ; %entry
278 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
279 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
280 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0
282 ; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
283 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
284 ; GFX1030W32-NEXT: s_endpgm
286 ; GFX1030W64-LABEL: vadd64rr:
287 ; GFX1030W64: ; %bb.0: ; %entry
288 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
289 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
290 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
291 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0
292 ; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
293 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
294 ; GFX1030W64-NEXT: s_endpgm
296 ; GFX11-LABEL: vadd64rr:
297 ; GFX11: ; %bb.0: ; %entry
298 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
299 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
300 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
301 ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
303 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
304 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
305 ; GFX11-NEXT: s_nop 0
306 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
307 ; GFX11-NEXT: s_endpgm
309 %tid = call i32 @llvm.amdgcn.workitem.id.x()
310 %tid.ext = sext i32 %tid to i64
311 %add = add i64 %a, %tid.ext
312 store i64 %add, ptr addrspace(1) %out
316 ; GCN-ISEL-LABEL: name: vadd64ri
317 ; GCN-ISEL-LABEL: body:
318 ; GCN-ISEL-LABEL: bb.0.entry:
319 ; GCN-ISEL: V_ADD_U64_PSEUDO
; vadd64ri: adds the 64-bit literal 20015998343286 (0x0000123456789876) to
; the workitem id (x), sign-extended to i64, and stores the sum to %out.
; In the per-target checks below the literal appears as its low half
; 0x56789876 and high half 0x1234 on the VALU add / add-with-carry pair.
321 define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
322 ; CISI-LABEL: vadd64ri:
323 ; CISI: ; %bb.0: ; %entry
324 ; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
325 ; CISI-NEXT: v_add_i32_e32 v0, vcc, 0x56789876, v0
326 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234
327 ; CISI-NEXT: s_mov_b32 s3, 0xf000
328 ; CISI-NEXT: s_mov_b32 s2, -1
329 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
330 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
331 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
332 ; CISI-NEXT: s_endpgm
334 ; VI-LABEL: vadd64ri:
335 ; VI: ; %bb.0: ; %entry
336 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
337 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0
338 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234
339 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
340 ; VI-NEXT: s_waitcnt lgkmcnt(0)
341 ; VI-NEXT: v_mov_b32_e32 v3, s1
342 ; VI-NEXT: v_mov_b32_e32 v2, s0
343 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
346 ; GFX9-LABEL: vadd64ri:
347 ; GFX9: ; %bb.0: ; %entry
348 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
349 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0
350 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
351 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
352 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
353 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
354 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
355 ; GFX9-NEXT: s_endpgm
357 ; GFX1010-LABEL: vadd64ri:
358 ; GFX1010: ; %bb.0: ; %entry
359 ; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
360 ; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
361 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
362 ; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
363 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
364 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
365 ; GFX1010-NEXT: s_endpgm
367 ; GFX1030W32-LABEL: vadd64ri:
368 ; GFX1030W32: ; %bb.0: ; %entry
369 ; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
370 ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
371 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
372 ; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
373 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
375 ; GFX1030W32-NEXT: s_endpgm
377 ; GFX1030W64-LABEL: vadd64ri:
378 ; GFX1030W64: ; %bb.0: ; %entry
379 ; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
380 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0
381 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
382 ; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3]
383 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
384 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
385 ; GFX1030W64-NEXT: s_endpgm
387 ; GFX11-LABEL: vadd64ri:
388 ; GFX11: ; %bb.0: ; %entry
389 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
390 ; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
391 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
392 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
393 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
394 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
395 ; GFX11-NEXT: s_nop 0
396 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
397 ; GFX11-NEXT: s_endpgm
399 %tid = call i32 @llvm.amdgcn.workitem.id.x()
400 %tid.ext = sext i32 %tid to i64
401 %add = add i64 20015998343286, %tid.ext
402 store i64 %add, ptr addrspace(1) %out
406 ; GCN-ISEL-LABEL: name: suaddo32
407 ; GCN-ISEL-LABEL: body:
408 ; GCN-ISEL-LABEL: bb.0
409 ; GCN-ISEL: S_ADD_I32
; suaddo32: calls llvm.uadd.with.overflow.i32 on the kernel arguments %a and
; %b and stores only the sum (%val) to %out. The instruction-selection check
; above expects a plain S_ADD_I32, and the per-target checks below contain a
; single s_add_i32 with no carry handling.
; NOTE(review): %carry is extracted but has no user in the lines visible
; here, and %carryout is never written - presumably intentional, to cover the
; unused-overflow case; confirm against the full file.
410 define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
411 ; CISI-LABEL: suaddo32:
413 ; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
414 ; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
415 ; CISI-NEXT: s_mov_b32 s3, 0xf000
416 ; CISI-NEXT: s_mov_b32 s2, -1
417 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
418 ; CISI-NEXT: s_add_i32 s4, s4, s5
419 ; CISI-NEXT: v_mov_b32_e32 v0, s4
420 ; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0
421 ; CISI-NEXT: s_endpgm
423 ; VI-LABEL: suaddo32:
425 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
426 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
427 ; VI-NEXT: s_waitcnt lgkmcnt(0)
428 ; VI-NEXT: s_add_i32 s2, s2, s3
429 ; VI-NEXT: v_mov_b32_e32 v0, s0
430 ; VI-NEXT: v_mov_b32_e32 v1, s1
431 ; VI-NEXT: v_mov_b32_e32 v2, s2
432 ; VI-NEXT: flat_store_dword v[0:1], v2
435 ; GFX9-LABEL: suaddo32:
437 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
438 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
439 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
440 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
441 ; GFX9-NEXT: s_add_i32 s0, s2, s3
442 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
443 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
444 ; GFX9-NEXT: s_endpgm
446 ; GFX1010-LABEL: suaddo32:
448 ; GFX1010-NEXT: s_clause 0x1
449 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
450 ; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
451 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
452 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
453 ; GFX1010-NEXT: s_add_i32 s0, s2, s3
454 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0
455 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5]
456 ; GFX1010-NEXT: s_endpgm
458 ; GFX1030W32-LABEL: suaddo32:
459 ; GFX1030W32: ; %bb.0:
460 ; GFX1030W32-NEXT: s_clause 0x1
461 ; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
462 ; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
463 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
464 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
465 ; GFX1030W32-NEXT: s_add_i32 s2, s2, s3
466 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
467 ; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
468 ; GFX1030W32-NEXT: s_endpgm
470 ; GFX1030W64-LABEL: suaddo32:
471 ; GFX1030W64: ; %bb.0:
472 ; GFX1030W64-NEXT: s_clause 0x1
473 ; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
474 ; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
475 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
476 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
477 ; GFX1030W64-NEXT: s_add_i32 s2, s2, s3
478 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
479 ; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
480 ; GFX1030W64-NEXT: s_endpgm
482 ; GFX11-LABEL: suaddo32:
484 ; GFX11-NEXT: s_clause 0x1
485 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
486 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
487 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
488 ; GFX11-NEXT: s_add_i32 s2, s2, s3
489 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
490 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
491 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
492 ; GFX11-NEXT: s_nop 0
493 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
494 ; GFX11-NEXT: s_endpgm
495 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
496 %val = extractvalue { i32, i1 } %uadd, 0
497 %carry = extractvalue { i32, i1 } %uadd, 1
498 store i32 %val, ptr addrspace(1) %out, align 4
503 ; GCN-ISEL-LABEL: name: uaddo32_vcc_user
504 ; GCN-ISEL-LABEL: body:
505 ; GCN-ISEL-LABEL: bb.0
506 ; GCN-ISEL: V_ADD_CO_U32_e64
508 ; Below we check that the add is selected to the VALU form (v_add/v_addc),
509 ; because the only user of the VCC produced by the UADDO is v_cndmask.
510 ; Selecting the VALU form avoids an unnecessary s_cselect to copy SCC to VCC.
; uaddo32_vcc_user: calls llvm.uadd.with.overflow.i32 on the kernel arguments
; %a and %b, stores the sum to %out and the overflow bit to %carryout.
512 define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
513 ; CISI-LABEL: uaddo32_vcc_user:
515 ; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
516 ; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
517 ; CISI-NEXT: s_mov_b32 s3, 0xf000
518 ; CISI-NEXT: s_mov_b32 s2, -1
519 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
520 ; CISI-NEXT: s_mov_b32 s0, s4
521 ; CISI-NEXT: v_mov_b32_e32 v0, s9
522 ; CISI-NEXT: s_mov_b32 s1, s5
523 ; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
524 ; CISI-NEXT: s_mov_b32 s4, s6
525 ; CISI-NEXT: s_mov_b32 s5, s7
526 ; CISI-NEXT: s_mov_b32 s6, s2
527 ; CISI-NEXT: s_mov_b32 s7, s3
528 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
529 ; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0
530 ; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0
531 ; CISI-NEXT: s_endpgm
533 ; VI-LABEL: uaddo32_vcc_user:
535 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
536 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
537 ; VI-NEXT: s_waitcnt lgkmcnt(0)
538 ; VI-NEXT: v_mov_b32_e32 v0, s4
539 ; VI-NEXT: v_mov_b32_e32 v4, s1
540 ; VI-NEXT: v_mov_b32_e32 v1, s5
541 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
542 ; VI-NEXT: v_mov_b32_e32 v2, s6
543 ; VI-NEXT: v_mov_b32_e32 v3, s7
544 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
545 ; VI-NEXT: flat_store_dword v[0:1], v4
546 ; VI-NEXT: flat_store_byte v[2:3], v5
549 ; GFX9-LABEL: uaddo32_vcc_user:
551 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
552 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
553 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
554 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
555 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
556 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
557 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
558 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
559 ; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
560 ; GFX9-NEXT: s_endpgm
562 ; GFX1010-LABEL: uaddo32_vcc_user:
564 ; GFX1010-NEXT: s_clause 0x1
565 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
566 ; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
567 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
568 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX1010-NEXT: v_add_co_u32 v1, s0, s2, s3
570 ; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
571 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5]
572 ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7]
573 ; GFX1010-NEXT: s_endpgm
575 ; GFX1030W32-LABEL: uaddo32_vcc_user:
576 ; GFX1030W32: ; %bb.0:
577 ; GFX1030W32-NEXT: s_clause 0x1
578 ; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
579 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
580 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
581 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
582 ; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5
583 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
584 ; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
585 ; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
586 ; GFX1030W32-NEXT: s_endpgm
588 ; GFX1030W64-LABEL: uaddo32_vcc_user:
589 ; GFX1030W64: ; %bb.0:
590 ; GFX1030W64-NEXT: s_clause 0x1
591 ; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
592 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
593 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
594 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
595 ; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5
596 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
597 ; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
598 ; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
599 ; GFX1030W64-NEXT: s_endpgm
601 ; GFX11-LABEL: uaddo32_vcc_user:
603 ; GFX11-NEXT: s_clause 0x1
604 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
605 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
606 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
607 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
608 ; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5
609 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
610 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
611 ; GFX11-NEXT: s_clause 0x1
612 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
613 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
614 ; GFX11-NEXT: s_nop 0
615 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
616 ; GFX11-NEXT: s_endpgm
617 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
618 %val = extractvalue { i32, i1 } %uadd, 0
619 %carry = extractvalue { i32, i1 } %uadd, 1
620 store i32 %val, ptr addrspace(1) %out, align 4
621 store i1 %carry, ptr addrspace(1) %carryout
625 ; GCN-ISEL-LABEL: name: suaddo64
626 ; GCN-ISEL-LABEL: body:
627 ; GCN-ISEL-LABEL: bb.0
628 ; GCN-ISEL: S_ADD_U64_PSEUDO
; suaddo64: calls llvm.uadd.with.overflow.i64 on the kernel arguments %a and
; %b, stores the sum to %out and the overflow bit to %carryout. In the
; per-target checks below the sum is an s_add_u32 / s_addc_u32 pair and the
; overflow bit comes from an unsigned 64-bit compare of the sum against %a
; (v_cmp_lt_u64) followed by v_cndmask.
630 define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
631 ; CISI-LABEL: suaddo64:
633 ; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
634 ; CISI-NEXT: s_mov_b32 s11, 0xf000
635 ; CISI-NEXT: s_mov_b32 s10, -1
636 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
637 ; CISI-NEXT: s_add_u32 s6, s4, s6
638 ; CISI-NEXT: v_mov_b32_e32 v0, s4
639 ; CISI-NEXT: s_addc_u32 s7, s5, s7
640 ; CISI-NEXT: v_mov_b32_e32 v1, s5
641 ; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
642 ; CISI-NEXT: v_mov_b32_e32 v2, s6
643 ; CISI-NEXT: s_mov_b32 s8, s0
644 ; CISI-NEXT: s_mov_b32 s9, s1
645 ; CISI-NEXT: s_mov_b32 s0, s2
646 ; CISI-NEXT: s_mov_b32 s1, s3
647 ; CISI-NEXT: s_mov_b32 s2, s10
648 ; CISI-NEXT: s_mov_b32 s3, s11
649 ; CISI-NEXT: v_mov_b32_e32 v3, s7
650 ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
651 ; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
652 ; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
653 ; CISI-NEXT: s_endpgm
655 ; VI-LABEL: suaddo64:
657 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
658 ; VI-NEXT: s_waitcnt lgkmcnt(0)
659 ; VI-NEXT: v_mov_b32_e32 v0, s0
660 ; VI-NEXT: s_add_u32 s0, s4, s6
661 ; VI-NEXT: v_mov_b32_e32 v4, s4
662 ; VI-NEXT: v_mov_b32_e32 v1, s1
663 ; VI-NEXT: s_addc_u32 s1, s5, s7
664 ; VI-NEXT: v_mov_b32_e32 v5, s5
665 ; VI-NEXT: v_mov_b32_e32 v7, s1
666 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
667 ; VI-NEXT: v_mov_b32_e32 v6, s0
668 ; VI-NEXT: v_mov_b32_e32 v2, s2
669 ; VI-NEXT: v_mov_b32_e32 v3, s3
670 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
671 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
672 ; VI-NEXT: flat_store_byte v[2:3], v0
675 ; GFX9-LABEL: suaddo64:
677 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
678 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
679 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
680 ; GFX9-NEXT: s_add_u32 s6, s4, s6
681 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
682 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
683 ; GFX9-NEXT: s_addc_u32 s7, s5, s7
684 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
685 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
686 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
687 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
688 ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
689 ; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
690 ; GFX9-NEXT: s_endpgm
692 ; GFX1010-LABEL: suaddo64:
694 ; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
695 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
696 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
697 ; GFX1010-NEXT: s_add_u32 s6, s4, s6
698 ; GFX1010-NEXT: s_addc_u32 s7, s5, s7
699 ; GFX1010-NEXT: v_mov_b32_e32 v0, s6
700 ; GFX1010-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
701 ; GFX1010-NEXT: v_mov_b32_e32 v1, s7
702 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
703 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
704 ; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
705 ; GFX1010-NEXT: s_endpgm
707 ; GFX1030W32-LABEL: suaddo64:
708 ; GFX1030W32: ; %bb.0:
709 ; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
710 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
711 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
712 ; GFX1030W32-NEXT: s_add_u32 s6, s4, s6
713 ; GFX1030W32-NEXT: s_addc_u32 s7, s5, s7
714 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
715 ; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
716 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
717 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
718 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
719 ; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
720 ; GFX1030W32-NEXT: s_endpgm
722 ; GFX1030W64-LABEL: suaddo64:
723 ; GFX1030W64: ; %bb.0:
724 ; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
725 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
726 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
727 ; GFX1030W64-NEXT: s_add_u32 s6, s4, s6
728 ; GFX1030W64-NEXT: s_addc_u32 s7, s5, s7
729 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
730 ; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5]
731 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
732 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
733 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
734 ; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
735 ; GFX1030W64-NEXT: s_endpgm
737 ; GFX11-LABEL: suaddo64:
739 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
740 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX11-NEXT: s_add_u32 s6, s4, s6
742 ; GFX11-NEXT: s_addc_u32 s7, s5, s7
743 ; GFX11-NEXT: v_mov_b32_e32 v0, s6
744 ; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
745 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
746 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
747 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
748 ; GFX11-NEXT: s_clause 0x1
749 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
750 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
751 ; GFX11-NEXT: s_nop 0
752 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
753 ; GFX11-NEXT: s_endpgm
754 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
755 %val = extractvalue { i64, i1 } %uadd, 0
756 %carry = extractvalue { i64, i1 } %uadd, 1
757 store i64 %val, ptr addrspace(1) %out, align 8
758 store i1 %carry, ptr addrspace(1) %carryout
762 ; GCN-ISEL-LABEL: name: vuaddo64
763 ; GCN-ISEL-LABEL: body:
764 ; GCN-ISEL-LABEL: bb.0
765 ; GCN-ISEL: V_ADD_U64_PSEUDO
767 define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
768 ; CISI-LABEL: vuaddo64:
770 ; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
771 ; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
772 ; CISI-NEXT: s_mov_b32 s3, 0xf000
773 ; CISI-NEXT: s_mov_b32 s2, -1
774 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
775 ; CISI-NEXT: s_mov_b32 s0, s4
776 ; CISI-NEXT: v_mov_b32_e32 v1, s9
777 ; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
778 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
779 ; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
780 ; CISI-NEXT: s_mov_b32 s1, s5
781 ; CISI-NEXT: s_mov_b32 s4, s6
782 ; CISI-NEXT: s_mov_b32 s5, s7
783 ; CISI-NEXT: s_mov_b32 s6, s2
784 ; CISI-NEXT: s_mov_b32 s7, s3
785 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
786 ; CISI-NEXT: s_waitcnt expcnt(0)
787 ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
788 ; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0
789 ; CISI-NEXT: s_endpgm
791 ; VI-LABEL: vuaddo64:
793 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
794 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
795 ; VI-NEXT: s_waitcnt lgkmcnt(0)
796 ; VI-NEXT: v_mov_b32_e32 v1, s4
797 ; VI-NEXT: v_mov_b32_e32 v6, s1
798 ; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0
799 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
800 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[5:6]
801 ; VI-NEXT: v_mov_b32_e32 v2, s5
802 ; VI-NEXT: v_mov_b32_e32 v3, s6
803 ; VI-NEXT: v_mov_b32_e32 v4, s7
804 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
805 ; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
806 ; VI-NEXT: flat_store_byte v[3:4], v0
809 ; GFX9-LABEL: vuaddo64:
811 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
812 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
813 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
814 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
815 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
816 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
817 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
818 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
819 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
820 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
821 ; GFX9-NEXT: global_store_byte v2, v0, s[6:7]
822 ; GFX9-NEXT: s_endpgm
824 ; GFX1010-LABEL: vuaddo64:
826 ; GFX1010-NEXT: s_clause 0x1
827 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
828 ; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
829 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
830 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
831 ; GFX1010-NEXT: v_add_co_u32 v0, s0, s2, v0
832 ; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
833 ; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
834 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
835 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
836 ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
837 ; GFX1010-NEXT: s_endpgm
839 ; GFX1030W32-LABEL: vuaddo64:
840 ; GFX1030W32: ; %bb.0:
841 ; GFX1030W32-NEXT: s_clause 0x1
842 ; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
843 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
844 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
845 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
846 ; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0
847 ; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
848 ; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
849 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
850 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
851 ; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
852 ; GFX1030W32-NEXT: s_endpgm
854 ; GFX1030W64-LABEL: vuaddo64:
855 ; GFX1030W64: ; %bb.0:
856 ; GFX1030W64-NEXT: s_clause 0x1
857 ; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
858 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
859 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
860 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
861 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0
862 ; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
863 ; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
864 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
865 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
866 ; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
867 ; GFX1030W64-NEXT: s_endpgm
869 ; GFX11-LABEL: vuaddo64:
871 ; GFX11-NEXT: s_clause 0x1
872 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
873 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
874 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
875 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
876 ; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0
877 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
878 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
879 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
880 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
881 ; GFX11-NEXT: s_clause 0x1
882 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
883 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
884 ; GFX11-NEXT: s_nop 0
885 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
886 ; GFX11-NEXT: s_endpgm
887 %tid = call i32 @llvm.amdgcn.workitem.id.x()
888 %tid.ext = sext i32 %tid to i64
889 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
890 %val = extractvalue { i64, i1 } %uadd, 0
891 %carry = extractvalue { i64, i1 } %uadd, 1
892 store i64 %val, ptr addrspace(1) %out, align 8
893 store i1 %carry, ptr addrspace(1) %carryout
897 ; GCN-ISEL-LABEL: name: ssub64rr
898 ; GCN-ISEL-LABEL: body:
899 ; GCN-ISEL-LABEL: bb.0.entry:
900 ; GCN-ISEL: S_SUB_U64_PSEUDO
; Test kernel ssub64rr - 64-bit subtraction of two kernel arguments (%a - %b),
; with the difference stored to %out.
; The ISel-stage check above this function expects S_SUB_U64_PSEUDO to be selected.
; In every per-target block below, the final code is expected to contain an
; s_sub_u32 / s_subb_u32 pair operating on the two 32-bit halves.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
902 define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
903 ; CISI-LABEL: ssub64rr:
904 ; CISI: ; %bb.0: ; %entry
905 ; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
906 ; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
907 ; CISI-NEXT: s_mov_b32 s3, 0xf000
908 ; CISI-NEXT: s_mov_b32 s2, -1
909 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
910 ; CISI-NEXT: s_mov_b32 s0, s4
911 ; CISI-NEXT: s_sub_u32 s4, s6, s8
912 ; CISI-NEXT: s_mov_b32 s1, s5
913 ; CISI-NEXT: s_subb_u32 s5, s7, s9
914 ; CISI-NEXT: v_mov_b32_e32 v0, s4
915 ; CISI-NEXT: v_mov_b32_e32 v1, s5
916 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
917 ; CISI-NEXT: s_endpgm
919 ; VI-LABEL: ssub64rr:
920 ; VI: ; %bb.0: ; %entry
921 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
922 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
923 ; VI-NEXT: s_waitcnt lgkmcnt(0)
924 ; VI-NEXT: v_mov_b32_e32 v0, s4
925 ; VI-NEXT: s_sub_u32 s0, s6, s0
926 ; VI-NEXT: s_subb_u32 s1, s7, s1
927 ; VI-NEXT: v_mov_b32_e32 v3, s1
928 ; VI-NEXT: v_mov_b32_e32 v1, s5
929 ; VI-NEXT: v_mov_b32_e32 v2, s0
930 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
933 ; GFX9-LABEL: ssub64rr:
934 ; GFX9: ; %bb.0: ; %entry
935 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
936 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
937 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
938 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
939 ; GFX9-NEXT: s_sub_u32 s0, s6, s2
940 ; GFX9-NEXT: s_subb_u32 s1, s7, s3
941 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
942 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
943 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
944 ; GFX9-NEXT: s_endpgm
946 ; GFX1010-LABEL: ssub64rr:
947 ; GFX1010: ; %bb.0: ; %entry
948 ; GFX1010-NEXT: s_clause 0x1
949 ; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
950 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
951 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
952 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
953 ; GFX1010-NEXT: s_sub_u32 s0, s6, s2
954 ; GFX1010-NEXT: s_subb_u32 s1, s7, s3
955 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0
956 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1
957 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
958 ; GFX1010-NEXT: s_endpgm
960 ; GFX1030W32-LABEL: ssub64rr:
961 ; GFX1030W32: ; %bb.0: ; %entry
962 ; GFX1030W32-NEXT: s_clause 0x1
963 ; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
964 ; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
965 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
966 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
967 ; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0
968 ; GFX1030W32-NEXT: s_subb_u32 s1, s7, s1
969 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
970 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
971 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
972 ; GFX1030W32-NEXT: s_endpgm
974 ; GFX1030W64-LABEL: ssub64rr:
975 ; GFX1030W64: ; %bb.0: ; %entry
976 ; GFX1030W64-NEXT: s_clause 0x1
977 ; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
978 ; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
979 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
980 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
981 ; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0
982 ; GFX1030W64-NEXT: s_subb_u32 s1, s7, s1
983 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
984 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
985 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
986 ; GFX1030W64-NEXT: s_endpgm
988 ; GFX11-LABEL: ssub64rr:
989 ; GFX11: ; %bb.0: ; %entry
990 ; GFX11-NEXT: s_clause 0x1
991 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
992 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
993 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
994 ; GFX11-NEXT: s_sub_u32 s0, s6, s0
995 ; GFX11-NEXT: s_subb_u32 s1, s7, s1
996 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
997 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
998 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
999 ; GFX11-NEXT: s_nop 0
1000 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1001 ; GFX11-NEXT: s_endpgm
; IR under test: both operands are kernel arguments.
1003 %sub = sub i64 %a, %b
1004 store i64 %sub, ptr addrspace(1) %out
1008 ; GCN-ISEL-LABEL: name: ssub64ri
1009 ; GCN-ISEL-LABEL: body:
1010 ; GCN-ISEL-LABEL: bb.0.entry:
1011 ; GCN-ISEL: S_SUB_U64_PSEUDO
; Test kernel ssub64ri - 64-bit subtraction of kernel argument %a from the
; immediate 20015998343286 (0x123456789876), with the result stored to %out.
; The ISel-stage check above this function expects S_SUB_U64_PSEUDO to be selected.
; In every per-target block below the immediate appears split into its two
; 32-bit halves: 0x56789876 as the first operand of s_sub_u32 and 0x1234 as
; the first operand of s_subb_u32.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1013 define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
1014 ; CISI-LABEL: ssub64ri:
1015 ; CISI: ; %bb.0: ; %entry
1016 ; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1017 ; CISI-NEXT: s_mov_b32 s7, 0xf000
1018 ; CISI-NEXT: s_mov_b32 s6, -1
1019 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1020 ; CISI-NEXT: s_mov_b32 s4, s0
1021 ; CISI-NEXT: s_sub_u32 s0, 0x56789876, s2
1022 ; CISI-NEXT: s_mov_b32 s5, s1
1023 ; CISI-NEXT: s_subb_u32 s1, 0x1234, s3
1024 ; CISI-NEXT: v_mov_b32_e32 v0, s0
1025 ; CISI-NEXT: v_mov_b32_e32 v1, s1
1026 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1027 ; CISI-NEXT: s_endpgm
1029 ; VI-LABEL: ssub64ri:
1030 ; VI: ; %bb.0: ; %entry
1031 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1032 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1033 ; VI-NEXT: v_mov_b32_e32 v0, s0
1034 ; VI-NEXT: s_sub_u32 s0, 0x56789876, s2
1035 ; VI-NEXT: v_mov_b32_e32 v1, s1
1036 ; VI-NEXT: s_subb_u32 s1, 0x1234, s3
1037 ; VI-NEXT: v_mov_b32_e32 v3, s1
1038 ; VI-NEXT: v_mov_b32_e32 v2, s0
1039 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1042 ; GFX9-LABEL: ssub64ri:
1043 ; GFX9: ; %bb.0: ; %entry
1044 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1045 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1046 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1047 ; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2
1048 ; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3
1049 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1050 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1051 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1052 ; GFX9-NEXT: s_endpgm
1054 ; GFX1010-LABEL: ssub64ri:
1055 ; GFX1010: ; %bb.0: ; %entry
1056 ; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1057 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
1058 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
1059 ; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2
1060 ; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3
1061 ; GFX1010-NEXT: v_mov_b32_e32 v0, s2
1062 ; GFX1010-NEXT: v_mov_b32_e32 v1, s3
1063 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1064 ; GFX1010-NEXT: s_endpgm
1066 ; GFX1030W32-LABEL: ssub64ri:
1067 ; GFX1030W32: ; %bb.0: ; %entry
1068 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1069 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
1070 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
1071 ; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2
1072 ; GFX1030W32-NEXT: s_subb_u32 s3, 0x1234, s3
1073 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
1074 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
1075 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1076 ; GFX1030W32-NEXT: s_endpgm
1078 ; GFX1030W64-LABEL: ssub64ri:
1079 ; GFX1030W64: ; %bb.0: ; %entry
1080 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1081 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
1082 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
1083 ; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2
1084 ; GFX1030W64-NEXT: s_subb_u32 s3, 0x1234, s3
1085 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
1086 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
1087 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1088 ; GFX1030W64-NEXT: s_endpgm
1090 ; GFX11-LABEL: ssub64ri:
1091 ; GFX11: ; %bb.0: ; %entry
1092 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1093 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1094 ; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2
1095 ; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3
1096 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1097 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1098 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1099 ; GFX11-NEXT: s_nop 0
1100 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1101 ; GFX11-NEXT: s_endpgm
; IR under test: the immediate is the minuend, the kernel argument the subtrahend.
1103 %sub = sub i64 20015998343286, %a
1104 store i64 %sub, ptr addrspace(1) %out
1108 ; GCN-ISEL-LABEL: name: vsub64rr
1109 ; GCN-ISEL-LABEL: body:
1110 ; GCN-ISEL-LABEL: bb.0.entry:
1111 ; GCN-ISEL: V_SUB_U64_PSEUDO
; Test kernel vsub64rr - 64-bit subtraction of the sign-extended workitem id
; (llvm.amdgcn.workitem.id.x) from kernel argument %a, stored to %out.
; Note that the kernel takes a single i64 argument; the second operand is
; the workitem id, not another argument.
; The ISel-stage check above this function expects V_SUB_U64_PSEUDO to be selected.
; In every per-target block below, the final code is expected to be a vector
; subtract that produces a carry, followed by a second vector subtract that
; consumes it (v_subbrev_* or v_sub_co_ci_*, depending on the target).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1113 define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
1114 ; CISI-LABEL: vsub64rr:
1115 ; CISI: ; %bb.0: ; %entry
1116 ; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1117 ; CISI-NEXT: s_mov_b32 s7, 0xf000
1118 ; CISI-NEXT: s_mov_b32 s6, -1
1119 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1120 ; CISI-NEXT: v_mov_b32_e32 v1, s3
1121 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
1122 ; CISI-NEXT: s_mov_b32 s4, s0
1123 ; CISI-NEXT: s_mov_b32 s5, s1
1124 ; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1125 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1126 ; CISI-NEXT: s_endpgm
1128 ; VI-LABEL: vsub64rr:
1129 ; VI: ; %bb.0: ; %entry
1130 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1131 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1132 ; VI-NEXT: v_mov_b32_e32 v4, s3
1133 ; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0
1134 ; VI-NEXT: v_mov_b32_e32 v1, s0
1135 ; VI-NEXT: v_mov_b32_e32 v2, s1
1136 ; VI-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
1137 ; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
1140 ; GFX9-LABEL: vsub64rr:
1141 ; GFX9: ; %bb.0: ; %entry
1142 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1143 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1144 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1145 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1146 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
1147 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1148 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1149 ; GFX9-NEXT: s_endpgm
1151 ; GFX1010-LABEL: vsub64rr:
1152 ; GFX1010: ; %bb.0: ; %entry
1153 ; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1154 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
1155 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
1156 ; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0
1157 ; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
1158 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1159 ; GFX1010-NEXT: s_endpgm
1161 ; GFX1030W32-LABEL: vsub64rr:
1162 ; GFX1030W32: ; %bb.0: ; %entry
1163 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1164 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
1165 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
1166 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0
1167 ; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
1168 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1169 ; GFX1030W32-NEXT: s_endpgm
1171 ; GFX1030W64-LABEL: vsub64rr:
1172 ; GFX1030W64: ; %bb.0: ; %entry
1173 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1174 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
1175 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
1176 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0
1177 ; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
1178 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1179 ; GFX1030W64-NEXT: s_endpgm
1181 ; GFX11-LABEL: vsub64rr:
1182 ; GFX11: ; %bb.0: ; %entry
1183 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1184 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1185 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1186 ; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0
1187 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1188 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
1189 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1190 ; GFX11-NEXT: s_nop 0
1191 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1192 ; GFX11-NEXT: s_endpgm
; IR under test: the subtrahend is derived from the per-lane workitem id.
1194 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1195 %tid.ext = sext i32 %tid to i64
1196 %sub = sub i64 %a, %tid.ext
1197 store i64 %sub, ptr addrspace(1) %out
1201 ; GCN-ISEL-LABEL: name: vsub64ri
1202 ; GCN-ISEL-LABEL: body:
1203 ; GCN-ISEL-LABEL: bb.0.entry:
1204 ; GCN-ISEL: V_SUB_U64_PSEUDO
; Test kernel vsub64ri - 64-bit subtraction of the sign-extended workitem id
; (llvm.amdgcn.workitem.id.x) from the immediate 20015998343286
; (0x123456789876), stored to %out.
; The ISel-stage check above this function expects V_SUB_U64_PSEUDO to be selected.
; In every per-target block below the immediate appears split into 0x56789876
; (operand of the first vector subtract) and 0x1234 (moved into a VGPR or used
; directly by the carry-consuming subtract, depending on the target).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1206 define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
1207 ; CISI-LABEL: vsub64ri:
1208 ; CISI: ; %bb.0: ; %entry
1209 ; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1210 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, 0x56789876, v0
1211 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234
1212 ; CISI-NEXT: s_mov_b32 s3, 0xf000
1213 ; CISI-NEXT: s_mov_b32 s2, -1
1214 ; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1215 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1216 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1217 ; CISI-NEXT: s_endpgm
1219 ; VI-LABEL: vsub64ri:
1220 ; VI: ; %bb.0: ; %entry
1221 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1222 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0
1223 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234
1224 ; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1225 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1226 ; VI-NEXT: v_mov_b32_e32 v3, s1
1227 ; VI-NEXT: v_mov_b32_e32 v2, s0
1228 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1231 ; GFX9-LABEL: vsub64ri:
1232 ; GFX9: ; %bb.0: ; %entry
1233 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1234 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0
1235 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
1236 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1237 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1238 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1239 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1240 ; GFX9-NEXT: s_endpgm
1242 ; GFX1010-LABEL: vsub64ri:
1243 ; GFX1010: ; %bb.0: ; %entry
1244 ; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1245 ; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
1246 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
1247 ; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
1248 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
1249 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1250 ; GFX1010-NEXT: s_endpgm
1252 ; GFX1030W32-LABEL: vsub64ri:
1253 ; GFX1030W32: ; %bb.0: ; %entry
1254 ; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1255 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
1256 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
1257 ; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
1258 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
1259 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1260 ; GFX1030W32-NEXT: s_endpgm
1262 ; GFX1030W64-LABEL: vsub64ri:
1263 ; GFX1030W64: ; %bb.0: ; %entry
1264 ; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1265 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0
1266 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
1267 ; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
1268 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
1269 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1270 ; GFX1030W64-NEXT: s_endpgm
1272 ; GFX11-LABEL: vsub64ri:
1273 ; GFX11: ; %bb.0: ; %entry
1274 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1275 ; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
1276 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1277 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
1278 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1279 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1280 ; GFX11-NEXT: s_nop 0
1281 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1282 ; GFX11-NEXT: s_endpgm
; IR under test: the immediate is the minuend, the per-lane workitem id the subtrahend.
1284 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1285 %tid.ext = sext i32 %tid to i64
1286 %sub = sub i64 20015998343286, %tid.ext
1287 store i64 %sub, ptr addrspace(1) %out
1291 ; GCN-ISEL-LABEL: name: susubo32
1292 ; GCN-ISEL-LABEL: body:
1293 ; GCN-ISEL-LABEL: bb.0
1294 ; GCN-ISEL: S_SUB_I32
; Test kernel susubo32 - 32-bit llvm.usub.with.overflow on two kernel arguments
; (%a, %b); only the difference is stored, to %out.
; The ISel-stage check above this function expects S_SUB_I32 to be selected.
; In every per-target block below, the final code is expected to contain a
; single s_sub_i32 and one dword store, with no byte store for the carry.
; NOTE(review): %carry is extracted but never stored in the visible body, and
; the %carryout parameter is unused - presumably intentional, so that the
; overflow bit has no user; confirm before "fixing" it.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1296 define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
1297 ; CISI-LABEL: susubo32:
1299 ; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1300 ; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1301 ; CISI-NEXT: s_mov_b32 s3, 0xf000
1302 ; CISI-NEXT: s_mov_b32 s2, -1
1303 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1304 ; CISI-NEXT: s_sub_i32 s4, s4, s5
1305 ; CISI-NEXT: v_mov_b32_e32 v0, s4
1306 ; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1307 ; CISI-NEXT: s_endpgm
1309 ; VI-LABEL: susubo32:
1311 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1312 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1313 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1314 ; VI-NEXT: s_sub_i32 s2, s2, s3
1315 ; VI-NEXT: v_mov_b32_e32 v0, s0
1316 ; VI-NEXT: v_mov_b32_e32 v1, s1
1317 ; VI-NEXT: v_mov_b32_e32 v2, s2
1318 ; VI-NEXT: flat_store_dword v[0:1], v2
1321 ; GFX9-LABEL: susubo32:
1323 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1324 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1325 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1326 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1327 ; GFX9-NEXT: s_sub_i32 s0, s2, s3
1328 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
1329 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1330 ; GFX9-NEXT: s_endpgm
1332 ; GFX1010-LABEL: susubo32:
1334 ; GFX1010-NEXT: s_clause 0x1
1335 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1336 ; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1337 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
1338 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
1339 ; GFX1010-NEXT: s_sub_i32 s0, s2, s3
1340 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0
1341 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5]
1342 ; GFX1010-NEXT: s_endpgm
1344 ; GFX1030W32-LABEL: susubo32:
1345 ; GFX1030W32: ; %bb.0:
1346 ; GFX1030W32-NEXT: s_clause 0x1
1347 ; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1348 ; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1349 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
1350 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
1351 ; GFX1030W32-NEXT: s_sub_i32 s2, s2, s3
1352 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
1353 ; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
1354 ; GFX1030W32-NEXT: s_endpgm
1356 ; GFX1030W64-LABEL: susubo32:
1357 ; GFX1030W64: ; %bb.0:
1358 ; GFX1030W64-NEXT: s_clause 0x1
1359 ; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1360 ; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1361 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
1362 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
1363 ; GFX1030W64-NEXT: s_sub_i32 s2, s2, s3
1364 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
1365 ; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
1366 ; GFX1030W64-NEXT: s_endpgm
1368 ; GFX11-LABEL: susubo32:
1370 ; GFX11-NEXT: s_clause 0x1
1371 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
1372 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1373 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1374 ; GFX11-NEXT: s_sub_i32 s2, s2, s3
1375 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1376 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
1377 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1378 ; GFX11-NEXT: s_nop 0
1379 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1380 ; GFX11-NEXT: s_endpgm
; IR under test: element 0 of the intrinsic result is the difference,
; element 1 the overflow bit; only element 0 reaches a store here.
1381 %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
1382 %val = extractvalue { i32, i1 } %usub, 0
1383 %carry = extractvalue { i32, i1 } %usub, 1
1384 store i32 %val, ptr addrspace(1) %out, align 4
1389 ; GCN-ISEL-LABEL: name: usubo32_vcc_user
1390 ; GCN-ISEL-LABEL: body:
1391 ; GCN-ISEL-LABEL: bb.0
1392 ; GCN-ISEL: V_SUB_CO_U32_e64
1394 ; Below we check selection to v_sub/subb
1395 ; because the only user of VCC produced by the USUBO is v_cndmask.
1396 ; We select the VALU form to avoid an unnecessary s_cselect to copy SCC to VCC.
; Test kernel usubo32_vcc_user - 32-bit llvm.usub.with.overflow on two kernel
; arguments (%a, %b); the difference is stored to %out and the overflow bit
; is stored as an i1 to %carryout.
; The ISel-stage check above this function expects V_SUB_CO_U32_e64 to be selected.
; In every per-target block below, the final code is expected to contain a
; carry-producing vector subtract whose carry output feeds a v_cndmask_b32
; that materializes the stored byte.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1398 define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
1399 ; CISI-LABEL: usubo32_vcc_user:
1401 ; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1402 ; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1403 ; CISI-NEXT: s_mov_b32 s3, 0xf000
1404 ; CISI-NEXT: s_mov_b32 s2, -1
1405 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1406 ; CISI-NEXT: s_mov_b32 s0, s4
1407 ; CISI-NEXT: v_mov_b32_e32 v0, s9
1408 ; CISI-NEXT: s_mov_b32 s1, s5
1409 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
1410 ; CISI-NEXT: s_mov_b32 s4, s6
1411 ; CISI-NEXT: s_mov_b32 s5, s7
1412 ; CISI-NEXT: s_mov_b32 s6, s2
1413 ; CISI-NEXT: s_mov_b32 s7, s3
1414 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
1415 ; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1416 ; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0
1417 ; CISI-NEXT: s_endpgm
1419 ; VI-LABEL: usubo32_vcc_user:
1421 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1422 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1423 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1424 ; VI-NEXT: v_mov_b32_e32 v0, s4
1425 ; VI-NEXT: v_mov_b32_e32 v4, s1
1426 ; VI-NEXT: v_mov_b32_e32 v1, s5
1427 ; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
1428 ; VI-NEXT: v_mov_b32_e32 v2, s6
1429 ; VI-NEXT: v_mov_b32_e32 v3, s7
1430 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1431 ; VI-NEXT: flat_store_dword v[0:1], v4
1432 ; VI-NEXT: flat_store_byte v[2:3], v5
1435 ; GFX9-LABEL: usubo32_vcc_user:
1437 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1438 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1439 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1440 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1441 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1442 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1
1443 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1444 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1445 ; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
1446 ; GFX9-NEXT: s_endpgm
1448 ; GFX1010-LABEL: usubo32_vcc_user:
1450 ; GFX1010-NEXT: s_clause 0x1
1451 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1452 ; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1453 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
1454 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
1455 ; GFX1010-NEXT: v_sub_co_u32 v1, s0, s2, s3
1456 ; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
1457 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5]
1458 ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7]
1459 ; GFX1010-NEXT: s_endpgm
1461 ; GFX1030W32-LABEL: usubo32_vcc_user:
1462 ; GFX1030W32: ; %bb.0:
1463 ; GFX1030W32-NEXT: s_clause 0x1
1464 ; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
1465 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1466 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
1467 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
1468 ; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5
1469 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
1470 ; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
1471 ; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
1472 ; GFX1030W32-NEXT: s_endpgm
1474 ; GFX1030W64-LABEL: usubo32_vcc_user:
1475 ; GFX1030W64: ; %bb.0:
1476 ; GFX1030W64-NEXT: s_clause 0x1
1477 ; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
1478 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1479 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
1480 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
1481 ; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5
1482 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
1483 ; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
1484 ; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
1485 ; GFX1030W64-NEXT: s_endpgm
1487 ; GFX11-LABEL: usubo32_vcc_user:
1489 ; GFX11-NEXT: s_clause 0x1
1490 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
1491 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1492 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1493 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1494 ; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5
1495 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1496 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
1497 ; GFX11-NEXT: s_clause 0x1
1498 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1499 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
1500 ; GFX11-NEXT: s_nop 0
1501 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1502 ; GFX11-NEXT: s_endpgm
; IR under test: both the difference and the overflow bit are stored, so the
; overflow bit has a user.
1503 %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
1504 %val = extractvalue { i32, i1 } %usub, 0
1505 %carry = extractvalue { i32, i1 } %usub, 1
1506 store i32 %val, ptr addrspace(1) %out, align 4
1507 store i1 %carry, ptr addrspace(1) %carryout
1511 ; GCN-ISEL-LABEL: name: susubo64
1512 ; GCN-ISEL-LABEL: body:
1513 ; GCN-ISEL-LABEL: bb.0
1514 ; GCN-ISEL: S_SUB_U64_PSEUDO
; Test kernel susubo64 - 64-bit llvm.usub.with.overflow on two kernel arguments
; (%a, %b); the difference is stored to %out and the overflow bit is stored
; as an i1 to %carryout.
; The ISel-stage check above this function expects S_SUB_U64_PSEUDO to be selected.
; In every per-target block below, the final code is expected to contain an
; s_sub_u32 / s_subb_u32 pair for the difference, followed by a
; v_cmp_gt_u64 + v_cndmask_b32 sequence that produces the stored carry byte.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1516 define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
1517 ; CISI-LABEL: susubo64:
1519 ; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
1520 ; CISI-NEXT: s_mov_b32 s11, 0xf000
1521 ; CISI-NEXT: s_mov_b32 s10, -1
1522 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1523 ; CISI-NEXT: s_sub_u32 s6, s4, s6
1524 ; CISI-NEXT: v_mov_b32_e32 v0, s4
1525 ; CISI-NEXT: s_subb_u32 s7, s5, s7
1526 ; CISI-NEXT: v_mov_b32_e32 v1, s5
1527 ; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
1528 ; CISI-NEXT: v_mov_b32_e32 v2, s6
1529 ; CISI-NEXT: s_mov_b32 s8, s0
1530 ; CISI-NEXT: s_mov_b32 s9, s1
1531 ; CISI-NEXT: s_mov_b32 s0, s2
1532 ; CISI-NEXT: s_mov_b32 s1, s3
1533 ; CISI-NEXT: s_mov_b32 s2, s10
1534 ; CISI-NEXT: s_mov_b32 s3, s11
1535 ; CISI-NEXT: v_mov_b32_e32 v3, s7
1536 ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1537 ; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
1538 ; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1539 ; CISI-NEXT: s_endpgm
1541 ; VI-LABEL: susubo64:
1543 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1544 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1545 ; VI-NEXT: v_mov_b32_e32 v0, s0
1546 ; VI-NEXT: s_sub_u32 s0, s4, s6
1547 ; VI-NEXT: v_mov_b32_e32 v4, s4
1548 ; VI-NEXT: v_mov_b32_e32 v1, s1
1549 ; VI-NEXT: s_subb_u32 s1, s5, s7
1550 ; VI-NEXT: v_mov_b32_e32 v5, s5
1551 ; VI-NEXT: v_mov_b32_e32 v7, s1
1552 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
1553 ; VI-NEXT: v_mov_b32_e32 v6, s0
1554 ; VI-NEXT: v_mov_b32_e32 v2, s2
1555 ; VI-NEXT: v_mov_b32_e32 v3, s3
1556 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
1557 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1558 ; VI-NEXT: flat_store_byte v[2:3], v0
1561 ; GFX9-LABEL: susubo64:
1563 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1564 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1565 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1566 ; GFX9-NEXT: s_sub_u32 s6, s4, s6
1567 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1568 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1569 ; GFX9-NEXT: s_subb_u32 s7, s5, s7
1570 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1571 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
1572 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1573 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1574 ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
1575 ; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
1576 ; GFX9-NEXT: s_endpgm
1578 ; GFX1010-LABEL: susubo64:
1580 ; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1581 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
1582 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
1583 ; GFX1010-NEXT: s_sub_u32 s6, s4, s6
1584 ; GFX1010-NEXT: s_subb_u32 s7, s5, s7
1585 ; GFX1010-NEXT: v_mov_b32_e32 v0, s6
1586 ; GFX1010-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
1587 ; GFX1010-NEXT: v_mov_b32_e32 v1, s7
1588 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
1589 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1590 ; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
1591 ; GFX1010-NEXT: s_endpgm
1593 ; GFX1030W32-LABEL: susubo64:
1594 ; GFX1030W32: ; %bb.0:
1595 ; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1596 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
1597 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
1598 ; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6
1599 ; GFX1030W32-NEXT: s_subb_u32 s7, s5, s7
1600 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
1601 ; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
1602 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
1603 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
1604 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1605 ; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
1606 ; GFX1030W32-NEXT: s_endpgm
1608 ; GFX1030W64-LABEL: susubo64:
1609 ; GFX1030W64: ; %bb.0:
1610 ; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1611 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
1612 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
1613 ; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6
1614 ; GFX1030W64-NEXT: s_subb_u32 s7, s5, s7
1615 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
1616 ; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5]
1617 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
1618 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
1619 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1620 ; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
1621 ; GFX1030W64-NEXT: s_endpgm
1623 ; GFX11-LABEL: susubo64:
1625 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
1626 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1627 ; GFX11-NEXT: s_sub_u32 s6, s4, s6
1628 ; GFX11-NEXT: s_subb_u32 s7, s5, s7
1629 ; GFX11-NEXT: v_mov_b32_e32 v0, s6
1630 ; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
1631 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
1632 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1633 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
1634 ; GFX11-NEXT: s_clause 0x1
1635 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1636 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
1637 ; GFX11-NEXT: s_nop 0
1638 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1639 ; GFX11-NEXT: s_endpgm
; IR under test: both the 64-bit difference and the overflow bit are stored.
1640 %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
1641 %val = extractvalue { i64, i1 } %usub, 0
1642 %carry = extractvalue { i64, i1 } %usub, 1
1643 store i64 %val, ptr addrspace(1) %out, align 8
1644 store i1 %carry, ptr addrspace(1) %carryout
; Test kernel vusubo64 -- 64-bit unsigned subtract-with-overflow in which the
; subtrahend is the sign-extended workitem id (llvm.amdgcn.workitem.id.x) and
; the minuend is the kernel argument %a. The 64-bit difference is stored to
; %out and the i1 overflow flag is stored to %carryout.
;
; The instruction-selection check directly below expects a V_SUB_U64_PSEUDO
; after the bb.0 label. NOTE(review): presumably the vector pseudo is selected
; because one operand is a per-lane value -- confirm against the selector.
;
; In the per-target assembly checks the overflow flag is produced by an
; unsigned 64-bit compare (v_cmp_lt_u64) of the loaded minuend against the
; difference, followed by a v_cndmask that materialises 0 or 1.
;
; All assertion lines in this block are generated by
; utils/update_llc_test_checks.py; regenerate them rather than editing by hand.
1648 ; GCN-ISEL-LABEL: name: vusubo64
1649 ; GCN-ISEL-LABEL: body:
1650 ; GCN-ISEL-LABEL: bb.0
1651 ; GCN-ISEL: V_SUB_U64_PSEUDO
1653 define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
1654 ; CISI-LABEL: vusubo64:
1656 ; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1657 ; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1658 ; CISI-NEXT: s_mov_b32 s3, 0xf000
1659 ; CISI-NEXT: s_mov_b32 s2, -1
1660 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1661 ; CISI-NEXT: s_mov_b32 s0, s4
1662 ; CISI-NEXT: v_mov_b32_e32 v1, s9
1663 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
1664 ; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1665 ; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
1666 ; CISI-NEXT: s_mov_b32 s1, s5
1667 ; CISI-NEXT: s_mov_b32 s4, s6
1668 ; CISI-NEXT: s_mov_b32 s5, s7
1669 ; CISI-NEXT: s_mov_b32 s6, s2
1670 ; CISI-NEXT: s_mov_b32 s7, s3
1671 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1672 ; CISI-NEXT: s_waitcnt expcnt(0)
1673 ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1674 ; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1675 ; CISI-NEXT: s_endpgm
1677 ; VI-LABEL: vusubo64:
1679 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1680 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1681 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1682 ; VI-NEXT: v_mov_b32_e32 v1, s4
1683 ; VI-NEXT: v_mov_b32_e32 v6, s1
1684 ; VI-NEXT: v_sub_u32_e32 v5, vcc, s0, v0
1685 ; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
1686 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[5:6]
1687 ; VI-NEXT: v_mov_b32_e32 v2, s5
1688 ; VI-NEXT: v_mov_b32_e32 v3, s6
1689 ; VI-NEXT: v_mov_b32_e32 v4, s7
1690 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1691 ; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
1692 ; VI-NEXT: flat_store_byte v[3:4], v0
1695 ; GFX9-LABEL: vusubo64:
1697 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1698 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1699 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1700 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1701 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1702 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
1703 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1704 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
1705 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1706 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1707 ; GFX9-NEXT: global_store_byte v2, v0, s[6:7]
1708 ; GFX9-NEXT: s_endpgm
1710 ; GFX1010-LABEL: vusubo64:
1712 ; GFX1010-NEXT: s_clause 0x1
1713 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1714 ; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1715 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
1716 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
1717 ; GFX1010-NEXT: v_sub_co_u32 v0, s0, s2, v0
1718 ; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s3, 0, s0
1719 ; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
1720 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1721 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1722 ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
1723 ; GFX1010-NEXT: s_endpgm
1725 ; GFX1030W32-LABEL: vusubo64:
1726 ; GFX1030W32: ; %bb.0:
1727 ; GFX1030W32-NEXT: s_clause 0x1
1728 ; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
1729 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1730 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
1731 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
1732 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0
1733 ; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
1734 ; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
1735 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1736 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1737 ; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
1738 ; GFX1030W32-NEXT: s_endpgm
1740 ; GFX1030W64-LABEL: vusubo64:
1741 ; GFX1030W64: ; %bb.0:
1742 ; GFX1030W64-NEXT: s_clause 0x1
1743 ; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
1744 ; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1745 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
1746 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
1747 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0
1748 ; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
1749 ; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
1750 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1751 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1752 ; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
1753 ; GFX1030W64-NEXT: s_endpgm
1755 ; GFX11-LABEL: vusubo64:
1757 ; GFX11-NEXT: s_clause 0x1
1758 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
1759 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1760 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1761 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1762 ; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0
1763 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1764 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
1765 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
1766 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1767 ; GFX11-NEXT: s_clause 0x1
1768 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1769 ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
1770 ; GFX11-NEXT: s_nop 0
1771 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1772 ; GFX11-NEXT: s_endpgm
; Subtrahend: the workitem id in X, sign-extended from i32 to i64.
1773 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1774 %tid.ext = sext i32 %tid to i64
; %usub = { %a - %tid.ext, unsigned-overflow flag }
1775 %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
1776 %val = extractvalue { i64, i1 } %usub, 0
1777 %carry = extractvalue { i64, i1 } %usub, 1
; The difference goes to %out (8-byte aligned store); the overflow bit goes to
; %carryout as an i1 store.
1778 store i64 %val, ptr addrspace(1) %out, align 8
1779 store i1 %carry, ptr addrspace(1) %carryout
1783 ; GCN-ISEL-LABEL: name: sudiv64
1784 ; GCN-ISEL-LABEL: body:
1785 ; GCN-ISEL-LABEL: bb.3
1786 ; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64
1787 ; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, killed %{{[0-9]+}}, killed %[[CARRY]]
1788 ; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64
1789 ; GCN-ISEL: S_SUB_CO_PSEUDO killed %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]]
1791 define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
1792 ; CISI-LABEL: sudiv64:
1794 ; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1795 ; CISI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
1796 ; CISI-NEXT: s_waitcnt lgkmcnt(0)
1797 ; CISI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3]
1798 ; CISI-NEXT: s_mov_b32 s0, 0
1799 ; CISI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[0:1], 0
1800 ; CISI-NEXT: s_and_b64 vcc, exec, s[0:1]
1801 ; CISI-NEXT: s_cbranch_vccz .LBB16_4
1802 ; CISI-NEXT: ; %bb.1:
1803 ; CISI-NEXT: v_cvt_f32_u32_e32 v0, s2
1804 ; CISI-NEXT: v_cvt_f32_u32_e32 v1, s3
1805 ; CISI-NEXT: s_sub_u32 s0, 0, s2
1806 ; CISI-NEXT: s_subb_u32 s1, 0, s3
1807 ; CISI-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
1808 ; CISI-NEXT: v_rcp_f32_e32 v0, v0
1809 ; CISI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
1810 ; CISI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
1811 ; CISI-NEXT: v_trunc_f32_e32 v1, v1
1812 ; CISI-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
1813 ; CISI-NEXT: v_cvt_u32_f32_e32 v1, v1
1814 ; CISI-NEXT: v_cvt_u32_f32_e32 v0, v0
1815 ; CISI-NEXT: v_mul_lo_u32 v2, s0, v1
1816 ; CISI-NEXT: v_mul_hi_u32 v3, s0, v0
1817 ; CISI-NEXT: v_mul_lo_u32 v5, s1, v0
1818 ; CISI-NEXT: v_mul_lo_u32 v4, s0, v0
1819 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
1820 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v5
1821 ; CISI-NEXT: v_mul_hi_u32 v3, v0, v4
1822 ; CISI-NEXT: v_mul_lo_u32 v5, v0, v2
1823 ; CISI-NEXT: v_mul_hi_u32 v7, v0, v2
1824 ; CISI-NEXT: v_mul_lo_u32 v6, v1, v4
1825 ; CISI-NEXT: v_mul_hi_u32 v4, v1, v4
1826 ; CISI-NEXT: v_add_i32_e32 v3, vcc, v3, v5
1827 ; CISI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
1828 ; CISI-NEXT: v_mul_hi_u32 v7, v1, v2
1829 ; CISI-NEXT: v_mul_lo_u32 v2, v1, v2
1830 ; CISI-NEXT: v_add_i32_e32 v3, vcc, v3, v6
1831 ; CISI-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
1832 ; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
1833 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
1834 ; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
1835 ; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1836 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
1837 ; CISI-NEXT: v_mul_lo_u32 v2, s0, v1
1838 ; CISI-NEXT: v_mul_hi_u32 v3, s0, v0
1839 ; CISI-NEXT: v_mul_lo_u32 v4, s1, v0
1840 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
1841 ; CISI-NEXT: v_mul_lo_u32 v3, s0, v0
1842 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
1843 ; CISI-NEXT: v_mul_lo_u32 v6, v0, v2
1844 ; CISI-NEXT: v_mul_hi_u32 v7, v0, v3
1845 ; CISI-NEXT: v_mul_hi_u32 v8, v0, v2
1846 ; CISI-NEXT: v_mul_hi_u32 v5, v1, v3
1847 ; CISI-NEXT: v_mul_lo_u32 v3, v1, v3
1848 ; CISI-NEXT: v_mul_hi_u32 v4, v1, v2
1849 ; CISI-NEXT: v_add_i32_e32 v6, vcc, v7, v6
1850 ; CISI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
1851 ; CISI-NEXT: v_mul_lo_u32 v2, v1, v2
1852 ; CISI-NEXT: v_add_i32_e32 v3, vcc, v6, v3
1853 ; CISI-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
1854 ; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
1855 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
1856 ; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
1857 ; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1858 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
1859 ; CISI-NEXT: v_mul_lo_u32 v2, s6, v1
1860 ; CISI-NEXT: v_mul_hi_u32 v3, s6, v0
1861 ; CISI-NEXT: v_mul_hi_u32 v4, s6, v1
1862 ; CISI-NEXT: v_mul_hi_u32 v5, s7, v1
1863 ; CISI-NEXT: v_mul_lo_u32 v1, s7, v1
1864 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
1865 ; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
1866 ; CISI-NEXT: v_mul_lo_u32 v4, s7, v0
1867 ; CISI-NEXT: v_mul_hi_u32 v0, s7, v0
1868 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
1869 ; CISI-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
1870 ; CISI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
1871 ; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1872 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
1873 ; CISI-NEXT: v_mul_lo_u32 v2, s2, v1
1874 ; CISI-NEXT: v_mul_hi_u32 v3, s2, v0
1875 ; CISI-NEXT: v_mul_lo_u32 v4, s3, v0
1876 ; CISI-NEXT: v_mov_b32_e32 v5, s3
1877 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
1878 ; CISI-NEXT: v_mul_lo_u32 v3, s2, v0
1879 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
1880 ; CISI-NEXT: v_sub_i32_e32 v4, vcc, s7, v2
1881 ; CISI-NEXT: v_sub_i32_e32 v3, vcc, s6, v3
1882 ; CISI-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
1883 ; CISI-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3
1884 ; CISI-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
1885 ; CISI-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4
1886 ; CISI-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
1887 ; CISI-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5
1888 ; CISI-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1889 ; CISI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4
1890 ; CISI-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
1891 ; CISI-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0
1892 ; CISI-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
1893 ; CISI-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0
1894 ; CISI-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
1895 ; CISI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
1896 ; CISI-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1]
1897 ; CISI-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
1898 ; CISI-NEXT: v_mov_b32_e32 v6, s7
1899 ; CISI-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc
1900 ; CISI-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
1901 ; CISI-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
1902 ; CISI-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
1903 ; CISI-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
1904 ; CISI-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2
1905 ; CISI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
1906 ; CISI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
1907 ; CISI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
1908 ; CISI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1909 ; CISI-NEXT: s_cbranch_execnz .LBB16_3
1910 ; CISI-NEXT: .LBB16_2:
1911 ; CISI-NEXT: v_cvt_f32_u32_e32 v0, s2
1912 ; CISI-NEXT: s_sub_i32 s0, 0, s2
1913 ; CISI-NEXT: v_rcp_iflag_f32_e32 v0, v0
1914 ; CISI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1915 ; CISI-NEXT: v_cvt_u32_f32_e32 v0, v0
1916 ; CISI-NEXT: v_mul_lo_u32 v1, s0, v0
1917 ; CISI-NEXT: v_mul_hi_u32 v1, v0, v1
1918 ; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1919 ; CISI-NEXT: v_mul_hi_u32 v0, s6, v0
1920 ; CISI-NEXT: v_readfirstlane_b32 s0, v0
1921 ; CISI-NEXT: s_mul_i32 s0, s0, s2
1922 ; CISI-NEXT: s_sub_i32 s0, s6, s0
1923 ; CISI-NEXT: s_sub_i32 s1, s0, s2
1924 ; CISI-NEXT: v_add_i32_e32 v1, vcc, 1, v0
1925 ; CISI-NEXT: s_cmp_ge_u32 s0, s2
1926 ; CISI-NEXT: s_cselect_b64 vcc, -1, 0
1927 ; CISI-NEXT: s_cselect_b32 s0, s1, s0
1928 ; CISI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1929 ; CISI-NEXT: v_add_i32_e32 v1, vcc, 1, v0
1930 ; CISI-NEXT: s_cmp_ge_u32 s0, s2
1931 ; CISI-NEXT: s_cselect_b64 vcc, -1, 0
1932 ; CISI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1933 ; CISI-NEXT: v_mov_b32_e32 v1, 0
1934 ; CISI-NEXT: .LBB16_3:
1935 ; CISI-NEXT: s_mov_b32 s7, 0xf000
1936 ; CISI-NEXT: s_mov_b32 s6, -1
1937 ; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1938 ; CISI-NEXT: s_endpgm
1939 ; CISI-NEXT: .LBB16_4:
1940 ; CISI-NEXT: ; implicit-def: $vgpr0_vgpr1
1941 ; CISI-NEXT: s_branch .LBB16_2
1943 ; VI-LABEL: sudiv64:
1945 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1946 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1947 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1948 ; VI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3]
1949 ; VI-NEXT: s_mov_b32 s0, 0
1950 ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
1951 ; VI-NEXT: s_cbranch_scc0 .LBB16_4
1953 ; VI-NEXT: v_cvt_f32_u32_e32 v0, s2
1954 ; VI-NEXT: v_cvt_f32_u32_e32 v1, s3
1955 ; VI-NEXT: s_sub_u32 s8, 0, s2
1956 ; VI-NEXT: s_subb_u32 s9, 0, s3
1957 ; VI-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
1958 ; VI-NEXT: v_rcp_f32_e32 v0, v0
1959 ; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
1960 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
1961 ; VI-NEXT: v_trunc_f32_e32 v1, v1
1962 ; VI-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
1963 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v1
1964 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v0
1965 ; VI-NEXT: v_mul_lo_u32 v2, s8, v4
1966 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
1967 ; VI-NEXT: v_mul_lo_u32 v3, s9, v5
1968 ; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
1969 ; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3
1970 ; VI-NEXT: v_mul_hi_u32 v6, v5, v0
1971 ; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0
1972 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1
1973 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v0, 0
1974 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc
1975 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v3, 0
1976 ; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0
1977 ; VI-NEXT: v_addc_u32_e32 v0, vcc, v7, v1, vcc
1978 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1979 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1980 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1981 ; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v0
1982 ; VI-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc
1983 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
1984 ; VI-NEXT: v_mul_lo_u32 v4, s8, v7
1985 ; VI-NEXT: v_mul_lo_u32 v5, s9, v6
1986 ; VI-NEXT: v_mul_hi_u32 v8, v6, v0
1987 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0
1988 ; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1
1989 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v5
1990 ; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0
1991 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0
1992 ; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4
1993 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1994 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2
1995 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
1996 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1997 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
1998 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1999 ; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v0
2000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc
2001 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0
2002 ; VI-NEXT: v_mul_hi_u32 v4, s6, v2
2003 ; VI-NEXT: v_readfirstlane_b32 s8, v1
2004 ; VI-NEXT: v_readfirstlane_b32 s9, v0
2005 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v3, 0
2006 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v2, 0
2007 ; VI-NEXT: v_readfirstlane_b32 s10, v4
2008 ; VI-NEXT: s_add_u32 s0, s10, s9
2009 ; VI-NEXT: s_addc_u32 s1, 0, s8
2010 ; VI-NEXT: v_readfirstlane_b32 s10, v2
2011 ; VI-NEXT: v_readfirstlane_b32 s9, v3
2012 ; VI-NEXT: s_add_u32 s0, s0, s10
2013 ; VI-NEXT: v_readfirstlane_b32 s8, v1
2014 ; VI-NEXT: s_addc_u32 s0, s1, s9
2015 ; VI-NEXT: s_addc_u32 s10, s8, 0
2016 ; VI-NEXT: v_readfirstlane_b32 s1, v0
2017 ; VI-NEXT: s_add_u32 s11, s0, s1
2018 ; VI-NEXT: v_mov_b32_e32 v2, s11
2019 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0
2020 ; VI-NEXT: s_addc_u32 s10, 0, s10
2021 ; VI-NEXT: s_mul_i32 s0, s2, s10
2022 ; VI-NEXT: v_readfirstlane_b32 s1, v1
2023 ; VI-NEXT: s_add_i32 s0, s1, s0
2024 ; VI-NEXT: s_mul_i32 s1, s3, s11
2025 ; VI-NEXT: s_add_i32 s12, s0, s1
2026 ; VI-NEXT: s_sub_i32 s0, s7, s12
2027 ; VI-NEXT: v_sub_u32_e32 v0, vcc, s6, v0
2028 ; VI-NEXT: s_cmp_lg_u64 vcc, 0
2029 ; VI-NEXT: s_subb_u32 s13, s0, s3
2030 ; VI-NEXT: v_subrev_u32_e64 v1, s[0:1], s2, v0
2031 ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
2032 ; VI-NEXT: s_subb_u32 s13, s13, 0
2033 ; VI-NEXT: s_cmp_ge_u32 s13, s3
2034 ; VI-NEXT: s_cselect_b32 s14, -1, 0
2035 ; VI-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1
2036 ; VI-NEXT: s_cmp_eq_u32 s13, s3
2037 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
2038 ; VI-NEXT: v_mov_b32_e32 v3, s14
2039 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
2040 ; VI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
2041 ; VI-NEXT: s_add_u32 s0, s11, 1
2042 ; VI-NEXT: s_addc_u32 s13, s10, 0
2043 ; VI-NEXT: s_add_u32 s1, s11, 2
2044 ; VI-NEXT: s_addc_u32 s11, s10, 0
2045 ; VI-NEXT: v_mov_b32_e32 v3, s0
2046 ; VI-NEXT: v_mov_b32_e32 v4, s1
2047 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
2048 ; VI-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2049 ; VI-NEXT: v_mov_b32_e32 v1, s13
2050 ; VI-NEXT: v_mov_b32_e32 v4, s11
2051 ; VI-NEXT: s_cmp_lg_u64 vcc, 0
2052 ; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2053 ; VI-NEXT: s_subb_u32 s0, s7, s12
2054 ; VI-NEXT: s_cmp_ge_u32 s0, s3
2055 ; VI-NEXT: s_cselect_b32 s1, -1, 0
2056 ; VI-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
2057 ; VI-NEXT: s_cmp_eq_u32 s0, s3
2058 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2059 ; VI-NEXT: v_mov_b32_e32 v4, s1
2060 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2061 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
2062 ; VI-NEXT: v_mov_b32_e32 v4, s10
2063 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
2064 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
2065 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
2066 ; VI-NEXT: s_cbranch_execnz .LBB16_3
2067 ; VI-NEXT: .LBB16_2:
2068 ; VI-NEXT: v_cvt_f32_u32_e32 v0, s2
2069 ; VI-NEXT: s_sub_i32 s0, 0, s2
2070 ; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0
2071 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2072 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
2073 ; VI-NEXT: v_mul_lo_u32 v1, s0, v0
2074 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1
2075 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
2076 ; VI-NEXT: v_mul_hi_u32 v0, s6, v0
2077 ; VI-NEXT: v_readfirstlane_b32 s0, v0
2078 ; VI-NEXT: s_mul_i32 s0, s0, s2
2079 ; VI-NEXT: s_sub_i32 s0, s6, s0
2080 ; VI-NEXT: s_sub_i32 s1, s0, s2
2081 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0
2082 ; VI-NEXT: s_cmp_ge_u32 s0, s2
2083 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2084 ; VI-NEXT: s_cselect_b32 s0, s1, s0
2085 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2086 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0
2087 ; VI-NEXT: s_cmp_ge_u32 s0, s2
2088 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2089 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2090 ; VI-NEXT: v_mov_b32_e32 v1, 0
2091 ; VI-NEXT: .LBB16_3:
2092 ; VI-NEXT: v_mov_b32_e32 v2, s4
2093 ; VI-NEXT: v_mov_b32_e32 v3, s5
2094 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2096 ; VI-NEXT: .LBB16_4:
2097 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
2098 ; VI-NEXT: s_branch .LBB16_2
2100 ; GFX9-LABEL: sudiv64:
2102 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2103 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2104 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2105 ; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3]
2106 ; GFX9-NEXT: s_mov_b32 s0, 0
2107 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
2108 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4
2109 ; GFX9-NEXT: ; %bb.1:
2110 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
2111 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
2112 ; GFX9-NEXT: s_sub_u32 s0, 0, s2
2113 ; GFX9-NEXT: s_subb_u32 s1, 0, s3
2114 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
2115 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0
2116 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
2117 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
2118 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
2119 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
2120 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
2121 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2122 ; GFX9-NEXT: v_readfirstlane_b32 s10, v1
2123 ; GFX9-NEXT: v_readfirstlane_b32 s11, v0
2124 ; GFX9-NEXT: s_mul_i32 s12, s0, s10
2125 ; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11
2126 ; GFX9-NEXT: s_mul_i32 s13, s1, s11
2127 ; GFX9-NEXT: s_add_i32 s12, s14, s12
2128 ; GFX9-NEXT: s_add_i32 s12, s12, s13
2129 ; GFX9-NEXT: s_mul_i32 s15, s0, s11
2130 ; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
2131 ; GFX9-NEXT: s_mul_i32 s14, s11, s12
2132 ; GFX9-NEXT: s_mul_hi_u32 s11, s11, s15
2133 ; GFX9-NEXT: s_add_u32 s11, s11, s14
2134 ; GFX9-NEXT: s_addc_u32 s13, 0, s13
2135 ; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15
2136 ; GFX9-NEXT: s_mul_i32 s15, s10, s15
2137 ; GFX9-NEXT: s_add_u32 s11, s11, s15
2138 ; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12
2139 ; GFX9-NEXT: s_addc_u32 s11, s13, s16
2140 ; GFX9-NEXT: s_addc_u32 s13, s14, 0
2141 ; GFX9-NEXT: s_mul_i32 s12, s10, s12
2142 ; GFX9-NEXT: s_add_u32 s11, s11, s12
2143 ; GFX9-NEXT: s_addc_u32 s12, 0, s13
2144 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s11, v0
2145 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
2146 ; GFX9-NEXT: s_addc_u32 s10, s10, s12
2147 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0
2148 ; GFX9-NEXT: s_mul_i32 s11, s0, s10
2149 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12
2150 ; GFX9-NEXT: s_add_i32 s11, s13, s11
2151 ; GFX9-NEXT: s_mul_i32 s1, s1, s12
2152 ; GFX9-NEXT: s_add_i32 s11, s11, s1
2153 ; GFX9-NEXT: s_mul_i32 s0, s0, s12
2154 ; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0
2155 ; GFX9-NEXT: s_mul_i32 s14, s10, s0
2156 ; GFX9-NEXT: s_mul_i32 s16, s12, s11
2157 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0
2158 ; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11
2159 ; GFX9-NEXT: s_add_u32 s0, s0, s16
2160 ; GFX9-NEXT: s_addc_u32 s12, 0, s15
2161 ; GFX9-NEXT: s_add_u32 s0, s0, s14
2162 ; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11
2163 ; GFX9-NEXT: s_addc_u32 s0, s12, s13
2164 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
2165 ; GFX9-NEXT: s_mul_i32 s11, s10, s11
2166 ; GFX9-NEXT: s_add_u32 s0, s0, s11
2167 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
2168 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
2169 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
2170 ; GFX9-NEXT: s_addc_u32 s0, s10, s1
2171 ; GFX9-NEXT: v_readfirstlane_b32 s11, v0
2172 ; GFX9-NEXT: s_mul_i32 s10, s6, s0
2173 ; GFX9-NEXT: s_mul_hi_u32 s12, s6, s11
2174 ; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0
2175 ; GFX9-NEXT: s_add_u32 s10, s12, s10
2176 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
2177 ; GFX9-NEXT: s_mul_hi_u32 s13, s7, s11
2178 ; GFX9-NEXT: s_mul_i32 s11, s7, s11
2179 ; GFX9-NEXT: s_add_u32 s10, s10, s11
2180 ; GFX9-NEXT: s_mul_hi_u32 s12, s7, s0
2181 ; GFX9-NEXT: s_addc_u32 s1, s1, s13
2182 ; GFX9-NEXT: s_addc_u32 s10, s12, 0
2183 ; GFX9-NEXT: s_mul_i32 s0, s7, s0
2184 ; GFX9-NEXT: s_add_u32 s11, s1, s0
2185 ; GFX9-NEXT: s_addc_u32 s10, 0, s10
2186 ; GFX9-NEXT: s_mul_i32 s0, s2, s10
2187 ; GFX9-NEXT: s_mul_hi_u32 s1, s2, s11
2188 ; GFX9-NEXT: s_add_i32 s0, s1, s0
2189 ; GFX9-NEXT: s_mul_i32 s1, s3, s11
2190 ; GFX9-NEXT: s_add_i32 s12, s0, s1
2191 ; GFX9-NEXT: s_mul_i32 s1, s2, s11
2192 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
2193 ; GFX9-NEXT: s_sub_i32 s0, s7, s12
2194 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
2195 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
2196 ; GFX9-NEXT: s_subb_u32 s13, s0, s3
2197 ; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0
2198 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
2199 ; GFX9-NEXT: s_subb_u32 s13, s13, 0
2200 ; GFX9-NEXT: s_cmp_ge_u32 s13, s3
2201 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0
2202 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1
2203 ; GFX9-NEXT: s_cmp_eq_u32 s13, s3
2204 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
2205 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
2206 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
2207 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1]
2208 ; GFX9-NEXT: s_add_u32 s0, s11, 1
2209 ; GFX9-NEXT: s_addc_u32 s13, s10, 0
2210 ; GFX9-NEXT: s_add_u32 s1, s11, 2
2211 ; GFX9-NEXT: s_addc_u32 s14, s10, 0
2212 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
2213 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
2214 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
2215 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
2216 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
2217 ; GFX9-NEXT: v_mov_b32_e32 v3, s14
2218 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
2219 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
2220 ; GFX9-NEXT: s_subb_u32 s0, s7, s12
2221 ; GFX9-NEXT: s_cmp_ge_u32 s0, s3
2222 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0
2223 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
2224 ; GFX9-NEXT: s_cmp_eq_u32 s0, s3
2225 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2226 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
2227 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2228 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
2229 ; GFX9-NEXT: v_mov_b32_e32 v3, s10
2230 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
2231 ; GFX9-NEXT: v_mov_b32_e32 v0, s11
2232 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
2233 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
2234 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3
2235 ; GFX9-NEXT: .LBB16_2:
2236 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
2237 ; GFX9-NEXT: s_sub_i32 s0, 0, s2
2238 ; GFX9-NEXT: s_mov_b32 s1, 0
2239 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2240 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2241 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2242 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0
2243 ; GFX9-NEXT: s_mul_i32 s0, s0, s3
2244 ; GFX9-NEXT: s_mul_hi_u32 s0, s3, s0
2245 ; GFX9-NEXT: s_add_i32 s3, s3, s0
2246 ; GFX9-NEXT: s_mul_hi_u32 s0, s6, s3
2247 ; GFX9-NEXT: s_mul_i32 s7, s0, s2
2248 ; GFX9-NEXT: s_sub_i32 s6, s6, s7
2249 ; GFX9-NEXT: s_add_i32 s3, s0, 1
2250 ; GFX9-NEXT: s_sub_i32 s7, s6, s2
2251 ; GFX9-NEXT: s_cmp_ge_u32 s6, s2
2252 ; GFX9-NEXT: s_cselect_b32 s0, s3, s0
2253 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6
2254 ; GFX9-NEXT: s_add_i32 s3, s0, 1
2255 ; GFX9-NEXT: s_cmp_ge_u32 s6, s2
2256 ; GFX9-NEXT: s_cselect_b32 s0, s3, s0
2257 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2258 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2259 ; GFX9-NEXT: .LBB16_3:
2260 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2261 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
2262 ; GFX9-NEXT: s_endpgm
2263 ; GFX9-NEXT: .LBB16_4:
2264 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
2265 ; GFX9-NEXT: s_branch .LBB16_2
2267 ; GFX1010-LABEL: sudiv64:
2269 ; GFX1010-NEXT: s_clause 0x1
2270 ; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2271 ; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2272 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
2273 ; GFX1010-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3]
2274 ; GFX1010-NEXT: s_mov_b32 s8, 0
2275 ; GFX1010-NEXT: s_cmp_lg_u64 s[8:9], 0
2276 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4
2277 ; GFX1010-NEXT: ; %bb.1:
2278 ; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2
2279 ; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3
2280 ; GFX1010-NEXT: s_sub_u32 s9, 0, s2
2281 ; GFX1010-NEXT: s_subb_u32 s10, 0, s3
2282 ; GFX1010-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
2283 ; GFX1010-NEXT: v_rcp_f32_e32 v0, v0
2284 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
2285 ; GFX1010-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
2286 ; GFX1010-NEXT: v_trunc_f32_e32 v1, v1
2287 ; GFX1010-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
2288 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v1, v1
2289 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0
2290 ; GFX1010-NEXT: v_readfirstlane_b32 s0, v1
2291 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0
2292 ; GFX1010-NEXT: s_mul_i32 s11, s9, s0
2293 ; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s1
2294 ; GFX1010-NEXT: s_mul_i32 s12, s10, s1
2295 ; GFX1010-NEXT: s_add_i32 s11, s13, s11
2296 ; GFX1010-NEXT: s_mul_i32 s14, s9, s1
2297 ; GFX1010-NEXT: s_add_i32 s11, s11, s12
2298 ; GFX1010-NEXT: s_mul_hi_u32 s13, s1, s14
2299 ; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14
2300 ; GFX1010-NEXT: s_mul_i32 s12, s0, s14
2301 ; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11
2302 ; GFX1010-NEXT: s_mul_i32 s1, s1, s11
2303 ; GFX1010-NEXT: s_mul_hi_u32 s16, s0, s11
2304 ; GFX1010-NEXT: s_add_u32 s1, s13, s1
2305 ; GFX1010-NEXT: s_addc_u32 s13, 0, s14
2306 ; GFX1010-NEXT: s_add_u32 s1, s1, s12
2307 ; GFX1010-NEXT: s_mul_i32 s11, s0, s11
2308 ; GFX1010-NEXT: s_addc_u32 s1, s13, s15
2309 ; GFX1010-NEXT: s_addc_u32 s12, s16, 0
2310 ; GFX1010-NEXT: s_add_u32 s1, s1, s11
2311 ; GFX1010-NEXT: s_addc_u32 s11, 0, s12
2312 ; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1
2313 ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
2314 ; GFX1010-NEXT: s_addc_u32 s0, s0, s11
2315 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0
2316 ; GFX1010-NEXT: s_mul_i32 s11, s9, s0
2317 ; GFX1010-NEXT: s_mul_hi_u32 s12, s9, s1
2318 ; GFX1010-NEXT: s_mul_i32 s10, s10, s1
2319 ; GFX1010-NEXT: s_add_i32 s11, s12, s11
2320 ; GFX1010-NEXT: s_mul_i32 s9, s9, s1
2321 ; GFX1010-NEXT: s_add_i32 s11, s11, s10
2322 ; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s9
2323 ; GFX1010-NEXT: s_mul_i32 s13, s0, s9
2324 ; GFX1010-NEXT: s_mul_hi_u32 s9, s1, s9
2325 ; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11
2326 ; GFX1010-NEXT: s_mul_i32 s1, s1, s11
2327 ; GFX1010-NEXT: s_mul_hi_u32 s10, s0, s11
2328 ; GFX1010-NEXT: s_add_u32 s1, s9, s1
2329 ; GFX1010-NEXT: s_addc_u32 s9, 0, s14
2330 ; GFX1010-NEXT: s_add_u32 s1, s1, s13
2331 ; GFX1010-NEXT: s_mul_i32 s11, s0, s11
2332 ; GFX1010-NEXT: s_addc_u32 s1, s9, s12
2333 ; GFX1010-NEXT: s_addc_u32 s9, s10, 0
2334 ; GFX1010-NEXT: s_add_u32 s1, s1, s11
2335 ; GFX1010-NEXT: s_addc_u32 s9, 0, s9
2336 ; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1
2337 ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
2338 ; GFX1010-NEXT: s_addc_u32 s0, s0, s9
2339 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0
2340 ; GFX1010-NEXT: s_mul_i32 s10, s6, s0
2341 ; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s0
2342 ; GFX1010-NEXT: s_mul_hi_u32 s11, s7, s0
2343 ; GFX1010-NEXT: s_mul_i32 s0, s7, s0
2344 ; GFX1010-NEXT: s_mul_hi_u32 s12, s6, s1
2345 ; GFX1010-NEXT: s_mul_hi_u32 s13, s7, s1
2346 ; GFX1010-NEXT: s_mul_i32 s1, s7, s1
2347 ; GFX1010-NEXT: s_add_u32 s10, s12, s10
2348 ; GFX1010-NEXT: s_addc_u32 s9, 0, s9
2349 ; GFX1010-NEXT: s_add_u32 s1, s10, s1
2350 ; GFX1010-NEXT: s_addc_u32 s1, s9, s13
2351 ; GFX1010-NEXT: s_addc_u32 s9, s11, 0
2352 ; GFX1010-NEXT: s_add_u32 s1, s1, s0
2353 ; GFX1010-NEXT: s_addc_u32 s9, 0, s9
2354 ; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1
2355 ; GFX1010-NEXT: s_mul_i32 s11, s2, s9
2356 ; GFX1010-NEXT: s_mul_i32 s12, s2, s1
2357 ; GFX1010-NEXT: s_add_i32 s0, s0, s11
2358 ; GFX1010-NEXT: v_sub_co_u32 v0, s11, s6, s12
2359 ; GFX1010-NEXT: s_mul_i32 s10, s3, s1
2360 ; GFX1010-NEXT: s_add_i32 s0, s0, s10
2361 ; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2
2362 ; GFX1010-NEXT: s_sub_i32 s10, s7, s0
2363 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
2364 ; GFX1010-NEXT: s_subb_u32 s10, s10, s3
2365 ; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
2366 ; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
2367 ; GFX1010-NEXT: s_subb_u32 s10, s10, 0
2368 ; GFX1010-NEXT: s_cmp_ge_u32 s10, s3
2369 ; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2370 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
2371 ; GFX1010-NEXT: s_cmp_eq_u32 s10, s3
2372 ; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0
2373 ; GFX1010-NEXT: s_add_u32 s10, s1, 1
2374 ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2375 ; GFX1010-NEXT: s_addc_u32 s12, s9, 0
2376 ; GFX1010-NEXT: s_add_u32 s13, s1, 2
2377 ; GFX1010-NEXT: s_addc_u32 s14, s9, 0
2378 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
2379 ; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
2380 ; GFX1010-NEXT: s_subb_u32 s0, s7, s0
2381 ; GFX1010-NEXT: v_mov_b32_e32 v2, s13
2382 ; GFX1010-NEXT: s_cmp_ge_u32 s0, s3
2383 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2384 ; GFX1010-NEXT: s_cselect_b32 s7, -1, 0
2385 ; GFX1010-NEXT: s_cmp_eq_u32 s0, s3
2386 ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
2387 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
2388 ; GFX1010-NEXT: v_mov_b32_e32 v1, s14
2389 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
2390 ; GFX1010-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
2391 ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2392 ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
2393 ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
2394 ; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
2395 ; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
2396 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3
2397 ; GFX1010-NEXT: .LBB16_2:
2398 ; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2
2399 ; GFX1010-NEXT: s_sub_i32 s1, 0, s2
2400 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0
2401 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2402 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0
2403 ; GFX1010-NEXT: v_readfirstlane_b32 s0, v0
2404 ; GFX1010-NEXT: s_mul_i32 s1, s1, s0
2405 ; GFX1010-NEXT: s_mul_hi_u32 s1, s0, s1
2406 ; GFX1010-NEXT: s_add_i32 s0, s0, s1
2407 ; GFX1010-NEXT: s_mul_hi_u32 s0, s6, s0
2408 ; GFX1010-NEXT: s_mul_i32 s1, s0, s2
2409 ; GFX1010-NEXT: s_add_i32 s3, s0, 1
2410 ; GFX1010-NEXT: s_sub_i32 s1, s6, s1
2411 ; GFX1010-NEXT: s_sub_i32 s6, s1, s2
2412 ; GFX1010-NEXT: s_cmp_ge_u32 s1, s2
2413 ; GFX1010-NEXT: s_cselect_b32 s0, s3, s0
2414 ; GFX1010-NEXT: s_cselect_b32 s1, s6, s1
2415 ; GFX1010-NEXT: s_add_i32 s3, s0, 1
2416 ; GFX1010-NEXT: s_cmp_ge_u32 s1, s2
2417 ; GFX1010-NEXT: s_mov_b32 s1, 0
2418 ; GFX1010-NEXT: s_cselect_b32 s0, s3, s0
2419 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0
2420 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1
2421 ; GFX1010-NEXT: .LBB16_3:
2422 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0
2423 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
2424 ; GFX1010-NEXT: s_endpgm
2425 ; GFX1010-NEXT: .LBB16_4:
2426 ; GFX1010-NEXT: ; implicit-def: $vgpr0_vgpr1
2427 ; GFX1010-NEXT: s_branch .LBB16_2
2429 ; GFX1030W32-LABEL: sudiv64:
2430 ; GFX1030W32: ; %bb.0:
2431 ; GFX1030W32-NEXT: s_clause 0x1
2432 ; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2433 ; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2434 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
2435 ; GFX1030W32-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3]
2436 ; GFX1030W32-NEXT: s_mov_b32 s8, 0
2437 ; GFX1030W32-NEXT: s_cmp_lg_u64 s[8:9], 0
2438 ; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4
2439 ; GFX1030W32-NEXT: ; %bb.1:
2440 ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2
2441 ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v1, s3
2442 ; GFX1030W32-NEXT: s_sub_u32 s9, 0, s2
2443 ; GFX1030W32-NEXT: s_subb_u32 s10, 0, s3
2444 ; GFX1030W32-NEXT: v_fmac_f32_e32 v0, 0x4f800000, v1
2445 ; GFX1030W32-NEXT: v_rcp_f32_e32 v0, v0
2446 ; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
2447 ; GFX1030W32-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
2448 ; GFX1030W32-NEXT: v_trunc_f32_e32 v1, v1
2449 ; GFX1030W32-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1
2450 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1
2451 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0
2452 ; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v1
2453 ; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0
2454 ; GFX1030W32-NEXT: s_mul_i32 s11, s9, s0
2455 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s1
2456 ; GFX1030W32-NEXT: s_mul_i32 s12, s10, s1
2457 ; GFX1030W32-NEXT: s_add_i32 s11, s13, s11
2458 ; GFX1030W32-NEXT: s_mul_i32 s14, s9, s1
2459 ; GFX1030W32-NEXT: s_add_i32 s11, s11, s12
2460 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s1, s14
2461 ; GFX1030W32-NEXT: s_mul_hi_u32 s15, s0, s14
2462 ; GFX1030W32-NEXT: s_mul_i32 s12, s0, s14
2463 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s11
2464 ; GFX1030W32-NEXT: s_mul_i32 s1, s1, s11
2465 ; GFX1030W32-NEXT: s_mul_hi_u32 s16, s0, s11
2466 ; GFX1030W32-NEXT: s_add_u32 s1, s13, s1
2467 ; GFX1030W32-NEXT: s_addc_u32 s13, 0, s14
2468 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s12
2469 ; GFX1030W32-NEXT: s_mul_i32 s11, s0, s11
2470 ; GFX1030W32-NEXT: s_addc_u32 s1, s13, s15
2471 ; GFX1030W32-NEXT: s_addc_u32 s12, s16, 0
2472 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s11
2473 ; GFX1030W32-NEXT: s_addc_u32 s11, 0, s12
2474 ; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1
2475 ; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0
2476 ; GFX1030W32-NEXT: s_addc_u32 s0, s0, s11
2477 ; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0
2478 ; GFX1030W32-NEXT: s_mul_i32 s11, s9, s0
2479 ; GFX1030W32-NEXT: s_mul_hi_u32 s12, s9, s1
2480 ; GFX1030W32-NEXT: s_mul_i32 s10, s10, s1
2481 ; GFX1030W32-NEXT: s_add_i32 s11, s12, s11
2482 ; GFX1030W32-NEXT: s_mul_i32 s9, s9, s1
2483 ; GFX1030W32-NEXT: s_add_i32 s11, s11, s10
2484 ; GFX1030W32-NEXT: s_mul_hi_u32 s12, s0, s9
2485 ; GFX1030W32-NEXT: s_mul_i32 s13, s0, s9
2486 ; GFX1030W32-NEXT: s_mul_hi_u32 s9, s1, s9
2487 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s11
2488 ; GFX1030W32-NEXT: s_mul_i32 s1, s1, s11
2489 ; GFX1030W32-NEXT: s_mul_hi_u32 s10, s0, s11
2490 ; GFX1030W32-NEXT: s_add_u32 s1, s9, s1
2491 ; GFX1030W32-NEXT: s_addc_u32 s9, 0, s14
2492 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s13
2493 ; GFX1030W32-NEXT: s_mul_i32 s11, s0, s11
2494 ; GFX1030W32-NEXT: s_addc_u32 s1, s9, s12
2495 ; GFX1030W32-NEXT: s_addc_u32 s9, s10, 0
2496 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s11
2497 ; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9
2498 ; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1
2499 ; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0
2500 ; GFX1030W32-NEXT: s_addc_u32 s0, s0, s9
2501 ; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0
2502 ; GFX1030W32-NEXT: s_mul_i32 s10, s6, s0
2503 ; GFX1030W32-NEXT: s_mul_hi_u32 s9, s6, s0
2504 ; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s0
2505 ; GFX1030W32-NEXT: s_mul_i32 s0, s7, s0
2506 ; GFX1030W32-NEXT: s_mul_hi_u32 s12, s6, s1
2507 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s1
2508 ; GFX1030W32-NEXT: s_mul_i32 s1, s7, s1
2509 ; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
2510 ; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9
2511 ; GFX1030W32-NEXT: s_add_u32 s1, s10, s1
2512 ; GFX1030W32-NEXT: s_addc_u32 s1, s9, s13
2513 ; GFX1030W32-NEXT: s_addc_u32 s9, s11, 0
2514 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s0
2515 ; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9
2516 ; GFX1030W32-NEXT: s_mul_hi_u32 s0, s2, s1
2517 ; GFX1030W32-NEXT: s_mul_i32 s11, s2, s9
2518 ; GFX1030W32-NEXT: s_mul_i32 s12, s2, s1
2519 ; GFX1030W32-NEXT: s_add_i32 s0, s0, s11
2520 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s11, s6, s12
2521 ; GFX1030W32-NEXT: s_mul_i32 s10, s3, s1
2522 ; GFX1030W32-NEXT: s_add_i32 s0, s0, s10
2523 ; GFX1030W32-NEXT: v_sub_co_u32 v1, s12, v0, s2
2524 ; GFX1030W32-NEXT: s_sub_i32 s10, s7, s0
2525 ; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
2526 ; GFX1030W32-NEXT: s_subb_u32 s10, s10, s3
2527 ; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
2528 ; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
2529 ; GFX1030W32-NEXT: s_subb_u32 s10, s10, 0
2530 ; GFX1030W32-NEXT: s_cmp_ge_u32 s10, s3
2531 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2532 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
2533 ; GFX1030W32-NEXT: s_cmp_eq_u32 s10, s3
2534 ; GFX1030W32-NEXT: s_cselect_b32 vcc_lo, -1, 0
2535 ; GFX1030W32-NEXT: s_add_u32 s10, s1, 1
2536 ; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2537 ; GFX1030W32-NEXT: s_addc_u32 s12, s9, 0
2538 ; GFX1030W32-NEXT: s_add_u32 s13, s1, 2
2539 ; GFX1030W32-NEXT: s_addc_u32 s14, s9, 0
2540 ; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
2541 ; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
2542 ; GFX1030W32-NEXT: s_subb_u32 s0, s7, s0
2543 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, s13
2544 ; GFX1030W32-NEXT: s_cmp_ge_u32 s0, s3
2545 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2546 ; GFX1030W32-NEXT: s_cselect_b32 s7, -1, 0
2547 ; GFX1030W32-NEXT: s_cmp_eq_u32 s0, s3
2548 ; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
2549 ; GFX1030W32-NEXT: s_cselect_b32 s0, -1, 0
2550 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s14
2551 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
2552 ; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
2553 ; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2554 ; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
2555 ; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
2556 ; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
2557 ; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
2558 ; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3
2559 ; GFX1030W32-NEXT: .LBB16_2:
2560 ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2
2561 ; GFX1030W32-NEXT: s_sub_i32 s1, 0, s2
2562 ; GFX1030W32-NEXT: v_rcp_iflag_f32_e32 v0, v0
2563 ; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2564 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0
2565 ; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v0
2566 ; GFX1030W32-NEXT: s_mul_i32 s1, s1, s0
2567 ; GFX1030W32-NEXT: s_mul_hi_u32 s1, s0, s1
2568 ; GFX1030W32-NEXT: s_add_i32 s0, s0, s1
2569 ; GFX1030W32-NEXT: s_mul_hi_u32 s0, s6, s0
2570 ; GFX1030W32-NEXT: s_mul_i32 s1, s0, s2
2571 ; GFX1030W32-NEXT: s_add_i32 s3, s0, 1
2572 ; GFX1030W32-NEXT: s_sub_i32 s1, s6, s1
2573 ; GFX1030W32-NEXT: s_sub_i32 s6, s1, s2
2574 ; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2
2575 ; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0
2576 ; GFX1030W32-NEXT: s_cselect_b32 s1, s6, s1
2577 ; GFX1030W32-NEXT: s_add_i32 s3, s0, 1
2578 ; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2
2579 ; GFX1030W32-NEXT: s_mov_b32 s1, 0
2580 ; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0
2581 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
2582 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
2583 ; GFX1030W32-NEXT: .LBB16_3:
2584 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
2585 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
2586 ; GFX1030W32-NEXT: s_endpgm
2587 ; GFX1030W32-NEXT: .LBB16_4:
2588 ; GFX1030W32-NEXT: ; implicit-def: $vgpr0_vgpr1
2589 ; GFX1030W32-NEXT: s_branch .LBB16_2
2591 ; GFX1030W64-LABEL: sudiv64:
2592 ; GFX1030W64: ; %bb.0:
2593 ; GFX1030W64-NEXT: s_clause 0x1
2594 ; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2595 ; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2596 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
2597 ; GFX1030W64-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3]
2598 ; GFX1030W64-NEXT: s_mov_b32 s0, 0
2599 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
2600 ; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4
2601 ; GFX1030W64-NEXT: ; %bb.1:
2602 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2
2603 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s3
2604 ; GFX1030W64-NEXT: s_sub_u32 s9, 0, s2
2605 ; GFX1030W64-NEXT: s_subb_u32 s10, 0, s3
2606 ; GFX1030W64-NEXT: v_fmac_f32_e32 v0, 0x4f800000, v1
2607 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
2608 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
2609 ; GFX1030W64-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
2610 ; GFX1030W64-NEXT: v_trunc_f32_e32 v1, v1
2611 ; GFX1030W64-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1
2612 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
2613 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
2614 ; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
2615 ; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0
2616 ; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8
2617 ; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s0
2618 ; GFX1030W64-NEXT: s_mul_i32 s11, s10, s0
2619 ; GFX1030W64-NEXT: s_add_i32 s1, s12, s1
2620 ; GFX1030W64-NEXT: s_mul_i32 s13, s9, s0
2621 ; GFX1030W64-NEXT: s_add_i32 s1, s1, s11
2622 ; GFX1030W64-NEXT: s_mul_hi_u32 s12, s0, s13
2623 ; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
2624 ; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
2625 ; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1
2626 ; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1
2627 ; GFX1030W64-NEXT: s_mul_hi_u32 s15, s8, s1
2628 ; GFX1030W64-NEXT: s_add_u32 s0, s12, s0
2629 ; GFX1030W64-NEXT: s_addc_u32 s12, 0, s13
2630 ; GFX1030W64-NEXT: s_add_u32 s0, s0, s11
2631 ; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1
2632 ; GFX1030W64-NEXT: s_addc_u32 s0, s12, s14
2633 ; GFX1030W64-NEXT: s_addc_u32 s11, s15, 0
2634 ; GFX1030W64-NEXT: s_add_u32 s0, s0, s1
2635 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s11
2636 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
2637 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
2638 ; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
2639 ; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0
2640 ; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8
2641 ; GFX1030W64-NEXT: s_mul_hi_u32 s11, s9, s0
2642 ; GFX1030W64-NEXT: s_mul_i32 s10, s10, s0
2643 ; GFX1030W64-NEXT: s_add_i32 s1, s11, s1
2644 ; GFX1030W64-NEXT: s_mul_i32 s9, s9, s0
2645 ; GFX1030W64-NEXT: s_add_i32 s1, s1, s10
2646 ; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s9
2647 ; GFX1030W64-NEXT: s_mul_i32 s12, s8, s9
2648 ; GFX1030W64-NEXT: s_mul_hi_u32 s9, s0, s9
2649 ; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1
2650 ; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1
2651 ; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s1
2652 ; GFX1030W64-NEXT: s_add_u32 s0, s9, s0
2653 ; GFX1030W64-NEXT: s_addc_u32 s9, 0, s13
2654 ; GFX1030W64-NEXT: s_add_u32 s0, s0, s12
2655 ; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1
2656 ; GFX1030W64-NEXT: s_addc_u32 s0, s9, s11
2657 ; GFX1030W64-NEXT: s_addc_u32 s9, s10, 0
2658 ; GFX1030W64-NEXT: s_add_u32 s0, s0, s1
2659 ; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
2660 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
2661 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
2662 ; GFX1030W64-NEXT: s_addc_u32 s0, s8, s9
2663 ; GFX1030W64-NEXT: v_readfirstlane_b32 s1, v0
2664 ; GFX1030W64-NEXT: s_mul_i32 s9, s6, s0
2665 ; GFX1030W64-NEXT: s_mul_hi_u32 s8, s6, s0
2666 ; GFX1030W64-NEXT: s_mul_hi_u32 s10, s7, s0
2667 ; GFX1030W64-NEXT: s_mul_i32 s0, s7, s0
2668 ; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s1
2669 ; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s1
2670 ; GFX1030W64-NEXT: s_mul_i32 s1, s7, s1
2671 ; GFX1030W64-NEXT: s_add_u32 s9, s11, s9
2672 ; GFX1030W64-NEXT: s_addc_u32 s8, 0, s8
2673 ; GFX1030W64-NEXT: s_add_u32 s1, s9, s1
2674 ; GFX1030W64-NEXT: s_addc_u32 s1, s8, s12
2675 ; GFX1030W64-NEXT: s_addc_u32 s8, s10, 0
2676 ; GFX1030W64-NEXT: s_add_u32 s10, s1, s0
2677 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
2678 ; GFX1030W64-NEXT: s_mul_hi_u32 s0, s2, s10
2679 ; GFX1030W64-NEXT: s_mul_i32 s1, s2, s11
2680 ; GFX1030W64-NEXT: s_mul_i32 s9, s2, s10
2681 ; GFX1030W64-NEXT: s_add_i32 s12, s0, s1
2682 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9
2683 ; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
2684 ; GFX1030W64-NEXT: s_add_i32 s12, s12, s8
2685 ; GFX1030W64-NEXT: v_sub_co_u32 v1, s[8:9], v0, s2
2686 ; GFX1030W64-NEXT: s_sub_i32 s13, s7, s12
2687 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
2688 ; GFX1030W64-NEXT: s_subb_u32 s13, s13, s3
2689 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
2690 ; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v1
2691 ; GFX1030W64-NEXT: s_subb_u32 s8, s13, 0
2692 ; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s3
2693 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2694 ; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
2695 ; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s3
2696 ; GFX1030W64-NEXT: s_cselect_b64 vcc, -1, 0
2697 ; GFX1030W64-NEXT: s_add_u32 s8, s10, 1
2698 ; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
2699 ; GFX1030W64-NEXT: s_addc_u32 s9, s11, 0
2700 ; GFX1030W64-NEXT: s_add_u32 s13, s10, 2
2701 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
2702 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0
2703 ; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
2704 ; GFX1030W64-NEXT: s_subb_u32 s0, s7, s12
2705 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, s13
2706 ; GFX1030W64-NEXT: s_cmp_ge_u32 s0, s3
2707 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2708 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
2709 ; GFX1030W64-NEXT: s_cmp_eq_u32 s0, s3
2710 ; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
2711 ; GFX1030W64-NEXT: s_cselect_b64 s[0:1], -1, 0
2712 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s14
2713 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1]
2714 ; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc
2715 ; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
2716 ; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
2717 ; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc
2718 ; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc
2719 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
2720 ; GFX1030W64-NEXT: .LBB16_2:
2721 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2
2722 ; GFX1030W64-NEXT: s_sub_i32 s1, 0, s2
2723 ; GFX1030W64-NEXT: v_rcp_iflag_f32_e32 v0, v0
2724 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2725 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
2726 ; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0
2727 ; GFX1030W64-NEXT: s_mul_i32 s1, s1, s0
2728 ; GFX1030W64-NEXT: s_mul_hi_u32 s1, s0, s1
2729 ; GFX1030W64-NEXT: s_add_i32 s0, s0, s1
2730 ; GFX1030W64-NEXT: s_mul_hi_u32 s0, s6, s0
2731 ; GFX1030W64-NEXT: s_mul_i32 s1, s0, s2
2732 ; GFX1030W64-NEXT: s_add_i32 s3, s0, 1
2733 ; GFX1030W64-NEXT: s_sub_i32 s1, s6, s1
2734 ; GFX1030W64-NEXT: s_sub_i32 s6, s1, s2
2735 ; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2
2736 ; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0
2737 ; GFX1030W64-NEXT: s_cselect_b32 s1, s6, s1
2738 ; GFX1030W64-NEXT: s_add_i32 s3, s0, 1
2739 ; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2
2740 ; GFX1030W64-NEXT: s_mov_b32 s1, 0
2741 ; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0
2742 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
2743 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
2744 ; GFX1030W64-NEXT: .LBB16_3:
2745 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
2746 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
2747 ; GFX1030W64-NEXT: s_endpgm
2748 ; GFX1030W64-NEXT: .LBB16_4:
2749 ; GFX1030W64-NEXT: ; implicit-def: $vgpr0_vgpr1
2750 ; GFX1030W64-NEXT: s_branch .LBB16_2
2752 ; GFX11-LABEL: sudiv64:
2754 ; GFX11-NEXT: s_clause 0x1
2755 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2756 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
2757 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2758 ; GFX11-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3]
2759 ; GFX11-NEXT: s_mov_b32 s8, 0
2760 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2761 ; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0
2762 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_4
2763 ; GFX11-NEXT: ; %bb.1:
2764 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
2765 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3
2766 ; GFX11-NEXT: s_sub_u32 s9, 0, s2
2767 ; GFX11-NEXT: s_subb_u32 s10, 0, s3
2768 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2769 ; GFX11-NEXT: v_fmac_f32_e32 v0, 0x4f800000, v1
2770 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0
2771 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2772 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
2773 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2774 ; GFX11-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
2775 ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
2776 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2777 ; GFX11-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1
2778 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
2779 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
2780 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2781 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
2782 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
2783 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2784 ; GFX11-NEXT: s_mul_i32 s11, s9, s0
2785 ; GFX11-NEXT: s_mul_hi_u32 s13, s9, s1
2786 ; GFX11-NEXT: s_mul_i32 s12, s10, s1
2787 ; GFX11-NEXT: s_add_i32 s11, s13, s11
2788 ; GFX11-NEXT: s_mul_i32 s14, s9, s1
2789 ; GFX11-NEXT: s_add_i32 s11, s11, s12
2790 ; GFX11-NEXT: s_mul_hi_u32 s13, s1, s14
2791 ; GFX11-NEXT: s_mul_hi_u32 s15, s0, s14
2792 ; GFX11-NEXT: s_mul_i32 s12, s0, s14
2793 ; GFX11-NEXT: s_mul_hi_u32 s14, s1, s11
2794 ; GFX11-NEXT: s_mul_i32 s1, s1, s11
2795 ; GFX11-NEXT: s_mul_hi_u32 s16, s0, s11
2796 ; GFX11-NEXT: s_add_u32 s1, s13, s1
2797 ; GFX11-NEXT: s_addc_u32 s13, 0, s14
2798 ; GFX11-NEXT: s_add_u32 s1, s1, s12
2799 ; GFX11-NEXT: s_mul_i32 s11, s0, s11
2800 ; GFX11-NEXT: s_addc_u32 s1, s13, s15
2801 ; GFX11-NEXT: s_addc_u32 s12, s16, 0
2802 ; GFX11-NEXT: s_add_u32 s1, s1, s11
2803 ; GFX11-NEXT: s_addc_u32 s11, 0, s12
2804 ; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
2805 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2806 ; GFX11-NEXT: s_cmp_lg_u32 s1, 0
2807 ; GFX11-NEXT: s_addc_u32 s0, s0, s11
2808 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
2809 ; GFX11-NEXT: s_mul_i32 s11, s9, s0
2810 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2811 ; GFX11-NEXT: s_mul_hi_u32 s12, s9, s1
2812 ; GFX11-NEXT: s_mul_i32 s10, s10, s1
2813 ; GFX11-NEXT: s_add_i32 s11, s12, s11
2814 ; GFX11-NEXT: s_mul_i32 s9, s9, s1
2815 ; GFX11-NEXT: s_add_i32 s11, s11, s10
2816 ; GFX11-NEXT: s_mul_hi_u32 s12, s0, s9
2817 ; GFX11-NEXT: s_mul_i32 s13, s0, s9
2818 ; GFX11-NEXT: s_mul_hi_u32 s9, s1, s9
2819 ; GFX11-NEXT: s_mul_hi_u32 s14, s1, s11
2820 ; GFX11-NEXT: s_mul_i32 s1, s1, s11
2821 ; GFX11-NEXT: s_mul_hi_u32 s10, s0, s11
2822 ; GFX11-NEXT: s_add_u32 s1, s9, s1
2823 ; GFX11-NEXT: s_addc_u32 s9, 0, s14
2824 ; GFX11-NEXT: s_add_u32 s1, s1, s13
2825 ; GFX11-NEXT: s_mul_i32 s11, s0, s11
2826 ; GFX11-NEXT: s_addc_u32 s1, s9, s12
2827 ; GFX11-NEXT: s_addc_u32 s9, s10, 0
2828 ; GFX11-NEXT: s_add_u32 s1, s1, s11
2829 ; GFX11-NEXT: s_addc_u32 s9, 0, s9
2830 ; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
2831 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2832 ; GFX11-NEXT: s_cmp_lg_u32 s1, 0
2833 ; GFX11-NEXT: s_addc_u32 s0, s0, s9
2834 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
2835 ; GFX11-NEXT: s_mul_i32 s10, s6, s0
2836 ; GFX11-NEXT: s_mul_hi_u32 s9, s6, s0
2837 ; GFX11-NEXT: s_mul_hi_u32 s11, s7, s0
2838 ; GFX11-NEXT: s_mul_i32 s0, s7, s0
2839 ; GFX11-NEXT: s_mul_hi_u32 s12, s6, s1
2840 ; GFX11-NEXT: s_mul_hi_u32 s13, s7, s1
2841 ; GFX11-NEXT: s_mul_i32 s1, s7, s1
2842 ; GFX11-NEXT: s_add_u32 s10, s12, s10
2843 ; GFX11-NEXT: s_addc_u32 s9, 0, s9
2844 ; GFX11-NEXT: s_add_u32 s1, s10, s1
2845 ; GFX11-NEXT: s_addc_u32 s1, s9, s13
2846 ; GFX11-NEXT: s_addc_u32 s9, s11, 0
2847 ; GFX11-NEXT: s_add_u32 s1, s1, s0
2848 ; GFX11-NEXT: s_addc_u32 s9, 0, s9
2849 ; GFX11-NEXT: s_mul_hi_u32 s0, s2, s1
2850 ; GFX11-NEXT: s_mul_i32 s11, s2, s9
2851 ; GFX11-NEXT: s_mul_i32 s12, s2, s1
2852 ; GFX11-NEXT: s_add_i32 s0, s0, s11
2853 ; GFX11-NEXT: v_sub_co_u32 v0, s11, s6, s12
2854 ; GFX11-NEXT: s_mul_i32 s10, s3, s1
2855 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2856 ; GFX11-NEXT: s_add_i32 s0, s0, s10
2857 ; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2
2858 ; GFX11-NEXT: s_sub_i32 s10, s7, s0
2859 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0
2860 ; GFX11-NEXT: s_subb_u32 s10, s10, s3
2861 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0
2862 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
2863 ; GFX11-NEXT: s_subb_u32 s10, s10, 0
2864 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2865 ; GFX11-NEXT: s_cmp_ge_u32 s10, s3
2866 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2867 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0
2868 ; GFX11-NEXT: s_cmp_eq_u32 s10, s3
2869 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
2870 ; GFX11-NEXT: s_add_u32 s10, s1, 1
2871 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2872 ; GFX11-NEXT: s_addc_u32 s12, s9, 0
2873 ; GFX11-NEXT: s_add_u32 s13, s1, 2
2874 ; GFX11-NEXT: s_addc_u32 s14, s9, 0
2875 ; GFX11-NEXT: v_mov_b32_e32 v2, s13
2876 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0
2877 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
2878 ; GFX11-NEXT: s_subb_u32 s0, s7, s0
2879 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2880 ; GFX11-NEXT: s_cmp_ge_u32 s0, s3
2881 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2882 ; GFX11-NEXT: s_cselect_b32 s7, -1, 0
2883 ; GFX11-NEXT: s_cmp_eq_u32 s0, s3
2884 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
2885 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0
2886 ; GFX11-NEXT: v_mov_b32_e32 v1, s14
2887 ; GFX11-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
2888 ; GFX11-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
2889 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2890 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2891 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
2892 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2893 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
2894 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
2895 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
2896 ; GFX11-NEXT: s_cbranch_vccnz .LBB16_3
2897 ; GFX11-NEXT: .LBB16_2:
2898 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
2899 ; GFX11-NEXT: s_sub_i32 s1, 0, s2
2900 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2901 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
2902 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2903 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2904 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
2905 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2906 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2907 ; GFX11-NEXT: s_mul_i32 s1, s1, s0
2908 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2909 ; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1
2910 ; GFX11-NEXT: s_add_i32 s0, s0, s1
2911 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2912 ; GFX11-NEXT: s_mul_hi_u32 s0, s6, s0
2913 ; GFX11-NEXT: s_mul_i32 s1, s0, s2
2914 ; GFX11-NEXT: s_add_i32 s3, s0, 1
2915 ; GFX11-NEXT: s_sub_i32 s1, s6, s1
2916 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2917 ; GFX11-NEXT: s_sub_i32 s6, s1, s2
2918 ; GFX11-NEXT: s_cmp_ge_u32 s1, s2
2919 ; GFX11-NEXT: s_cselect_b32 s0, s3, s0
2920 ; GFX11-NEXT: s_cselect_b32 s1, s6, s1
2921 ; GFX11-NEXT: s_add_i32 s3, s0, 1
2922 ; GFX11-NEXT: s_cmp_ge_u32 s1, s2
2923 ; GFX11-NEXT: s_mov_b32 s1, 0
2924 ; GFX11-NEXT: s_cselect_b32 s0, s3, s0
2925 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2926 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2927 ; GFX11-NEXT: .LBB16_3:
2928 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2929 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
2930 ; GFX11-NEXT: s_nop 0
2931 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2932 ; GFX11-NEXT: s_endpgm
2933 ; GFX11-NEXT: .LBB16_4:
2934 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
2935 ; GFX11-NEXT: s_branch .LBB16_2
2936 %result = udiv i64 %x, %y
2937 store i64 %result, ptr addrspace(1) %out
2943 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1 ; unsigned 64-bit add; result is a { value, overflow-bit } pair
2945 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 ; 32-bit variant of the unsigned add-with-overflow intrinsic
2947 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1 ; unsigned 64-bit subtract; result is a { value, overflow-bit } pair
2949 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 ; 32-bit variant of the unsigned subtract-with-overflow intrinsic
2951 declare i32 @llvm.amdgcn.workitem.id.x() #1 ; AMDGPU intrinsic taking no arguments and returning an i32 (work-item id, x component, per its name)
2953 attributes #0 = { nounwind } ; attribute group #0; NOTE(review): its users are not visible in this part of the file - confirm it is still referenced
2954 attributes #1 = { nounwind readnone } ; attribute group #1; attached to every intrinsic declaration above
2956 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: