1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
6 define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
7 ; SI-LABEL: s_uaddo_i64_zext:
9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
10 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
11 ; SI-NEXT: s_mov_b32 s7, 0xf000
12 ; SI-NEXT: s_mov_b32 s6, -1
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: s_mov_b32 s4, s0
15 ; SI-NEXT: s_mov_b32 s5, s1
16 ; SI-NEXT: s_add_u32 s0, s2, s8
17 ; SI-NEXT: v_mov_b32_e32 v0, s2
18 ; SI-NEXT: v_mov_b32_e32 v1, s3
19 ; SI-NEXT: s_addc_u32 s1, s3, s9
20 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
21 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
22 ; SI-NEXT: v_mov_b32_e32 v1, s1
23 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
24 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
25 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
28 ; VI-LABEL: s_uaddo_i64_zext:
30 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
31 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
32 ; VI-NEXT: s_waitcnt lgkmcnt(0)
33 ; VI-NEXT: v_mov_b32_e32 v0, s0
34 ; VI-NEXT: s_add_u32 s0, s2, s4
35 ; VI-NEXT: v_mov_b32_e32 v2, s2
36 ; VI-NEXT: v_mov_b32_e32 v1, s1
37 ; VI-NEXT: v_mov_b32_e32 v3, s3
38 ; VI-NEXT: s_addc_u32 s1, s3, s5
39 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
40 ; VI-NEXT: v_mov_b32_e32 v3, s1
41 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
42 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
43 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
44 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
47 ; GFX9-LABEL: s_uaddo_i64_zext:
49 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
50 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
51 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
52 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
54 ; GFX9-NEXT: s_add_u32 s4, s2, s6
55 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
56 ; GFX9-NEXT: s_addc_u32 s5, s3, s7
57 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
58 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
59 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
60 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
61 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
62 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
64 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
65 %val = extractvalue { i64, i1 } %uadd, 0
66 %carry = extractvalue { i64, i1 } %uadd, 1
67 %ext = zext i1 %carry to i64
68 %add2 = add i64 %val, %ext
69 store i64 %add2, ptr addrspace(1) %out, align 8
73 ; FIXME: Could do scalar
75 define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
76 ; SI-LABEL: s_uaddo_i32:
78 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
79 ; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
80 ; SI-NEXT: s_mov_b32 s7, 0xf000
81 ; SI-NEXT: s_mov_b32 s6, -1
82 ; SI-NEXT: s_mov_b32 s10, s6
83 ; SI-NEXT: s_mov_b32 s11, s7
84 ; SI-NEXT: s_waitcnt lgkmcnt(0)
85 ; SI-NEXT: s_mov_b32 s4, s0
86 ; SI-NEXT: s_mov_b32 s5, s1
87 ; SI-NEXT: s_mov_b32 s8, s2
88 ; SI-NEXT: s_mov_b32 s9, s3
89 ; SI-NEXT: v_mov_b32_e32 v0, s13
90 ; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
91 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
92 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
93 ; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
96 ; VI-LABEL: s_uaddo_i32:
98 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
99 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
100 ; VI-NEXT: s_waitcnt lgkmcnt(0)
101 ; VI-NEXT: v_mov_b32_e32 v0, s0
102 ; VI-NEXT: v_mov_b32_e32 v4, s5
103 ; VI-NEXT: v_mov_b32_e32 v1, s1
104 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
105 ; VI-NEXT: v_mov_b32_e32 v2, s2
106 ; VI-NEXT: v_mov_b32_e32 v3, s3
107 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
108 ; VI-NEXT: flat_store_dword v[0:1], v4
109 ; VI-NEXT: flat_store_byte v[2:3], v5
112 ; GFX9-LABEL: s_uaddo_i32:
114 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
115 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
116 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
117 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
118 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
119 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1
120 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
121 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
122 ; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
123 ; GFX9-NEXT: s_endpgm
124 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
125 %val = extractvalue { i32, i1 } %uadd, 0
126 %carry = extractvalue { i32, i1 } %uadd, 1
127 store i32 %val, ptr addrspace(1) %out, align 4
128 store i1 %carry, ptr addrspace(1) %carryout
132 define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
133 ; SI-LABEL: v_uaddo_i32:
135 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
136 ; SI-NEXT: s_mov_b32 s11, 0xf000
137 ; SI-NEXT: s_mov_b32 s10, -1
138 ; SI-NEXT: s_mov_b32 s14, s10
139 ; SI-NEXT: s_mov_b32 s15, s11
140 ; SI-NEXT: s_mov_b32 s18, s10
141 ; SI-NEXT: s_mov_b32 s19, s11
142 ; SI-NEXT: s_waitcnt lgkmcnt(0)
143 ; SI-NEXT: s_mov_b32 s12, s4
144 ; SI-NEXT: s_mov_b32 s13, s5
145 ; SI-NEXT: s_mov_b32 s16, s6
146 ; SI-NEXT: s_mov_b32 s17, s7
147 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
148 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
149 ; SI-NEXT: s_mov_b32 s6, s10
150 ; SI-NEXT: s_mov_b32 s7, s11
151 ; SI-NEXT: s_mov_b32 s8, s0
152 ; SI-NEXT: s_mov_b32 s9, s1
153 ; SI-NEXT: s_mov_b32 s4, s2
154 ; SI-NEXT: s_mov_b32 s5, s3
155 ; SI-NEXT: s_waitcnt vmcnt(0)
156 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
157 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
158 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
159 ; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
162 ; VI-LABEL: v_uaddo_i32:
164 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
165 ; VI-NEXT: s_waitcnt lgkmcnt(0)
166 ; VI-NEXT: v_mov_b32_e32 v0, s4
167 ; VI-NEXT: v_mov_b32_e32 v1, s5
168 ; VI-NEXT: v_mov_b32_e32 v2, s6
169 ; VI-NEXT: v_mov_b32_e32 v3, s7
170 ; VI-NEXT: flat_load_dword v4, v[0:1]
171 ; VI-NEXT: flat_load_dword v5, v[2:3]
172 ; VI-NEXT: v_mov_b32_e32 v0, s0
173 ; VI-NEXT: v_mov_b32_e32 v1, s1
174 ; VI-NEXT: v_mov_b32_e32 v2, s2
175 ; VI-NEXT: v_mov_b32_e32 v3, s3
176 ; VI-NEXT: s_waitcnt vmcnt(0)
177 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
178 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
179 ; VI-NEXT: flat_store_dword v[0:1], v4
180 ; VI-NEXT: flat_store_byte v[2:3], v5
183 ; GFX9-LABEL: v_uaddo_i32:
185 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
186 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
187 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
188 ; GFX9-NEXT: global_load_dword v1, v0, s[12:13]
189 ; GFX9-NEXT: global_load_dword v2, v0, s[14:15]
190 ; GFX9-NEXT: s_waitcnt vmcnt(0)
191 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
192 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
193 ; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
194 ; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
195 ; GFX9-NEXT: s_endpgm
196 %tid = call i32 @llvm.amdgcn.workitem.id.x()
197 %tid.ext = sext i32 %tid to i64
198 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
199 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
200 %a = load i32, ptr addrspace(1) %a.gep, align 4
201 %b = load i32, ptr addrspace(1) %b.gep, align 4
202 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
203 %val = extractvalue { i32, i1 } %uadd, 0
204 %carry = extractvalue { i32, i1 } %uadd, 1
205 store i32 %val, ptr addrspace(1) %out, align 4
206 store i1 %carry, ptr addrspace(1) %carryout
210 define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
211 ; SI-LABEL: v_uaddo_i32_novcc:
213 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
214 ; SI-NEXT: s_mov_b32 s11, 0xf000
215 ; SI-NEXT: s_mov_b32 s10, -1
216 ; SI-NEXT: s_mov_b32 s14, s10
217 ; SI-NEXT: s_mov_b32 s15, s11
218 ; SI-NEXT: s_mov_b32 s18, s10
219 ; SI-NEXT: s_mov_b32 s19, s11
220 ; SI-NEXT: s_waitcnt lgkmcnt(0)
221 ; SI-NEXT: s_mov_b32 s12, s4
222 ; SI-NEXT: s_mov_b32 s13, s5
223 ; SI-NEXT: s_mov_b32 s16, s6
224 ; SI-NEXT: s_mov_b32 s17, s7
225 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
226 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
227 ; SI-NEXT: s_mov_b32 s6, s10
228 ; SI-NEXT: s_mov_b32 s7, s11
229 ; SI-NEXT: s_mov_b32 s8, s0
230 ; SI-NEXT: s_mov_b32 s9, s1
231 ; SI-NEXT: s_mov_b32 s4, s2
232 ; SI-NEXT: s_mov_b32 s5, s3
233 ; SI-NEXT: s_waitcnt vmcnt(0)
234 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
235 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
236 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
237 ; SI-NEXT: s_waitcnt vmcnt(0)
238 ; SI-NEXT: ;;#ASMSTART
240 ; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
241 ; SI-NEXT: s_waitcnt vmcnt(0)
244 ; VI-LABEL: v_uaddo_i32_novcc:
246 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
247 ; VI-NEXT: s_waitcnt lgkmcnt(0)
248 ; VI-NEXT: v_mov_b32_e32 v0, s4
249 ; VI-NEXT: v_mov_b32_e32 v1, s5
250 ; VI-NEXT: v_mov_b32_e32 v2, s6
251 ; VI-NEXT: v_mov_b32_e32 v3, s7
252 ; VI-NEXT: flat_load_dword v4, v[0:1]
253 ; VI-NEXT: flat_load_dword v5, v[2:3]
254 ; VI-NEXT: v_mov_b32_e32 v0, s0
255 ; VI-NEXT: v_mov_b32_e32 v1, s1
256 ; VI-NEXT: v_mov_b32_e32 v2, s2
257 ; VI-NEXT: v_mov_b32_e32 v3, s3
258 ; VI-NEXT: s_waitcnt vmcnt(0)
259 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
260 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
261 ; VI-NEXT: flat_store_dword v[0:1], v4
262 ; VI-NEXT: s_waitcnt vmcnt(0)
263 ; VI-NEXT: ;;#ASMSTART
265 ; VI-NEXT: flat_store_byte v[2:3], v5
266 ; VI-NEXT: s_waitcnt vmcnt(0)
269 ; GFX9-LABEL: v_uaddo_i32_novcc:
271 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
272 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
273 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
274 ; GFX9-NEXT: global_load_dword v1, v0, s[12:13]
275 ; GFX9-NEXT: global_load_dword v2, v0, s[14:15]
276 ; GFX9-NEXT: s_waitcnt vmcnt(0)
277 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
278 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
279 ; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
280 ; GFX9-NEXT: s_waitcnt vmcnt(0)
281 ; GFX9-NEXT: ;;#ASMSTART
282 ; GFX9-NEXT: ;;#ASMEND
283 ; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
284 ; GFX9-NEXT: s_waitcnt vmcnt(0)
285 ; GFX9-NEXT: s_endpgm
286 %tid = call i32 @llvm.amdgcn.workitem.id.x()
287 %tid.ext = sext i32 %tid to i64
288 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
289 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
290 %a = load i32, ptr addrspace(1) %a.gep, align 4
291 %b = load i32, ptr addrspace(1) %b.gep, align 4
292 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
293 %val = extractvalue { i32, i1 } %uadd, 0
294 %carry = extractvalue { i32, i1 } %uadd, 1
295 store volatile i32 %val, ptr addrspace(1) %out, align 4
296 call void asm sideeffect "", "~{vcc}"() #0
297 store volatile i1 %carry, ptr addrspace(1) %carryout
301 define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
302 ; SI-LABEL: s_uaddo_i64:
304 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
305 ; SI-NEXT: s_mov_b32 s11, 0xf000
306 ; SI-NEXT: s_mov_b32 s10, -1
307 ; SI-NEXT: s_waitcnt lgkmcnt(0)
308 ; SI-NEXT: s_add_u32 s6, s4, s6
309 ; SI-NEXT: s_addc_u32 s7, s5, s7
310 ; SI-NEXT: s_mov_b32 s14, s10
311 ; SI-NEXT: s_mov_b32 s15, s11
312 ; SI-NEXT: s_mov_b32 s8, s0
313 ; SI-NEXT: s_mov_b32 s9, s1
314 ; SI-NEXT: s_mov_b32 s12, s2
315 ; SI-NEXT: s_mov_b32 s13, s3
316 ; SI-NEXT: v_mov_b32_e32 v0, s4
317 ; SI-NEXT: v_mov_b32_e32 v1, s5
318 ; SI-NEXT: v_mov_b32_e32 v2, s6
319 ; SI-NEXT: v_mov_b32_e32 v3, s7
320 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
321 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
322 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
323 ; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
326 ; VI-LABEL: s_uaddo_i64:
328 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
329 ; VI-NEXT: s_waitcnt lgkmcnt(0)
330 ; VI-NEXT: v_mov_b32_e32 v0, s0
331 ; VI-NEXT: s_add_u32 s0, s4, s6
332 ; VI-NEXT: v_mov_b32_e32 v4, s4
333 ; VI-NEXT: v_mov_b32_e32 v1, s1
334 ; VI-NEXT: s_addc_u32 s1, s5, s7
335 ; VI-NEXT: v_mov_b32_e32 v5, s5
336 ; VI-NEXT: v_mov_b32_e32 v7, s1
337 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
338 ; VI-NEXT: v_mov_b32_e32 v6, s0
339 ; VI-NEXT: v_mov_b32_e32 v2, s2
340 ; VI-NEXT: v_mov_b32_e32 v3, s3
341 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
342 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
343 ; VI-NEXT: flat_store_byte v[2:3], v0
346 ; GFX9-LABEL: s_uaddo_i64:
348 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
349 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
350 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
351 ; GFX9-NEXT: s_add_u32 s0, s12, s14
352 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
353 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
354 ; GFX9-NEXT: s_addc_u32 s1, s13, s15
355 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
356 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
357 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
358 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
359 ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
360 ; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
361 ; GFX9-NEXT: s_endpgm
362 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
363 %val = extractvalue { i64, i1 } %uadd, 0
364 %carry = extractvalue { i64, i1 } %uadd, 1
365 store i64 %val, ptr addrspace(1) %out, align 8
366 store i1 %carry, ptr addrspace(1) %carryout
370 define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
371 ; SI-LABEL: v_uaddo_i64:
373 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
374 ; SI-NEXT: s_mov_b32 s11, 0xf000
375 ; SI-NEXT: s_mov_b32 s10, -1
376 ; SI-NEXT: s_mov_b32 s14, s10
377 ; SI-NEXT: s_mov_b32 s15, s11
378 ; SI-NEXT: s_mov_b32 s18, s10
379 ; SI-NEXT: s_mov_b32 s19, s11
380 ; SI-NEXT: s_waitcnt lgkmcnt(0)
381 ; SI-NEXT: s_mov_b32 s12, s4
382 ; SI-NEXT: s_mov_b32 s13, s5
383 ; SI-NEXT: s_mov_b32 s16, s6
384 ; SI-NEXT: s_mov_b32 s17, s7
385 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
386 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
387 ; SI-NEXT: s_mov_b32 s6, s10
388 ; SI-NEXT: s_mov_b32 s7, s11
389 ; SI-NEXT: s_mov_b32 s8, s0
390 ; SI-NEXT: s_mov_b32 s9, s1
391 ; SI-NEXT: s_mov_b32 s4, s2
392 ; SI-NEXT: s_mov_b32 s5, s3
393 ; SI-NEXT: s_waitcnt vmcnt(0)
394 ; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
395 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
396 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
397 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
398 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
399 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
402 ; VI-LABEL: v_uaddo_i64:
404 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
405 ; VI-NEXT: s_waitcnt lgkmcnt(0)
406 ; VI-NEXT: v_mov_b32_e32 v0, s4
407 ; VI-NEXT: v_mov_b32_e32 v1, s5
408 ; VI-NEXT: v_mov_b32_e32 v2, s6
409 ; VI-NEXT: v_mov_b32_e32 v3, s7
410 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
411 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
412 ; VI-NEXT: v_mov_b32_e32 v4, s0
413 ; VI-NEXT: v_mov_b32_e32 v5, s1
414 ; VI-NEXT: v_mov_b32_e32 v6, s2
415 ; VI-NEXT: v_mov_b32_e32 v7, s3
416 ; VI-NEXT: s_waitcnt vmcnt(0)
417 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
418 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
419 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
420 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
421 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
422 ; VI-NEXT: flat_store_byte v[6:7], v0
425 ; GFX9-LABEL: v_uaddo_i64:
427 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
428 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
429 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
430 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
431 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
432 ; GFX9-NEXT: s_waitcnt vmcnt(0)
433 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
434 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
435 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
436 ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
437 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
438 ; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
439 ; GFX9-NEXT: s_endpgm
440 %tid = call i32 @llvm.amdgcn.workitem.id.x()
441 %tid.ext = sext i32 %tid to i64
442 %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
443 %b.gep = getelementptr inbounds i64, ptr addrspace(1) %b.ptr
444 %a = load i64, ptr addrspace(1) %a.gep
445 %b = load i64, ptr addrspace(1) %b.gep
446 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
447 %val = extractvalue { i64, i1 } %uadd, 0
448 %carry = extractvalue { i64, i1 } %uadd, 1
449 store i64 %val, ptr addrspace(1) %out
450 store i1 %carry, ptr addrspace(1) %carryout
454 define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
455 ; SI-LABEL: v_uaddo_i16:
457 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
458 ; SI-NEXT: s_mov_b32 s11, 0xf000
459 ; SI-NEXT: s_mov_b32 s10, -1
460 ; SI-NEXT: s_mov_b32 s14, s10
461 ; SI-NEXT: s_mov_b32 s15, s11
462 ; SI-NEXT: s_mov_b32 s18, s10
463 ; SI-NEXT: s_mov_b32 s19, s11
464 ; SI-NEXT: s_waitcnt lgkmcnt(0)
465 ; SI-NEXT: s_mov_b32 s12, s4
466 ; SI-NEXT: s_mov_b32 s13, s5
467 ; SI-NEXT: s_mov_b32 s16, s6
468 ; SI-NEXT: s_mov_b32 s17, s7
469 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
470 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
471 ; SI-NEXT: s_mov_b32 s6, s10
472 ; SI-NEXT: s_mov_b32 s7, s11
473 ; SI-NEXT: s_mov_b32 s8, s0
474 ; SI-NEXT: s_mov_b32 s9, s1
475 ; SI-NEXT: s_mov_b32 s4, s2
476 ; SI-NEXT: s_mov_b32 s5, s3
477 ; SI-NEXT: s_waitcnt vmcnt(0)
478 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
479 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
480 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
481 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
482 ; SI-NEXT: s_waitcnt expcnt(0)
483 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
484 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
487 ; VI-LABEL: v_uaddo_i16:
489 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
490 ; VI-NEXT: s_waitcnt lgkmcnt(0)
491 ; VI-NEXT: v_mov_b32_e32 v0, s4
492 ; VI-NEXT: v_mov_b32_e32 v1, s5
493 ; VI-NEXT: v_mov_b32_e32 v2, s6
494 ; VI-NEXT: v_mov_b32_e32 v3, s7
495 ; VI-NEXT: flat_load_ushort v4, v[0:1]
496 ; VI-NEXT: flat_load_ushort v5, v[2:3]
497 ; VI-NEXT: v_mov_b32_e32 v0, s0
498 ; VI-NEXT: v_mov_b32_e32 v1, s1
499 ; VI-NEXT: v_mov_b32_e32 v2, s2
500 ; VI-NEXT: v_mov_b32_e32 v3, s3
501 ; VI-NEXT: s_waitcnt vmcnt(0)
502 ; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v5
503 ; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
504 ; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5
505 ; VI-NEXT: v_cmp_lt_u32_e32 vcc, v6, v4
506 ; VI-NEXT: flat_store_short v[0:1], v5
507 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
508 ; VI-NEXT: flat_store_byte v[2:3], v0
511 ; GFX9-LABEL: v_uaddo_i16:
513 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
514 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
515 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
516 ; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
517 ; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
518 ; GFX9-NEXT: s_waitcnt vmcnt(0)
519 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v2
520 ; GFX9-NEXT: v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
521 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
522 ; GFX9-NEXT: global_store_short v0, v2, s[8:9]
523 ; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
524 ; GFX9-NEXT: s_endpgm
525 %tid = call i32 @llvm.amdgcn.workitem.id.x()
526 %tid.ext = sext i32 %tid to i64
527 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
528 %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr
529 %a = load i16, ptr addrspace(1) %a.gep
530 %b = load i16, ptr addrspace(1) %b.gep
531 %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
532 %val = extractvalue { i16, i1 } %uadd, 0
533 %carry = extractvalue { i16, i1 } %uadd, 1
534 store i16 %val, ptr addrspace(1) %out
535 store i1 %carry, ptr addrspace(1) %carryout
539 define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
540 ; SI-LABEL: v_uaddo_v2i32:
542 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
543 ; SI-NEXT: s_mov_b32 s11, 0xf000
544 ; SI-NEXT: s_mov_b32 s10, -1
545 ; SI-NEXT: s_mov_b32 s14, s10
546 ; SI-NEXT: s_mov_b32 s15, s11
547 ; SI-NEXT: s_mov_b32 s18, s10
548 ; SI-NEXT: s_mov_b32 s19, s11
549 ; SI-NEXT: s_waitcnt lgkmcnt(0)
550 ; SI-NEXT: s_mov_b32 s12, s4
551 ; SI-NEXT: s_mov_b32 s13, s5
552 ; SI-NEXT: s_mov_b32 s16, s6
553 ; SI-NEXT: s_mov_b32 s17, s7
554 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
555 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
556 ; SI-NEXT: s_mov_b32 s6, s10
557 ; SI-NEXT: s_mov_b32 s7, s11
558 ; SI-NEXT: s_mov_b32 s8, s0
559 ; SI-NEXT: s_mov_b32 s9, s1
560 ; SI-NEXT: s_mov_b32 s4, s2
561 ; SI-NEXT: s_mov_b32 s5, s3
562 ; SI-NEXT: s_waitcnt vmcnt(0)
563 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
564 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
565 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
566 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
567 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
568 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
571 ; VI-LABEL: v_uaddo_v2i32:
573 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
574 ; VI-NEXT: s_waitcnt lgkmcnt(0)
575 ; VI-NEXT: v_mov_b32_e32 v0, s4
576 ; VI-NEXT: v_mov_b32_e32 v1, s5
577 ; VI-NEXT: v_mov_b32_e32 v2, s6
578 ; VI-NEXT: v_mov_b32_e32 v3, s7
579 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
580 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
581 ; VI-NEXT: v_mov_b32_e32 v4, s0
582 ; VI-NEXT: v_mov_b32_e32 v5, s1
583 ; VI-NEXT: v_mov_b32_e32 v6, s2
584 ; VI-NEXT: v_mov_b32_e32 v7, s3
585 ; VI-NEXT: s_waitcnt vmcnt(0)
586 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
587 ; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
588 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
589 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
590 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
591 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[2:3]
594 ; GFX9-LABEL: v_uaddo_v2i32:
596 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
597 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
598 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
600 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
601 ; GFX9-NEXT: s_waitcnt vmcnt(0)
602 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
603 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
604 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
605 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
606 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
607 ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
608 ; GFX9-NEXT: s_endpgm
609 %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
610 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
611 %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
612 %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
613 %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
614 store <2 x i32> %val, ptr addrspace(1) %out, align 4
615 %carry.ext = zext <2 x i1> %carry to <2 x i32>
616 store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
620 define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
621 ; SI-LABEL: s_uaddo_clamp_bit:
622 ; SI: ; %bb.0: ; %entry
623 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
624 ; SI-NEXT: s_waitcnt lgkmcnt(0)
625 ; SI-NEXT: v_mov_b32_e32 v0, s1
626 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
627 ; SI-NEXT: s_cmp_eq_u32 s0, s1
628 ; SI-NEXT: s_mov_b64 s[0:1], 0
629 ; SI-NEXT: s_cbranch_scc1 .LBB8_2
630 ; SI-NEXT: ; %bb.1: ; %if
631 ; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
632 ; SI-NEXT: .LBB8_2: ; %exit
633 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
634 ; SI-NEXT: s_mov_b32 s3, 0xf000
635 ; SI-NEXT: s_mov_b32 s2, -1
636 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
637 ; SI-NEXT: s_mov_b32 s10, s2
638 ; SI-NEXT: s_mov_b32 s11, s3
639 ; SI-NEXT: s_waitcnt lgkmcnt(0)
640 ; SI-NEXT: s_mov_b32 s0, s4
641 ; SI-NEXT: s_mov_b32 s1, s5
642 ; SI-NEXT: s_mov_b32 s8, s6
643 ; SI-NEXT: s_mov_b32 s9, s7
644 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
645 ; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
648 ; VI-LABEL: s_uaddo_clamp_bit:
649 ; VI: ; %bb.0: ; %entry
650 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
651 ; VI-NEXT: s_waitcnt lgkmcnt(0)
652 ; VI-NEXT: v_mov_b32_e32 v0, s1
653 ; VI-NEXT: s_cmp_eq_u32 s0, s1
654 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
655 ; VI-NEXT: s_mov_b64 s[0:1], 0
656 ; VI-NEXT: s_cbranch_scc1 .LBB8_2
657 ; VI-NEXT: ; %bb.1: ; %if
658 ; VI-NEXT: s_xor_b64 s[0:1], vcc, -1
659 ; VI-NEXT: .LBB8_2: ; %exit
660 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
661 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
662 ; VI-NEXT: s_waitcnt lgkmcnt(0)
663 ; VI-NEXT: v_mov_b32_e32 v1, s4
664 ; VI-NEXT: v_mov_b32_e32 v2, s5
665 ; VI-NEXT: v_mov_b32_e32 v3, s6
666 ; VI-NEXT: v_mov_b32_e32 v4, s7
667 ; VI-NEXT: flat_store_dword v[1:2], v0
668 ; VI-NEXT: flat_store_byte v[3:4], v5
671 ; GFX9-LABEL: s_uaddo_clamp_bit:
672 ; GFX9: ; %bb.0: ; %entry
673 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
674 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
675 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
676 ; GFX9-NEXT: s_cmp_eq_u32 s0, s1
677 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
678 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
679 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2
680 ; GFX9-NEXT: ; %bb.1: ; %if
681 ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1
682 ; GFX9-NEXT: .LBB8_2: ; %exit
683 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
684 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
685 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
686 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
687 ; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
688 ; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
689 ; GFX9-NEXT: s_endpgm
691 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
692 %val = extractvalue { i32, i1 } %uadd, 0
693 %carry = extractvalue { i32, i1 } %uadd, 1
694 %c2 = icmp eq i1 %carry, false
695 %cc = icmp eq i32 %a, %b
696 br i1 %cc, label %exit, label %if
702 %cout = phi i1 [false, %entry], [%c2, %if]
703 store i32 %val, ptr addrspace(1) %out, align 4
704 store i1 %cout, ptr addrspace(1) %carryout
708 define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
709 ; SI-LABEL: v_uaddo_clamp_bit:
710 ; SI: ; %bb.0: ; %entry
711 ; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
712 ; SI-NEXT: s_mov_b32 s3, 0xf000
713 ; SI-NEXT: s_mov_b32 s2, -1
714 ; SI-NEXT: s_mov_b32 s14, s2
715 ; SI-NEXT: s_mov_b32 s15, s3
716 ; SI-NEXT: s_waitcnt lgkmcnt(0)
717 ; SI-NEXT: s_mov_b32 s0, s8
718 ; SI-NEXT: s_mov_b32 s1, s9
719 ; SI-NEXT: s_mov_b32 s12, s10
720 ; SI-NEXT: s_mov_b32 s13, s11
721 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
722 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
723 ; SI-NEXT: s_waitcnt vmcnt(0)
724 ; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
725 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
726 ; SI-NEXT: s_mov_b64 s[8:9], 0
727 ; SI-NEXT: s_cbranch_vccnz .LBB9_2
728 ; SI-NEXT: ; %bb.1: ; %if
729 ; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
730 ; SI-NEXT: .LBB9_2: ; %exit
731 ; SI-NEXT: s_mov_b32 s0, s4
732 ; SI-NEXT: s_mov_b32 s1, s5
733 ; SI-NEXT: s_mov_b32 s4, s6
734 ; SI-NEXT: s_mov_b32 s5, s7
735 ; SI-NEXT: s_mov_b32 s6, s2
736 ; SI-NEXT: s_mov_b32 s7, s3
737 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
738 ; SI-NEXT: s_waitcnt expcnt(0)
739 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
740 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
743 ; VI-LABEL: v_uaddo_clamp_bit:
744 ; VI: ; %bb.0: ; %entry
745 ; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
746 ; VI-NEXT: s_mov_b64 s[2:3], 0
747 ; VI-NEXT: s_waitcnt lgkmcnt(0)
748 ; VI-NEXT: v_mov_b32_e32 v0, s8
749 ; VI-NEXT: v_mov_b32_e32 v1, s9
750 ; VI-NEXT: v_mov_b32_e32 v2, s10
751 ; VI-NEXT: v_mov_b32_e32 v3, s11
752 ; VI-NEXT: flat_load_dword v1, v[0:1]
753 ; VI-NEXT: flat_load_dword v2, v[2:3]
754 ; VI-NEXT: s_waitcnt vmcnt(0)
755 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
756 ; VI-NEXT: v_add_u32_e64 v0, s[0:1], v1, v2
757 ; VI-NEXT: s_cbranch_vccnz .LBB9_2
758 ; VI-NEXT: ; %bb.1: ; %if
759 ; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1
760 ; VI-NEXT: .LBB9_2: ; %exit
761 ; VI-NEXT: v_mov_b32_e32 v1, s4
762 ; VI-NEXT: v_mov_b32_e32 v2, s5
763 ; VI-NEXT: v_mov_b32_e32 v3, s6
764 ; VI-NEXT: v_mov_b32_e32 v4, s7
765 ; VI-NEXT: flat_store_dword v[1:2], v0
766 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
767 ; VI-NEXT: flat_store_byte v[3:4], v0
770 ; GFX9-LABEL: v_uaddo_clamp_bit:
771 ; GFX9: ; %bb.0: ; %entry
772 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
773 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
774 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
775 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
776 ; GFX9-NEXT: global_load_dword v2, v0, s[12:13]
777 ; GFX9-NEXT: global_load_dword v3, v0, s[14:15]
778 ; GFX9-NEXT: s_waitcnt vmcnt(0)
779 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
780 ; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v2, v3
781 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_2
782 ; GFX9-NEXT: ; %bb.1: ; %if
783 ; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1
784 ; GFX9-NEXT: .LBB9_2: ; %exit
785 ; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
786 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
787 ; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
788 ; GFX9-NEXT: s_endpgm
790 %tid = call i32 @llvm.amdgcn.workitem.id.x()
791 %tid.ext = sext i32 %tid to i64
792 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
793 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
794 %a = load i32, ptr addrspace(1) %a.gep
795 %b = load i32, ptr addrspace(1) %b.gep
796 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
797 %val = extractvalue { i32, i1 } %uadd, 0
798 %carry = extractvalue { i32, i1 } %uadd, 1
799 %c2 = icmp eq i1 %carry, false
800 %cc = icmp eq i32 %a, %b
801 br i1 %cc, label %exit, label %if
807 %cout = phi i1 [false, %entry], [%c2, %if]
808 store i32 %val, ptr addrspace(1) %out, align 4
809 store i1 %cout, ptr addrspace(1) %carryout
813 define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
814 ; SI-LABEL: sv_uaddo_i128:
816 ; SI-NEXT: s_mov_b32 s7, 0xf000
817 ; SI-NEXT: s_mov_b32 s6, 0
818 ; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
819 ; SI-NEXT: v_mov_b32_e32 v6, s1
820 ; SI-NEXT: v_mov_b32_e32 v7, s2
821 ; SI-NEXT: v_mov_b32_e32 v8, s3
822 ; SI-NEXT: s_mov_b32 s4, s6
823 ; SI-NEXT: s_mov_b32 s5, s6
824 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
825 ; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc
826 ; SI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3]
827 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
828 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc
829 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
830 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
831 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
832 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
833 ; SI-NEXT: v_and_b32_e32 v2, 1, v2
834 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
837 ; VI-LABEL: sv_uaddo_i128:
839 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
840 ; VI-NEXT: v_mov_b32_e32 v6, s1
841 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
842 ; VI-NEXT: v_mov_b32_e32 v6, s2
843 ; VI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
844 ; VI-NEXT: v_mov_b32_e32 v6, s3
845 ; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
846 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
847 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
848 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
849 ; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
850 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
851 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
852 ; VI-NEXT: v_and_b32_e32 v2, 1, v2
853 ; VI-NEXT: flat_store_dword v[0:1], v2
856 ; GFX9-LABEL: sv_uaddo_i128:
858 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
859 ; GFX9-NEXT: v_mov_b32_e32 v6, s1
860 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
861 ; GFX9-NEXT: v_mov_b32_e32 v6, s2
862 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
863 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
864 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
865 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
866 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
867 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
868 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
869 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
870 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
871 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
872 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
873 ; GFX9-NEXT: s_endpgm
874 %uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
875 %carry = extractvalue { i128, i1 } %uadd, 1
876 %carry.ext = zext i1 %carry to i32
877 store i32 %carry.ext, ptr addrspace(1) %out
; Declarations of the intrinsics exercised by the test functions above.
; llvm.uadd.with.overflow.* returns {result, carry}; the AMDGPU workitem-id
; intrinsic is used (though its result is unused) in the v_* test bodies.
881 declare i32 @llvm.amdgcn.workitem.id.x() #1
882 declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
883 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
884 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
885 declare { i128, i1 } @llvm.uadd.with.overflow.i128(i128, i128) #1
886 declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
; Attribute groups referenced by the kernels (#0) and intrinsics (#1) above.
889 attributes #0 = { nounwind }
890 attributes #1 = { nounwind readnone }