; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s

define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_sub_u32 s4, s6, s8
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_subb_u32 s5, s7, s9
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_sub_u32 s0, s6, s0
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_subb_u32 s1, s7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_i64_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_sub_u32 s0, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_subb_u32 s1, s7, s3
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
  %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
  %val = extractvalue { i64, i1 } %usub, 0
  %carry = extractvalue { i64, i1 } %usub, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, ptr addrspace(1) %out, align 8
  ret void
}

; FIXME: Could do scalar
define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
  %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %usub, 0
  %carry = extractvalue { i32, i1 } %usub, 1
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %usub, 0
  %carry = extractvalue { i32, i1 } %usub, 1
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i32_novcc:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ;;#ASMEND
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i32_novcc:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ;;#ASMEND
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i32_novcc:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store volatile i32 %val, ptr addrspace(1) %out, align 4
  call void asm sideeffect "", "~{vcc}"() #0
  store volatile i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s6, s4, s6
; SI-NEXT: s_subb_u32 s7, s5, s7
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_subb_u32 s1, s5, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_u32 s6, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_subb_u32 s7, s5, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
; GFX9-NEXT: s_endpgm
  %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %usub, 0
  %carry = extractvalue { i64, i1 } %usub, 1
  store i64 %val, ptr addrspace(1) %out, align 8
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i64, ptr addrspace(1) %b.ptr
  %a = load i64, ptr addrspace(1) %a.gep
  %b = load i64, ptr addrspace(1) %b.gep
  %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %usub, 0
  %carry = extractvalue { i64, i1 } %usub, 1
  store i64 %val, ptr addrspace(1) %out, align 8
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_gt_u16_e32 vcc, v5, v4
; VI-NEXT: flat_store_short v[0:1], v5
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_short v0, v2, s[0:1]
; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr
  %a = load i16, ptr addrspace(1) %a.gep
  %b = load i16, ptr addrspace(1) %b.gep
  %usub = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
  %val = extractvalue { i16, i1 } %usub, 0
  %carry = extractvalue { i16, i1 } %usub, 1
  store i16 %val, ptr addrspace(1) %out
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_usubo_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[6:7], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
; GFX9-NEXT: s_endpgm
  %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
  %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, ptr addrspace(1) %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_usubo_clamp_bit:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; SI-NEXT: s_cmp_eq_u32 s2, s3
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[2:3], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_clamp_bit:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_cmp_eq_u32 s2, s3
; VI-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; VI-NEXT: s_mov_b64 s[2:3], 0
; VI-NEXT: s_cbranch_scc1 .LBB8_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: s_xor_b64 s[2:3], vcc, -1
; VI-NEXT: .LBB8_2: ; %exit
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: flat_store_dword v[1:2], v0
; VI-NEXT: flat_store_byte v[3:4], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_clamp_bit:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_cmp_eq_u32 s2, s3
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1
; GFX9-NEXT: .LBB8_2: ; %exit
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: global_store_byte v1, v2, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
  %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %usub, 0
  %carry = extractvalue { i32, i1 } %usub, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %cout, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_clamp_bit:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_mov_b32 s12, s10
; SI-NEXT: s_mov_b32 s13, s11
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
; SI-NEXT: .LBB9_2: ; %exit
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_clamp_bit:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_mov_b64 s[2:3], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; VI-NEXT: v_sub_u32_e64 v0, s[0:1], v1, v2
; VI-NEXT: s_cbranch_vccnz .LBB9_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1
; VI-NEXT: .LBB9_2: ; %exit
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: flat_store_dword v[1:2], v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: flat_store_byte v[3:4], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_clamp_bit:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[8:9]
; GFX9-NEXT: global_load_dword v3, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX9-NEXT: v_sub_co_u32_e64 v1, s[0:1], v2, v3
; GFX9-NEXT: s_cbranch_vccnz .LBB9_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1
; GFX9-NEXT: .LBB9_2: ; %exit
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %usub, 0
  %carry = extractvalue { i32, i1 } %usub, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %cout, ptr addrspace(1) %carryout
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
declare { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }