; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10

declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
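
; i64 signed add with overflow where the overflow bit is zero-extended and
; added back into the 64-bit result before the store.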
define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: saddo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_add_u32 s10, s6, s8
; SI-NEXT: s_addc_u32 s11, s7, s9
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: saddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_add_u32 s2, s6, s0
; VI-NEXT: s_addc_u32 s3, s7, s1
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: saddo_i64_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_add_u32 s0, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s1, s7, s3
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: saddo_i64_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s6, s2
; GFX10-NEXT: s_addc_u32 s1, s7, s3
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX10-NEXT: s_xor_b32 s2, s2, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, i64 addrspace(1)* %out, align 8
  ret void
}
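
; i32 signed add with overflow on uniform (SGPR) operands; the result and the
; overflow bit are stored separately.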
define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_saddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: v_cmp_lt_i32_e64 s[10:11], s9, 0
; SI-NEXT: s_add_i32 s9, s8, s9
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cmp_lt_i32_e32 vcc, s9, v0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0
; VI-NEXT: s_add_i32 s1, s0, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_saddo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_add_i32 s0, s2, s3
; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v0, s2, s3 clamp
; GFX10-NEXT: s_add_i32 s0, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dword v1, v2, s[4:5]
; GFX10-NEXT: global_store_byte v1, v0, s[6:7]
; GFX10-NEXT: s_endpgm
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}
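
; Same i32 saddo check, but with both operands loaded from memory so the add
; and overflow computation are selected on VGPRs.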
define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v4
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: flat_store_dword v[0:1], v6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
  %a = load i32, i32 addrspace(1)* %aptr, align 4
  %b = load i32, i32 addrspace(1)* %bptr, align 4
  %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
  %val = extractvalue { i32, i1 } %sadd, 0
  %carry = extractvalue { i32, i1 } %sadd, 1
  store i32 %val, i32 addrspace(1)* %out, align 4
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}
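
; i64 signed add with overflow on uniform (SGPR) operands.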
define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; SI-LABEL: s_saddo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s12, s4, s6
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_addc_u32 s13, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s8, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_addc_u32 s9, s5, s7
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s8, s4, s6
; GFX10-NEXT: s_addc_u32 s9, s5, s7
; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_xor_b32 s4, s6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: global_store_byte v2, v3, s[2:3]
; GFX10-NEXT: s_endpgm
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}
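
; i64 signed add with overflow on operands loaded from memory.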
define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v6, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX10-NEXT: global_store_byte v6, v0, s[6:7]
; GFX10-NEXT: s_endpgm
  %a = load i64, i64 addrspace(1)* %aptr, align 4
  %b = load i64, i64 addrspace(1)* %bptr, align 4
  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
  %val = extractvalue { i64, i1 } %sadd, 0
  %carry = extractvalue { i64, i1 } %sadd, 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  store i1 %carry, i1 addrspace(1)* %carryout
  ret void
}
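
; <2 x i32> signed add with overflow; the vector of overflow bits is
; zero-extended to <2 x i32> before being stored.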
define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
; SI-LABEL: v_saddo_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v5, v0, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_add_i32 v2, v1, v3 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v5
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v5, v1, v3 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_add_nc_i32 v6, v0, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
; GFX10-NEXT: s_endpgm
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
  ret void
}