1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,SICIVI,SI
3 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,SICIVI,VI
4 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,GFX9
7 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
8 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
11 declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
13 define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; NOTE(review): the SI/VI/GFX9 lines below are FileCheck assertions autogenerated
; by utils/update_llc_test_checks.py (see file header). Do not hand-edit them;
; regenerate with the script after any codegen change.
14 ; SI-LABEL: saddo_i64_zext:
16 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
17 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
18 ; SI-NEXT: s_mov_b32 s7, 0xf000
19 ; SI-NEXT: s_mov_b32 s6, -1
20 ; SI-NEXT: s_waitcnt lgkmcnt(0)
21 ; SI-NEXT: s_mov_b32 s4, s8
22 ; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
23 ; SI-NEXT: s_mov_b32 s5, s9
24 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
25 ; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
26 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
27 ; SI-NEXT: s_add_u32 s2, s10, s0
28 ; SI-NEXT: s_addc_u32 s3, s11, s1
29 ; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
30 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
31 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
32 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
33 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
34 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
35 ; SI-NEXT: v_mov_b32_e32 v1, s3
36 ; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
37 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
38 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
41 ; VI-LABEL: saddo_i64_zext:
43 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
44 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
45 ; VI-NEXT: s_waitcnt lgkmcnt(0)
46 ; VI-NEXT: v_mov_b32_e32 v0, s4
47 ; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
48 ; VI-NEXT: v_mov_b32_e32 v1, s5
49 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
50 ; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
51 ; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
52 ; VI-NEXT: s_add_u32 s2, s6, s0
53 ; VI-NEXT: s_addc_u32 s3, s7, s1
54 ; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
55 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
56 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
57 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2
58 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
59 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
60 ; VI-NEXT: v_mov_b32_e32 v3, s3
61 ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
62 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
63 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
66 ; GFX9-LABEL: saddo_i64_zext:
68 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
69 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
70 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
71 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
72 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], -1
73 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
74 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
75 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
76 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
77 ; GFX9-NEXT: s_add_u32 s2, s6, s0
78 ; GFX9-NEXT: s_addc_u32 s3, s7, s1
79 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
80 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
81 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
82 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v2
83 ; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
84 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
85 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
86 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
87 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
88 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; IR under test: i64 signed add-with-overflow on two scalar kernel arguments;
; the i1 overflow bit is zero-extended and added onto the sum, and only that
; combined i64 result is stored to %out.
90 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
91 %val = extractvalue { i64, i1 } %sadd, 0
92 %carry = extractvalue { i64, i1 } %sadd, 1
93 %ext = zext i1 %carry to i64
94 %add2 = add i64 %val, %ext
95 store i64 %add2, i64 addrspace(1)* %out, align 8
99 define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
; NOTE(review): the SI/VI/GFX9 lines below are FileCheck assertions autogenerated
; by utils/update_llc_test_checks.py (see file header). Do not hand-edit them;
; regenerate with the script after any codegen change.
100 ; SI-LABEL: s_saddo_i32:
102 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
103 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
104 ; SI-NEXT: s_mov_b32 s7, 0xf000
105 ; SI-NEXT: s_mov_b32 s6, -1
106 ; SI-NEXT: s_waitcnt lgkmcnt(0)
107 ; SI-NEXT: s_mov_b32 s4, s8
108 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
109 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
110 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
111 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
112 ; SI-NEXT: s_add_i32 s2, s0, s1
113 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
114 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
115 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
116 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
117 ; SI-NEXT: s_mov_b32 s5, s9
118 ; SI-NEXT: v_mov_b32_e32 v0, s2
119 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
120 ; SI-NEXT: s_mov_b32 s8, s10
121 ; SI-NEXT: s_mov_b32 s9, s11
122 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
123 ; SI-NEXT: s_mov_b32 s10, s6
124 ; SI-NEXT: s_mov_b32 s11, s7
125 ; SI-NEXT: s_waitcnt expcnt(0)
126 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
127 ; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0
130 ; VI-LABEL: s_saddo_i32:
132 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
133 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
134 ; VI-NEXT: s_waitcnt lgkmcnt(0)
135 ; VI-NEXT: v_mov_b32_e32 v0, s4
136 ; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
137 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
138 ; VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
139 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
140 ; VI-NEXT: s_add_i32 s2, s0, s1
141 ; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
142 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
143 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
144 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
145 ; VI-NEXT: v_mov_b32_e32 v1, s5
146 ; VI-NEXT: v_mov_b32_e32 v4, s2
147 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
148 ; VI-NEXT: flat_store_dword v[0:1], v4
149 ; VI-NEXT: v_mov_b32_e32 v2, s6
150 ; VI-NEXT: v_mov_b32_e32 v3, s7
151 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
152 ; VI-NEXT: flat_store_byte v[2:3], v0
155 ; GFX9-LABEL: s_saddo_i32:
157 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
158 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
159 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
160 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
161 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s1, -1
162 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3]
163 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s0, -1
164 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
165 ; GFX9-NEXT: s_add_i32 s2, s0, s1
166 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, -1
167 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
168 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
169 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
170 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
171 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
172 ; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
173 ; GFX9-NEXT: global_store_dword v[0:1], v4, off
174 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
175 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
176 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
177 ; GFX9-NEXT: global_store_byte v[2:3], v0, off
178 ; GFX9-NEXT: s_endpgm
; IR under test: i32 signed add-with-overflow on scalar kernel arguments;
; the i32 sum is stored to %out and the i1 overflow bit separately to
; %carryout (unlike saddo_i64_zext, both results are materialized).
179 %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
180 %val = extractvalue { i32, i1 } %sadd, 0
181 %carry = extractvalue { i32, i1 } %sadd, 1
182 store i32 %val, i32 addrspace(1)* %out, align 4
183 store i1 %carry, i1 addrspace(1)* %carryout
187 define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
; NOTE(review): the SI/VI/GFX9 lines below are FileCheck assertions autogenerated
; by utils/update_llc_test_checks.py (see file header). Do not hand-edit them;
; regenerate with the script after any codegen change.
188 ; SI-LABEL: v_saddo_i32:
190 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
191 ; SI-NEXT: s_mov_b32 s15, 0xf000
192 ; SI-NEXT: s_mov_b32 s14, -1
193 ; SI-NEXT: s_mov_b32 s2, s14
194 ; SI-NEXT: s_mov_b32 s3, s15
195 ; SI-NEXT: s_waitcnt lgkmcnt(0)
196 ; SI-NEXT: s_mov_b32 s0, s10
197 ; SI-NEXT: s_mov_b32 s1, s11
198 ; SI-NEXT: s_mov_b32 s10, s14
199 ; SI-NEXT: s_mov_b32 s11, s15
200 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
201 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
202 ; SI-NEXT: s_mov_b32 s12, s6
203 ; SI-NEXT: s_mov_b32 s13, s7
204 ; SI-NEXT: s_mov_b32 s6, s14
205 ; SI-NEXT: s_mov_b32 s7, s15
206 ; SI-NEXT: s_waitcnt vmcnt(0)
207 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
208 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
209 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
210 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
211 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
212 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0
213 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
214 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
215 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v3, v1
216 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
217 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
218 ; SI-NEXT: s_waitcnt expcnt(0)
219 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
220 ; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
223 ; VI-LABEL: v_saddo_i32:
225 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
226 ; VI-NEXT: s_waitcnt lgkmcnt(0)
227 ; VI-NEXT: v_mov_b32_e32 v4, s6
228 ; VI-NEXT: v_mov_b32_e32 v5, s7
229 ; VI-NEXT: v_mov_b32_e32 v6, s4
230 ; VI-NEXT: v_mov_b32_e32 v7, s5
231 ; VI-NEXT: flat_load_dword v6, v[6:7]
232 ; VI-NEXT: flat_load_dword v4, v[4:5]
233 ; VI-NEXT: v_mov_b32_e32 v2, s0
234 ; VI-NEXT: v_mov_b32_e32 v3, s1
235 ; VI-NEXT: v_mov_b32_e32 v0, s2
236 ; VI-NEXT: v_mov_b32_e32 v1, s3
237 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
238 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
239 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
240 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
241 ; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
242 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
243 ; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
244 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
245 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
246 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5
247 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
248 ; VI-NEXT: flat_store_dword v[2:3], v4
249 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
250 ; VI-NEXT: flat_store_byte v[0:1], v2
253 ; GFX9-LABEL: v_saddo_i32:
255 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
256 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
257 ; GFX9-NEXT: v_mov_b32_e32 v4, s6
258 ; GFX9-NEXT: v_mov_b32_e32 v5, s7
259 ; GFX9-NEXT: v_mov_b32_e32 v6, s4
260 ; GFX9-NEXT: v_mov_b32_e32 v7, s5
261 ; GFX9-NEXT: global_load_dword v6, v[6:7], off
262 ; GFX9-NEXT: global_load_dword v4, v[4:5], off
263 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
264 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
265 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
266 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
267 ; GFX9-NEXT: s_waitcnt vmcnt(0)
268 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
269 ; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
270 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
271 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
272 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
273 ; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
274 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
275 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
276 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v7, v5
277 ; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
278 ; GFX9-NEXT: global_store_dword v[2:3], v4, off
279 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
280 ; GFX9-NEXT: global_store_byte v[0:1], v2, off
281 ; GFX9-NEXT: s_endpgm
; IR under test: i32 signed add-with-overflow where both operands come from
; global-memory loads (the checks above show the add/compares done with VALU
; instructions on the loaded values); i32 sum to %out, i1 carry to %carryout.
282 %a = load i32, i32 addrspace(1)* %aptr, align 4
283 %b = load i32, i32 addrspace(1)* %bptr, align 4
284 %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
285 %val = extractvalue { i32, i1 } %sadd, 0
286 %carry = extractvalue { i32, i1 } %sadd, 1
287 store i32 %val, i32 addrspace(1)* %out, align 4
288 store i1 %carry, i1 addrspace(1)* %carryout
292 define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
; NOTE(review): the SI/VI/GFX9 lines below are FileCheck assertions autogenerated
; by utils/update_llc_test_checks.py (see file header). Do not hand-edit them;
; regenerate with the script after any codegen change.
293 ; SI-LABEL: s_saddo_i64:
295 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
296 ; SI-NEXT: s_mov_b32 s15, 0xf000
297 ; SI-NEXT: s_mov_b32 s14, -1
298 ; SI-NEXT: s_waitcnt lgkmcnt(0)
299 ; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], -1
300 ; SI-NEXT: s_add_u32 s2, s8, s10
301 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
302 ; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[8:9], -1
303 ; SI-NEXT: s_addc_u32 s3, s9, s11
304 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
305 ; SI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
306 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0
307 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
308 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, v0
309 ; SI-NEXT: v_mov_b32_e32 v0, s2
310 ; SI-NEXT: s_mov_b32 s12, s6
311 ; SI-NEXT: s_mov_b32 s13, s7
312 ; SI-NEXT: s_mov_b32 s6, s14
313 ; SI-NEXT: s_mov_b32 s7, s15
314 ; SI-NEXT: v_mov_b32_e32 v1, s3
315 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
316 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
317 ; SI-NEXT: s_waitcnt expcnt(0)
318 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
319 ; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
322 ; VI-LABEL: s_saddo_i64:
324 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
325 ; VI-NEXT: s_waitcnt lgkmcnt(0)
326 ; VI-NEXT: v_mov_b32_e32 v2, s0
327 ; VI-NEXT: v_mov_b32_e32 v3, s1
328 ; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
329 ; VI-NEXT: v_mov_b32_e32 v0, s2
330 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
331 ; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
332 ; VI-NEXT: s_add_u32 s2, s4, s6
333 ; VI-NEXT: v_mov_b32_e32 v1, s3
334 ; VI-NEXT: s_addc_u32 s3, s5, s7
335 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
336 ; VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
337 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
338 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
339 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
340 ; VI-NEXT: v_mov_b32_e32 v5, s3
341 ; VI-NEXT: v_mov_b32_e32 v4, s2
342 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
343 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
344 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
345 ; VI-NEXT: flat_store_byte v[0:1], v2
348 ; GFX9-LABEL: s_saddo_i64:
350 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
351 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
352 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
353 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
354 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
355 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
356 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
357 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
358 ; GFX9-NEXT: s_add_u32 s2, s4, s6
359 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
360 ; GFX9-NEXT: s_addc_u32 s3, s5, s7
361 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
362 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], -1
363 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
364 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
365 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v4
366 ; GFX9-NEXT: v_mov_b32_e32 v5, s3
367 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
368 ; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
369 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
370 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
371 ; GFX9-NEXT: global_store_byte v[0:1], v2, off
372 ; GFX9-NEXT: s_endpgm
; IR under test: i64 signed add-with-overflow on scalar kernel arguments;
; the i64 sum is stored to %out and the i1 overflow bit to %carryout.
373 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
374 %val = extractvalue { i64, i1 } %sadd, 0
375 %carry = extractvalue { i64, i1 } %sadd, 1
376 store i64 %val, i64 addrspace(1)* %out, align 8
377 store i1 %carry, i1 addrspace(1)* %carryout
381 define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
; NOTE(review): the SI/VI/GFX9 lines below are FileCheck assertions autogenerated
; by utils/update_llc_test_checks.py (see file header). Do not hand-edit them;
; regenerate with the script after any codegen change.
382 ; SI-LABEL: v_saddo_i64:
384 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
385 ; SI-NEXT: s_mov_b32 s15, 0xf000
386 ; SI-NEXT: s_mov_b32 s14, -1
387 ; SI-NEXT: s_mov_b32 s2, s14
388 ; SI-NEXT: s_mov_b32 s3, s15
389 ; SI-NEXT: s_waitcnt lgkmcnt(0)
390 ; SI-NEXT: s_mov_b32 s0, s10
391 ; SI-NEXT: s_mov_b32 s1, s11
392 ; SI-NEXT: s_mov_b32 s10, s14
393 ; SI-NEXT: s_mov_b32 s11, s15
394 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
395 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
396 ; SI-NEXT: s_mov_b32 s12, s6
397 ; SI-NEXT: s_mov_b32 s13, s7
398 ; SI-NEXT: s_mov_b32 s6, s14
399 ; SI-NEXT: s_mov_b32 s7, s15
400 ; SI-NEXT: s_waitcnt vmcnt(0)
401 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
402 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
403 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
404 ; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
405 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
406 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
407 ; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[0:1]
408 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
409 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
410 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v2
411 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
412 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
413 ; SI-NEXT: s_waitcnt expcnt(0)
414 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
415 ; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
418 ; VI-LABEL: v_saddo_i64:
420 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
421 ; VI-NEXT: s_waitcnt lgkmcnt(0)
422 ; VI-NEXT: v_mov_b32_e32 v4, s6
423 ; VI-NEXT: v_mov_b32_e32 v5, s7
424 ; VI-NEXT: v_mov_b32_e32 v6, s4
425 ; VI-NEXT: v_mov_b32_e32 v7, s5
426 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
427 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
428 ; VI-NEXT: v_mov_b32_e32 v2, s0
429 ; VI-NEXT: v_mov_b32_e32 v3, s1
430 ; VI-NEXT: v_mov_b32_e32 v0, s2
431 ; VI-NEXT: v_mov_b32_e32 v1, s3
432 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
433 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
434 ; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
435 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
436 ; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
437 ; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
438 ; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
439 ; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
440 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
441 ; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
442 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6
443 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
444 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
445 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
446 ; VI-NEXT: flat_store_byte v[0:1], v2
449 ; GFX9-LABEL: v_saddo_i64:
451 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
452 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
453 ; GFX9-NEXT: v_mov_b32_e32 v4, s6
454 ; GFX9-NEXT: v_mov_b32_e32 v5, s7
455 ; GFX9-NEXT: v_mov_b32_e32 v6, s4
456 ; GFX9-NEXT: v_mov_b32_e32 v7, s5
457 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
458 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
459 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
460 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
461 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
462 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
463 ; GFX9-NEXT: s_waitcnt vmcnt(0)
464 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
465 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
466 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
467 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
468 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4
469 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v5, vcc
470 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
471 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
472 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
473 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], v9, v6
474 ; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
475 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
476 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
477 ; GFX9-NEXT: global_store_byte v[0:1], v2, off
478 ; GFX9-NEXT: s_endpgm
; IR under test: i64 signed add-with-overflow with both operands loaded from
; global memory; i64 sum to %out, i1 carry to %carryout. Note the i64 loads
; are marked `align 4` (below i64's natural 8-byte alignment) while the store
; uses `align 8`.
479 %a = load i64, i64 addrspace(1)* %aptr, align 4
480 %b = load i64, i64 addrspace(1)* %bptr, align 4
481 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
482 %val = extractvalue { i64, i1 } %sadd, 0
483 %carry = extractvalue { i64, i1 } %sadd, 1
484 store i64 %val, i64 addrspace(1)* %out, align 8
485 store i1 %carry, i1 addrspace(1)* %carryout
489 define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
490 ; SI-LABEL: v_saddo_v2i32:
492 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
493 ; SI-NEXT: s_mov_b32 s15, 0xf000
494 ; SI-NEXT: s_mov_b32 s14, -1
495 ; SI-NEXT: s_mov_b32 s2, s14
496 ; SI-NEXT: s_mov_b32 s3, s15
497 ; SI-NEXT: s_waitcnt lgkmcnt(0)
498 ; SI-NEXT: s_mov_b32 s0, s10
499 ; SI-NEXT: s_mov_b32 s1, s11
500 ; SI-NEXT: s_mov_b32 s10, s14
501 ; SI-NEXT: s_mov_b32 s11, s15
502 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
503 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
504 ; SI-NEXT: s_mov_b32 s12, s6
505 ; SI-NEXT: s_mov_b32 s13, s7
506 ; SI-NEXT: s_mov_b32 s6, s14
507 ; SI-NEXT: s_mov_b32 s7, s15
508 ; SI-NEXT: s_waitcnt vmcnt(0)
509 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
510 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
511 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
512 ; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
513 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
514 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
515 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
516 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
517 ; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
518 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
519 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v0
520 ; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v1
521 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
522 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v6, v2
523 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
524 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
525 ; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], v6, v2
526 ; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
527 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
528 ; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], v5, v3
529 ; SI-NEXT: s_waitcnt expcnt(0)
530 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
531 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
532 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
533 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
536 ; VI-LABEL: v_saddo_v2i32:
538 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
539 ; VI-NEXT: s_waitcnt lgkmcnt(0)
540 ; VI-NEXT: v_mov_b32_e32 v4, s6
541 ; VI-NEXT: v_mov_b32_e32 v5, s7
542 ; VI-NEXT: v_mov_b32_e32 v6, s4
543 ; VI-NEXT: v_mov_b32_e32 v7, s5
544 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
545 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
546 ; VI-NEXT: v_mov_b32_e32 v0, s2
547 ; VI-NEXT: v_mov_b32_e32 v1, s3
548 ; VI-NEXT: v_mov_b32_e32 v2, s0
549 ; VI-NEXT: v_mov_b32_e32 v3, s1
550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
551 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
552 ; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
553 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
554 ; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
555 ; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
556 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
557 ; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
558 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
559 ; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
560 ; VI-NEXT: v_add_u32_e32 v5, vcc, v7, v5
561 ; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
562 ; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5
563 ; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
564 ; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6
565 ; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
566 ; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6
567 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
568 ; VI-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7
569 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
570 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
571 ; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
572 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
573 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
574 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
577 ; GFX9-LABEL: v_saddo_v2i32:
579 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
580 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
581 ; GFX9-NEXT: v_mov_b32_e32 v4, s6
582 ; GFX9-NEXT: v_mov_b32_e32 v5, s7
583 ; GFX9-NEXT: v_mov_b32_e32 v6, s4
584 ; GFX9-NEXT: v_mov_b32_e32 v7, s5
585 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
586 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
587 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
588 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
589 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
590 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
591 ; GFX9-NEXT: s_waitcnt vmcnt(0)
592 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
593 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
594 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v6
595 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
596 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
597 ; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
598 ; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
599 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
600 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
601 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
602 ; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], -1, v4
603 ; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], -1, v5
604 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1]
605 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], v10, v6
606 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
607 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], v10, v6
608 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v9, v8
609 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], v9, v7
610 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
611 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
612 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
613 ; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[2:3]
614 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
615 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
616 ; GFX9-NEXT: s_endpgm
617 %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
618 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
619 %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
620 %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
621 %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
622 store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
623 %carry.ext = zext <2 x i1> %carry to <2 x i32>
624 store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout