1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
6 define i32 @v_uaddo_i32(i32 %a, i32 %b) {
7 ; GFX7-LABEL: v_uaddo_i32:
9 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
12 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
13 ; GFX7-NEXT: s_setpc_b64 s[30:31]
15 ; GFX8-LABEL: v_uaddo_i32:
17 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
19 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
20 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
21 ; GFX8-NEXT: s_setpc_b64 s[30:31]
23 ; GFX9-LABEL: v_uaddo_i32:
25 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
27 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
28 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
29 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30 %uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
31 %add = extractvalue {i32, i1} %uaddo, 0
32 %of = extractvalue {i32, i1} %uaddo, 1
33 %of.zext = zext i1 %of to i32
34 %ret = add i32 %add, %of.zext
38 define i64 @v_uaddo_i64(i64 %a, i64 %b) {
39 ; GFX7-LABEL: v_uaddo_i64:
41 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
43 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
44 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
45 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
46 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
47 ; GFX7-NEXT: s_setpc_b64 s[30:31]
49 ; GFX8-LABEL: v_uaddo_i64:
51 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
53 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
54 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
55 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
56 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
57 ; GFX8-NEXT: s_setpc_b64 s[30:31]
59 ; GFX9-LABEL: v_uaddo_i64:
61 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
63 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
64 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
65 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
66 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
67 ; GFX9-NEXT: s_setpc_b64 s[30:31]
68 %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
69 %add = extractvalue {i64, i1} %uaddo, 0
70 %of = extractvalue {i64, i1} %uaddo, 1
71 %of.zext = zext i1 %of to i64
72 %ret = add i64 %add, %of.zext
76 define i8 @v_uaddo_i8(i8 %a, i8 %b) {
77 ; GFX7-LABEL: v_uaddo_i8:
79 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
81 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
82 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
83 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0
84 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
85 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
86 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
87 ; GFX7-NEXT: s_setpc_b64 s[30:31]
89 ; GFX8-LABEL: v_uaddo_i8:
91 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
93 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
94 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
95 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0
96 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
97 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
98 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
99 ; GFX8-NEXT: s_setpc_b64 s[30:31]
101 ; GFX9-LABEL: v_uaddo_i8:
103 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104 ; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
105 ; GFX9-NEXT: v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
106 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
107 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
108 ; GFX9-NEXT: s_setpc_b64 s[30:31]
109 %uaddo = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
110 %add = extractvalue {i8, i1} %uaddo, 0
111 %of = extractvalue {i8, i1} %uaddo, 1
112 %of.zext = zext i1 %of to i8
113 %ret = add i8 %add, %of.zext
117 define i7 @v_uaddo_i7(i7 %a, i7 %b) {
118 ; GFX7-LABEL: v_uaddo_i7:
120 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0
122 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1
123 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
124 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0
125 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
126 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
127 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
128 ; GFX7-NEXT: s_setpc_b64 s[30:31]
130 ; GFX8-LABEL: v_uaddo_i7:
132 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
133 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0
134 ; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1
135 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
136 ; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0
137 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
138 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
139 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
140 ; GFX8-NEXT: s_setpc_b64 s[30:31]
142 ; GFX9-LABEL: v_uaddo_i7:
144 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0
146 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1
147 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
148 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0
149 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
150 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
151 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
152 ; GFX9-NEXT: s_setpc_b64 s[30:31]
153 %uaddo = call {i7, i1} @llvm.uadd.with.overflow.i7(i7 %a, i7 %b)
154 %add = extractvalue {i7, i1} %uaddo, 0
155 %of = extractvalue {i7, i1} %uaddo, 1
156 %of.zext = zext i1 %of to i7
157 %ret = add i7 %add, %of.zext
161 define <2 x i32> @v_uaddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
162 ; GFX7-LABEL: v_uaddo_v2i32:
164 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
166 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
167 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
168 ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
169 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
170 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
171 ; GFX7-NEXT: s_setpc_b64 s[30:31]
173 ; GFX8-LABEL: v_uaddo_v2i32:
175 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
177 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
178 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
179 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
180 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
181 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
182 ; GFX8-NEXT: s_setpc_b64 s[30:31]
184 ; GFX9-LABEL: v_uaddo_v2i32:
186 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
187 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
188 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
189 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
190 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
191 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
192 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
193 ; GFX9-NEXT: s_setpc_b64 s[30:31]
194 %uaddo = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
195 %add = extractvalue {<2 x i32>, <2 x i1>} %uaddo, 0
196 %of = extractvalue {<2 x i32>, <2 x i1>} %uaddo, 1
197 %of.zext = zext <2 x i1> %of to <2 x i32>
198 %ret = add <2 x i32> %add, %of.zext
202 define i32 @v_saddo_i32(i32 %a, i32 %b) {
203 ; GFX7-LABEL: v_saddo_i32:
205 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
206 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v1
207 ; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0
208 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
209 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
210 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
211 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
212 ; GFX7-NEXT: s_setpc_b64 s[30:31]
214 ; GFX8-LABEL: v_saddo_i32:
216 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
218 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0
219 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
220 ; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
221 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
222 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
223 ; GFX8-NEXT: s_setpc_b64 s[30:31]
225 ; GFX9-LABEL: v_saddo_i32:
227 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
229 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0
230 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
231 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
232 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
233 ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
234 ; GFX9-NEXT: s_setpc_b64 s[30:31]
235 %saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
236 %add = extractvalue {i32, i1} %saddo, 0
237 %of = extractvalue {i32, i1} %saddo, 1
238 %of.zext = zext i1 %of to i32
239 %ret = add i32 %add, %of.zext
243 define i64 @v_saddo_i64(i64 %a, i64 %b) {
244 ; GFX7-LABEL: v_saddo_i64:
246 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v0, v2
248 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
249 ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
250 ; GFX7-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
251 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
252 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
253 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
254 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
255 ; GFX7-NEXT: s_setpc_b64 s[30:31]
257 ; GFX8-LABEL: v_saddo_i64:
259 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
261 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
262 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
263 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
264 ; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
265 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
266 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
267 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
268 ; GFX8-NEXT: s_setpc_b64 s[30:31]
270 ; GFX9-LABEL: v_saddo_i64:
272 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
273 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
274 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
275 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
276 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
277 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
278 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
279 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
280 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
281 ; GFX9-NEXT: s_setpc_b64 s[30:31]
282 %saddo = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
283 %add = extractvalue {i64, i1} %saddo, 0
284 %of = extractvalue {i64, i1} %saddo, 1
285 %of.zext = zext i1 %of to i64
286 %ret = add i64 %add, %of.zext
290 define <2 x i32> @v_saddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
291 ; GFX7-LABEL: v_saddo_v2i32:
293 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v0, v2
295 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v1, v3
296 ; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0
297 ; GFX7-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
298 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2
299 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v3
300 ; GFX7-NEXT: s_xor_b64 s[6:7], s[6:7], vcc
301 ; GFX7-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
302 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
303 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
304 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
305 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v5, v1
306 ; GFX7-NEXT: s_setpc_b64 s[30:31]
308 ; GFX8-LABEL: v_saddo_v2i32:
310 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
312 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v1, v3
313 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0
314 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
315 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2
316 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v3
317 ; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], vcc
318 ; GFX8-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
319 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
320 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
321 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
322 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1
323 ; GFX8-NEXT: s_setpc_b64 s[30:31]
325 ; GFX9-LABEL: v_saddo_v2i32:
327 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328 ; GFX9-NEXT: v_add_u32_e32 v4, v0, v2
329 ; GFX9-NEXT: v_add_u32_e32 v5, v1, v3
330 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0
331 ; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
332 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2
333 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v3
334 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], vcc
335 ; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
336 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
337 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
338 ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
339 ; GFX9-NEXT: v_add_u32_e32 v1, v5, v1
340 ; GFX9-NEXT: s_setpc_b64 s[30:31]
341 %saddo = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
342 %add = extractvalue {<2 x i32>, <2 x i1>} %saddo, 0
343 %of = extractvalue {<2 x i32>, <2 x i1>} %saddo, 1
344 %of.zext = zext <2 x i1> %of to <2 x i32>
345 %ret = add <2 x i32> %add, %of.zext
349 define i8 @v_saddo_i8(i8 %a, i8 %b) {
350 ; GFX7-LABEL: v_saddo_i8:
352 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v1
354 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
355 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 8
356 ; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
357 ; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 8
358 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
359 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
360 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
361 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
362 ; GFX7-NEXT: s_setpc_b64 s[30:31]
364 ; GFX8-LABEL: v_saddo_i8:
366 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
368 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 8
369 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
370 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
371 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 8
372 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
373 ; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
374 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
375 ; GFX8-NEXT: v_add_u16_e32 v0, v2, v0
376 ; GFX8-NEXT: s_setpc_b64 s[30:31]
378 ; GFX9-LABEL: v_saddo_i8:
380 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
381 ; GFX9-NEXT: v_add_u16_e32 v2, v0, v1
382 ; GFX9-NEXT: v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
383 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
384 ; GFX9-NEXT: v_cmp_lt_i32_sdwa s[6:7], sext(v1), v0 src0_sel:BYTE_0 src1_sel:DWORD
385 ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
386 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
387 ; GFX9-NEXT: v_add_u16_e32 v0, v2, v0
388 ; GFX9-NEXT: s_setpc_b64 s[30:31]
389 %saddo = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
390 %add = extractvalue {i8, i1} %saddo, 0
391 %of = extractvalue {i8, i1} %saddo, 1
392 %of.zext = zext i1 %of to i8
393 %ret = add i8 %add, %of.zext
397 define i7 @v_saddo_i7(i7 %a, i7 %b) {
398 ; GFX7-LABEL: v_saddo_i7:
400 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v1
402 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 7
403 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 7
404 ; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
405 ; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 7
406 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
407 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
408 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
409 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
410 ; GFX7-NEXT: s_setpc_b64 s[30:31]
412 ; GFX8-LABEL: v_saddo_i7:
414 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
415 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
416 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 7
417 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 7
418 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
419 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 7
420 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
421 ; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
422 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
423 ; GFX8-NEXT: v_add_u16_e32 v0, v2, v0
424 ; GFX8-NEXT: s_setpc_b64 s[30:31]
426 ; GFX9-LABEL: v_saddo_i7:
428 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
429 ; GFX9-NEXT: v_add_u16_e32 v2, v0, v1
430 ; GFX9-NEXT: v_bfe_i32 v3, v2, 0, 7
431 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 7
432 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
433 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 7
434 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
435 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
436 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
437 ; GFX9-NEXT: v_add_u16_e32 v0, v2, v0
438 ; GFX9-NEXT: s_setpc_b64 s[30:31]
439 %saddo = call {i7, i1} @llvm.sadd.with.overflow.i7(i7 %a, i7 %b)
440 %add = extractvalue {i7, i1} %saddo, 0
441 %of = extractvalue {i7, i1} %saddo, 1
442 %of.zext = zext i1 %of to i7
443 %ret = add i7 %add, %of.zext
447 define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
448 ; GFX7-LABEL: s_uaddo_i32:
450 ; GFX7-NEXT: s_add_u32 s0, s0, s1
451 ; GFX7-NEXT: s_cselect_b32 s1, 1, 0
452 ; GFX7-NEXT: s_add_i32 s0, s0, s1
453 ; GFX7-NEXT: ; return to shader part epilog
455 ; GFX8-LABEL: s_uaddo_i32:
457 ; GFX8-NEXT: s_add_u32 s0, s0, s1
458 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0
459 ; GFX8-NEXT: s_add_i32 s0, s0, s1
460 ; GFX8-NEXT: ; return to shader part epilog
462 ; GFX9-LABEL: s_uaddo_i32:
464 ; GFX9-NEXT: s_add_u32 s0, s0, s1
465 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0
466 ; GFX9-NEXT: s_add_i32 s0, s0, s1
467 ; GFX9-NEXT: ; return to shader part epilog
468 %uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
469 %add = extractvalue {i32, i1} %uaddo, 0
470 %of = extractvalue {i32, i1} %uaddo, 1
471 %of.zext = zext i1 %of to i32
472 %ret = add i32 %add, %of.zext
476 define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
477 ; GFX7-LABEL: s_uaddo_i64:
479 ; GFX7-NEXT: s_add_u32 s0, s0, s2
480 ; GFX7-NEXT: s_addc_u32 s1, s1, s3
481 ; GFX7-NEXT: s_cselect_b32 s2, 1, 0
482 ; GFX7-NEXT: s_add_u32 s0, s0, s2
483 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
484 ; GFX7-NEXT: ; return to shader part epilog
486 ; GFX8-LABEL: s_uaddo_i64:
488 ; GFX8-NEXT: s_add_u32 s0, s0, s2
489 ; GFX8-NEXT: s_addc_u32 s1, s1, s3
490 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
491 ; GFX8-NEXT: s_add_u32 s0, s0, s2
492 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
493 ; GFX8-NEXT: ; return to shader part epilog
495 ; GFX9-LABEL: s_uaddo_i64:
497 ; GFX9-NEXT: s_add_u32 s0, s0, s2
498 ; GFX9-NEXT: s_addc_u32 s1, s1, s3
499 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
500 ; GFX9-NEXT: s_add_u32 s0, s0, s2
501 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
502 ; GFX9-NEXT: ; return to shader part epilog
503 %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
504 %add = extractvalue {i64, i1} %uaddo, 0
505 %of = extractvalue {i64, i1} %uaddo, 1
506 %of.zext = zext i1 %of to i64
507 %ret = add i64 %add, %of.zext
511 define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b) {
512 ; GFX7-LABEL: s_uaddo_v2i32:
514 ; GFX7-NEXT: s_add_u32 s0, s0, s2
515 ; GFX7-NEXT: s_cselect_b32 s2, 1, 0
516 ; GFX7-NEXT: s_add_u32 s1, s1, s3
517 ; GFX7-NEXT: s_cselect_b32 s3, 1, 0
518 ; GFX7-NEXT: s_add_i32 s0, s0, s2
519 ; GFX7-NEXT: s_add_i32 s1, s1, s3
520 ; GFX7-NEXT: ; return to shader part epilog
522 ; GFX8-LABEL: s_uaddo_v2i32:
524 ; GFX8-NEXT: s_add_u32 s0, s0, s2
525 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
526 ; GFX8-NEXT: s_add_u32 s1, s1, s3
527 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0
528 ; GFX8-NEXT: s_add_i32 s0, s0, s2
529 ; GFX8-NEXT: s_add_i32 s1, s1, s3
530 ; GFX8-NEXT: ; return to shader part epilog
532 ; GFX9-LABEL: s_uaddo_v2i32:
534 ; GFX9-NEXT: s_add_u32 s0, s0, s2
535 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
536 ; GFX9-NEXT: s_add_u32 s1, s1, s3
537 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0
538 ; GFX9-NEXT: s_add_i32 s0, s0, s2
539 ; GFX9-NEXT: s_add_i32 s1, s1, s3
540 ; GFX9-NEXT: ; return to shader part epilog
541 %uaddo = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
542 %add = extractvalue {<2 x i32>, <2 x i1>} %uaddo, 0
543 %of = extractvalue {<2 x i32>, <2 x i1>} %uaddo, 1
544 %of.zext = zext <2 x i1> %of to <2 x i32>
545 %ret = add <2 x i32> %add, %of.zext
549 define i8 @s_uaddo_i8(i8 %a, i8 %b) {
550 ; GFX7-LABEL: s_uaddo_i8:
552 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
553 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
554 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
555 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
556 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0
557 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
558 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
559 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
560 ; GFX7-NEXT: s_setpc_b64 s[30:31]
562 ; GFX8-LABEL: s_uaddo_i8:
564 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
566 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
567 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
568 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0
569 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
570 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
571 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
572 ; GFX8-NEXT: s_setpc_b64 s[30:31]
574 ; GFX9-LABEL: s_uaddo_i8:
576 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577 ; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
578 ; GFX9-NEXT: v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
579 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
580 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
581 ; GFX9-NEXT: s_setpc_b64 s[30:31]
582 %uaddo = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
583 %add = extractvalue {i8, i1} %uaddo, 0
584 %of = extractvalue {i8, i1} %uaddo, 1
585 %of.zext = zext i1 %of to i8
586 %ret = add i8 %add, %of.zext
590 define i7 @s_uaddo_i7(i7 %a, i7 %b) {
591 ; GFX7-LABEL: s_uaddo_i7:
593 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0
595 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1
596 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
597 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0
598 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
599 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
600 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
601 ; GFX7-NEXT: s_setpc_b64 s[30:31]
603 ; GFX8-LABEL: s_uaddo_i7:
605 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
606 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0
607 ; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1
608 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
609 ; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0
610 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
611 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
612 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
613 ; GFX8-NEXT: s_setpc_b64 s[30:31]
615 ; GFX9-LABEL: s_uaddo_i7:
617 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0
619 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1
620 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
621 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0
622 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
623 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
624 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
625 ; GFX9-NEXT: s_setpc_b64 s[30:31]
626 %uaddo = call {i7, i1} @llvm.uadd.with.overflow.i7(i7 %a, i7 %b)
627 %add = extractvalue {i7, i1} %uaddo, 0
628 %of = extractvalue {i7, i1} %uaddo, 1
629 %of.zext = zext i1 %of to i7
630 %ret = add i7 %add, %of.zext
634 define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
635 ; GFX7-LABEL: s_saddo_i32:
637 ; GFX7-NEXT: s_add_i32 s2, s0, s1
638 ; GFX7-NEXT: s_cmp_lt_i32 s2, s0
639 ; GFX7-NEXT: s_cselect_b32 s0, 1, 0
640 ; GFX7-NEXT: s_cmp_lt_i32 s1, 0
641 ; GFX7-NEXT: s_cselect_b32 s1, 1, 0
642 ; GFX7-NEXT: s_xor_b32 s0, s1, s0
643 ; GFX7-NEXT: s_and_b32 s0, s0, 1
644 ; GFX7-NEXT: s_add_i32 s0, s2, s0
645 ; GFX7-NEXT: ; return to shader part epilog
647 ; GFX8-LABEL: s_saddo_i32:
649 ; GFX8-NEXT: s_add_i32 s2, s0, s1
650 ; GFX8-NEXT: s_cmp_lt_i32 s2, s0
651 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0
652 ; GFX8-NEXT: s_cmp_lt_i32 s1, 0
653 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0
654 ; GFX8-NEXT: s_xor_b32 s0, s1, s0
655 ; GFX8-NEXT: s_and_b32 s0, s0, 1
656 ; GFX8-NEXT: s_add_i32 s0, s2, s0
657 ; GFX8-NEXT: ; return to shader part epilog
659 ; GFX9-LABEL: s_saddo_i32:
661 ; GFX9-NEXT: s_add_i32 s2, s0, s1
662 ; GFX9-NEXT: s_cmp_lt_i32 s2, s0
663 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0
664 ; GFX9-NEXT: s_cmp_lt_i32 s1, 0
665 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0
666 ; GFX9-NEXT: s_xor_b32 s0, s1, s0
667 ; GFX9-NEXT: s_and_b32 s0, s0, 1
668 ; GFX9-NEXT: s_add_i32 s0, s2, s0
669 ; GFX9-NEXT: ; return to shader part epilog
670 %saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
671 %add = extractvalue {i32, i1} %saddo, 0
672 %of = extractvalue {i32, i1} %saddo, 1
673 %of.zext = zext i1 %of to i32
674 %ret = add i32 %add, %of.zext
678 define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
679 ; GFX7-LABEL: s_saddo_i64:
681 ; GFX7-NEXT: s_add_u32 s4, s0, s2
682 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
683 ; GFX7-NEXT: s_addc_u32 s5, s1, s3
684 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
685 ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
686 ; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
687 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
688 ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
689 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
690 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0
691 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
692 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
693 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
694 ; GFX7-NEXT: ; return to shader part epilog
696 ; GFX8-LABEL: s_saddo_i64:
698 ; GFX8-NEXT: s_add_u32 s4, s0, s2
699 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
700 ; GFX8-NEXT: s_addc_u32 s5, s1, s3
701 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
702 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
703 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
704 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
705 ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
706 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
707 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
708 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
709 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
710 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
711 ; GFX8-NEXT: ; return to shader part epilog
713 ; GFX9-LABEL: s_saddo_i64:
715 ; GFX9-NEXT: s_add_u32 s4, s0, s2
716 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
717 ; GFX9-NEXT: s_addc_u32 s5, s1, s3
718 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
719 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
720 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
721 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
722 ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
723 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
724 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
725 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
726 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
727 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
728 ; GFX9-NEXT: ; return to shader part epilog
729 %saddo = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
730 %add = extractvalue {i64, i1} %saddo, 0
731 %of = extractvalue {i64, i1} %saddo, 1
732 %of.zext = zext i1 %of to i64
733 %ret = add i64 %add, %of.zext
737 define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b) {
738 ; GFX7-LABEL: s_saddo_v2i32:
740 ; GFX7-NEXT: s_add_i32 s4, s0, s2
741 ; GFX7-NEXT: s_add_i32 s5, s1, s3
742 ; GFX7-NEXT: s_cmp_lt_i32 s4, s0
743 ; GFX7-NEXT: s_cselect_b32 s0, 1, 0
744 ; GFX7-NEXT: s_cmp_lt_i32 s5, s1
745 ; GFX7-NEXT: s_cselect_b32 s1, 1, 0
746 ; GFX7-NEXT: s_cmp_lt_i32 s2, 0
747 ; GFX7-NEXT: s_cselect_b32 s2, 1, 0
748 ; GFX7-NEXT: s_cmp_lt_i32 s3, 0
749 ; GFX7-NEXT: s_cselect_b32 s3, 1, 0
750 ; GFX7-NEXT: s_xor_b32 s0, s2, s0
751 ; GFX7-NEXT: s_xor_b32 s1, s3, s1
752 ; GFX7-NEXT: s_and_b32 s0, s0, 1
753 ; GFX7-NEXT: s_and_b32 s1, s1, 1
754 ; GFX7-NEXT: s_add_i32 s0, s4, s0
755 ; GFX7-NEXT: s_add_i32 s1, s5, s1
756 ; GFX7-NEXT: ; return to shader part epilog
758 ; GFX8-LABEL: s_saddo_v2i32:
760 ; GFX8-NEXT: s_add_i32 s4, s0, s2
761 ; GFX8-NEXT: s_add_i32 s5, s1, s3
762 ; GFX8-NEXT: s_cmp_lt_i32 s4, s0
763 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0
764 ; GFX8-NEXT: s_cmp_lt_i32 s5, s1
765 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0
766 ; GFX8-NEXT: s_cmp_lt_i32 s2, 0
767 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
768 ; GFX8-NEXT: s_cmp_lt_i32 s3, 0
769 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0
770 ; GFX8-NEXT: s_xor_b32 s0, s2, s0
771 ; GFX8-NEXT: s_xor_b32 s1, s3, s1
772 ; GFX8-NEXT: s_and_b32 s0, s0, 1
773 ; GFX8-NEXT: s_and_b32 s1, s1, 1
774 ; GFX8-NEXT: s_add_i32 s0, s4, s0
775 ; GFX8-NEXT: s_add_i32 s1, s5, s1
776 ; GFX8-NEXT: ; return to shader part epilog
778 ; GFX9-LABEL: s_saddo_v2i32:
780 ; GFX9-NEXT: s_add_i32 s4, s0, s2
781 ; GFX9-NEXT: s_add_i32 s5, s1, s3
782 ; GFX9-NEXT: s_cmp_lt_i32 s4, s0
783 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0
784 ; GFX9-NEXT: s_cmp_lt_i32 s5, s1
785 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0
786 ; GFX9-NEXT: s_cmp_lt_i32 s2, 0
787 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
788 ; GFX9-NEXT: s_cmp_lt_i32 s3, 0
789 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0
790 ; GFX9-NEXT: s_xor_b32 s0, s2, s0
791 ; GFX9-NEXT: s_xor_b32 s1, s3, s1
792 ; GFX9-NEXT: s_and_b32 s0, s0, 1
793 ; GFX9-NEXT: s_and_b32 s1, s1, 1
794 ; GFX9-NEXT: s_add_i32 s0, s4, s0
795 ; GFX9-NEXT: s_add_i32 s1, s5, s1
796 ; GFX9-NEXT: ; return to shader part epilog
797 %saddo = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
798 %add = extractvalue {<2 x i32>, <2 x i1>} %saddo, 0
799 %of = extractvalue {<2 x i32>, <2 x i1>} %saddo, 1
800 %of.zext = zext <2 x i1> %of to <2 x i32>
801 %ret = add <2 x i32> %add, %of.zext
805 define i8 @s_saddo_i8(i8 %a, i8 %b) {
806 ; GFX7-LABEL: s_saddo_i8:
808 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
809 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v1
810 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
811 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 8
812 ; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
813 ; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 8
814 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
815 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
816 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
817 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
818 ; GFX7-NEXT: s_setpc_b64 s[30:31]
820 ; GFX8-LABEL: s_saddo_i8:
822 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
824 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 8
825 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
826 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
827 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 8
828 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
829 ; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
830 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
831 ; GFX8-NEXT: v_add_u16_e32 v0, v2, v0
832 ; GFX8-NEXT: s_setpc_b64 s[30:31]
834 ; GFX9-LABEL: s_saddo_i8:
836 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
837 ; GFX9-NEXT: v_add_u16_e32 v2, v0, v1
838 ; GFX9-NEXT: v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
839 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
840 ; GFX9-NEXT: v_cmp_lt_i32_sdwa s[6:7], sext(v1), v0 src0_sel:BYTE_0 src1_sel:DWORD
841 ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
842 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
843 ; GFX9-NEXT: v_add_u16_e32 v0, v2, v0
844 ; GFX9-NEXT: s_setpc_b64 s[30:31]
845 %saddo = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
846 %add = extractvalue {i8, i1} %saddo, 0
847 %of = extractvalue {i8, i1} %saddo, 1
848 %of.zext = zext i1 %of to i8
849 %ret = add i8 %add, %of.zext
853 define i7 @s_saddo_i7(i7 %a, i7 %b) {
854 ; GFX7-LABEL: s_saddo_i7:
856 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
857 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v1
858 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 7
859 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 7
860 ; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
861 ; GFX7-NEXT: v_bfe_i32 v0, v1, 0, 7
862 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
863 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
864 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
865 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
866 ; GFX7-NEXT: s_setpc_b64 s[30:31]
868 ; GFX8-LABEL: s_saddo_i7:
870 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
871 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
872 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 7
873 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 7
874 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
875 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 7
876 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
877 ; GFX8-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
878 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
879 ; GFX8-NEXT: v_add_u16_e32 v0, v2, v0
880 ; GFX8-NEXT: s_setpc_b64 s[30:31]
882 ; GFX9-LABEL: s_saddo_i7:
884 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
885 ; GFX9-NEXT: v_add_u16_e32 v2, v0, v1
886 ; GFX9-NEXT: v_bfe_i32 v3, v2, 0, 7
887 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 7
888 ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0
889 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 7
890 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0
891 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
892 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
893 ; GFX9-NEXT: v_add_u16_e32 v0, v2, v0
894 ; GFX9-NEXT: s_setpc_b64 s[30:31]
895 %saddo = call {i7, i1} @llvm.sadd.with.overflow.i7(i7 %a, i7 %b)
896 %add = extractvalue {i7, i1} %saddo, 0
897 %of = extractvalue {i7, i1} %saddo, 1
898 %of.zext = zext i1 %of to i7
899 %ret = add i7 %add, %of.zext
903 define amdgpu_ps i32 @uaddo_i32_sv(i32 inreg %a, i32 %b) {
904 ; GFX7-LABEL: uaddo_i32_sv:
906 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
907 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
908 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
909 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
910 ; GFX7-NEXT: ; return to shader part epilog
912 ; GFX8-LABEL: uaddo_i32_sv:
914 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
915 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
916 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
917 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
918 ; GFX8-NEXT: ; return to shader part epilog
920 ; GFX9-LABEL: uaddo_i32_sv:
922 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
923 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
924 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
925 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
926 ; GFX9-NEXT: ; return to shader part epilog
927 %uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
928 %add = extractvalue {i32, i1} %uaddo, 0
929 %of = extractvalue {i32, i1} %uaddo, 1
930 %of.zext = zext i1 %of to i32
931 %ret = add i32 %add, %of.zext
935 define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
936 ; GFX7-LABEL: uaddo_i16_sv:
938 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
939 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
940 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
941 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
942 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
943 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
944 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
945 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
946 ; GFX7-NEXT: ; return to shader part epilog
948 ; GFX8-LABEL: uaddo_i16_sv:
950 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
951 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
952 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
953 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v0
954 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
955 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
956 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
957 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
958 ; GFX8-NEXT: ; return to shader part epilog
960 ; GFX9-LABEL: uaddo_i16_sv:
962 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
963 ; GFX9-NEXT: v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
964 ; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0
965 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
966 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
967 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
968 ; GFX9-NEXT: ; return to shader part epilog
969 %uaddo = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
970 %add = extractvalue {i16, i1} %uaddo, 0
971 %of = extractvalue {i16, i1} %uaddo, 1
972 %of.zext = zext i1 %of to i16
973 %ret = add i16 %add, %of.zext
977 define amdgpu_ps i32 @saddo_i32_sv(i32 inreg %a, i32 %b) {
978 ; GFX7-LABEL: saddo_i32_sv:
980 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v0
981 ; GFX7-NEXT: v_cmp_gt_i32_e32 vcc, s0, v1
982 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v0
983 ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
984 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
985 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
986 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
987 ; GFX7-NEXT: ; return to shader part epilog
989 ; GFX8-LABEL: saddo_i32_sv:
991 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v0
992 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, s0, v1
993 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v0
994 ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
995 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
996 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
997 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
998 ; GFX8-NEXT: ; return to shader part epilog
1000 ; GFX9-LABEL: saddo_i32_sv:
1002 ; GFX9-NEXT: v_add_u32_e32 v1, s0, v0
1003 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, s0, v1
1004 ; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v0
1005 ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
1006 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1007 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
1008 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1009 ; GFX9-NEXT: ; return to shader part epilog
1010 %saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
1011 %add = extractvalue {i32, i1} %saddo, 0
1012 %of = extractvalue {i32, i1} %saddo, 1
1013 %of.zext = zext i1 %of to i32
1014 %ret = add i32 %add, %of.zext
1018 define amdgpu_ps i16 @saddo_i16_sv(i16 inreg %a, i16 %b) {
1019 ; GFX7-LABEL: saddo_i16_sv:
1021 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v0
1022 ; GFX7-NEXT: v_bfe_i32 v2, v1, 0, 16
1023 ; GFX7-NEXT: s_sext_i32_i16 s0, s0
1024 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
1025 ; GFX7-NEXT: v_cmp_gt_i32_e32 vcc, s0, v2
1026 ; GFX7-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v0
1027 ; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
1028 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1029 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
1030 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
1031 ; GFX7-NEXT: ; return to shader part epilog
1033 ; GFX8-LABEL: saddo_i16_sv:
1035 ; GFX8-NEXT: v_add_u16_e32 v1, s0, v0
1036 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, s0, v1
1037 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[0:1], 0, v0
1038 ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
1039 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1040 ; GFX8-NEXT: v_add_u16_e32 v0, v1, v0
1041 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1042 ; GFX8-NEXT: ; return to shader part epilog
1044 ; GFX9-LABEL: saddo_i16_sv:
1046 ; GFX9-NEXT: v_add_u16_e32 v1, s0, v0
1047 ; GFX9-NEXT: v_cmp_gt_i16_e32 vcc, s0, v1
1048 ; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], 0, v0
1049 ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
1050 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1051 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
1052 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1053 ; GFX9-NEXT: ; return to shader part epilog
1054 %saddo = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a, i16 %b)
1055 %add = extractvalue {i16, i1} %saddo, 0
1056 %of = extractvalue {i16, i1} %saddo, 1
1057 %of.zext = zext i1 %of to i16
1058 %ret = add i16 %add, %of.zext
; Declarations of the per-type unsigned add-with-overflow intrinsics exercised
; by the u*_ tests above. Each returns {result, overflow-bit}.
1062 declare {i7, i1} @llvm.uadd.with.overflow.i7(i7 %a, i7 %b)
1063 declare {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
1064 declare {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
1065 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
1066 declare {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
1067 declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
; Signed add-with-overflow counterparts, used by the s*_ tests above.
1069 declare {i7, i1} @llvm.sadd.with.overflow.i7(i7 %a, i7 %b)
1070 declare {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
1071 declare {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a, i16 %b)
1072 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
1073 declare {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
1074 declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)