1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
; Scalar i8 case: forwards %lhs and %rhs to @llvm.usub.sat.i8.
; Expected code per the checks below:
;   - GFX6 checks mask both operands with 0xff, then use v_max_u32 followed
;     by v_sub_i32.
;   - GFX8 and GFX9 checks use a single v_sub_u16_sdwa with clamp that reads
;     BYTE_0 of each source.
;   - GFX10 checks mask both operands with 0xff, then use v_sub_nc_u16 clamp.
7 define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
8 ; GFX6-LABEL: v_usubsat_i8:
10 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GFX6-NEXT: s_movk_i32 s4, 0xff
12 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
13 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
14 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
15 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
16 ; GFX6-NEXT: s_setpc_b64 s[30:31]
18 ; GFX8-LABEL: v_usubsat_i8:
20 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
22 ; GFX8-NEXT: s_setpc_b64 s[30:31]
24 ; GFX9-LABEL: v_usubsat_i8:
26 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
28 ; GFX9-NEXT: s_setpc_b64 s[30:31]
30 ; GFX10-LABEL: v_usubsat_i8:
32 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
34 ; GFX10-NEXT: s_movk_i32 s4, 0xff
35 ; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
36 ; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
37 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
38 ; GFX10-NEXT: s_setpc_b64 s[30:31]
39 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
; Scalar i16 case: forwards %lhs and %rhs to @llvm.usub.sat.i16.
; Expected code per the checks below:
;   - GFX6 checks mask both operands with 0xffff, then use v_max_u32 followed
;     by v_sub_i32.
;   - GFX8 and GFX9 checks use a single v_sub_u16_e64 with clamp.
;   - GFX10 checks use a single v_sub_nc_u16 with clamp (no masking).
43 define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
44 ; GFX6-LABEL: v_usubsat_i16:
46 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
48 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
49 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
50 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
51 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
52 ; GFX6-NEXT: s_setpc_b64 s[30:31]
54 ; GFX8-LABEL: v_usubsat_i16:
56 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
58 ; GFX8-NEXT: s_setpc_b64 s[30:31]
60 ; GFX9-LABEL: v_usubsat_i16:
62 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
64 ; GFX9-NEXT: s_setpc_b64 s[30:31]
66 ; GFX10-LABEL: v_usubsat_i16:
68 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
70 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
71 ; GFX10-NEXT: s_setpc_b64 s[30:31]
72 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
; Scalar i32 case: forwards %lhs and %rhs to @llvm.usub.sat.i32.
; Expected code per the checks below:
;   - GFX6 checks use v_max_u32 followed by v_sub_i32.
;   - GFX8 checks use v_sub_u32_e64 with clamp; this form carries an extra
;     s[4:5] operand after the destination.
;   - GFX9 checks use v_sub_u32_e64 with clamp.
;   - GFX10 checks use v_sub_nc_u32_e64 with clamp.
76 define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
77 ; GFX6-LABEL: v_usubsat_i32:
79 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
81 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
82 ; GFX6-NEXT: s_setpc_b64 s[30:31]
84 ; GFX8-LABEL: v_usubsat_i32:
86 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
88 ; GFX8-NEXT: s_setpc_b64 s[30:31]
90 ; GFX9-LABEL: v_usubsat_i32:
92 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp
94 ; GFX9-NEXT: s_setpc_b64 s[30:31]
96 ; GFX10-LABEL: v_usubsat_i32:
98 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
100 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp
101 ; GFX10-NEXT: s_setpc_b64 s[30:31]
102 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
; <2 x i16> case: forwards %lhs and %rhs to @llvm.usub.sat.v2i16 and returns
; the result unchanged.
; Expected code per the checks below:
;   - GFX6 checks mask the operands with 0xffff, do a v_max_u32 / v_sub_i32
;     pair per element, then combine the two results with a shift-left by 16
;     and an OR, followed by a shift-right by 16 into v1.
;   - GFX8 checks use one v_sub_u16_sdwa (WORD_1 sources and destination) and
;     one v_sub_u16_e64, both with clamp, and OR the two results together.
;   - GFX9 and GFX10 checks use a single v_pk_sub_u16 with clamp.
106 define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
107 ; GFX6-LABEL: v_usubsat_v2i16:
109 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
111 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v3
112 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
113 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v4
114 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
115 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
116 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
117 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
118 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
119 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
120 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
121 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
122 ; GFX6-NEXT: s_setpc_b64 s[30:31]
124 ; GFX8-LABEL: v_usubsat_v2i16:
126 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GFX8-NEXT: v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
128 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
129 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
130 ; GFX8-NEXT: s_setpc_b64 s[30:31]
132 ; GFX9-LABEL: v_usubsat_v2i16:
134 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
136 ; GFX9-NEXT: s_setpc_b64 s[30:31]
138 ; GFX10-LABEL: v_usubsat_v2i16:
140 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
142 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
143 ; GFX10-NEXT: s_setpc_b64 s[30:31]
144 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
145 ret <2 x i16> %result
; <3 x i16> case: forwards %lhs and %rhs to @llvm.usub.sat.v3i16 and returns
; the result unchanged.
; Expected code per the checks below:
;   - GFX6 checks mask the operands with 0xffff, do a v_max_u32 / v_sub_i32
;     pair per element, and combine the results with a shift-left by 16, an
;     OR, and a v_alignbit_b32.
;   - GFX8 checks use one v_sub_u16_sdwa (WORD_1) and two v_sub_u16_e64, all
;     with clamp, plus one OR.
;   - GFX9 and GFX10 checks use two v_pk_sub_u16 with clamp; the two targets
;     list them in opposite order.
148 define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
149 ; GFX6-LABEL: v_usubsat_v3i16:
151 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
153 ; GFX6-NEXT: v_and_b32_e32 v6, s4, v4
154 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
155 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v6
156 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3
157 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
158 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v3
159 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
160 ; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
161 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
162 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
163 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
164 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
165 ; GFX6-NEXT: v_max_u32_e32 v1, v2, v5
166 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v1, v5
167 ; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
168 ; GFX6-NEXT: s_setpc_b64 s[30:31]
170 ; GFX8-LABEL: v_usubsat_v3i16:
172 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173 ; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
174 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp
175 ; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp
176 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
177 ; GFX8-NEXT: s_setpc_b64 s[30:31]
179 ; GFX9-LABEL: v_usubsat_v3i16:
181 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
183 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
184 ; GFX9-NEXT: s_setpc_b64 s[30:31]
186 ; GFX10-LABEL: v_usubsat_v3i16:
188 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
190 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
191 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
192 ; GFX10-NEXT: s_setpc_b64 s[30:31]
193 %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
194 ret <3 x i16> %result
; <4 x i16> case: forwards %lhs and %rhs to @llvm.usub.sat.v4i16, then
; bitcasts the <4 x i16> result to <2 x float> for the return value.
; Expected code per the checks below:
;   - GFX6 checks mask the operands with 0xffff, do a v_max_u32 / v_sub_i32
;     pair per element, and combine each pair of results with a shift-left by
;     16 and an OR.
;   - GFX8 checks use two v_sub_u16_sdwa (WORD_1) and two v_sub_u16_e64, all
;     with clamp, plus two ORs.
;   - GFX9 and GFX10 checks use two v_pk_sub_u16 with clamp.
197 define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
198 ; GFX6-LABEL: v_usubsat_v4i16:
200 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
202 ; GFX6-NEXT: v_and_b32_e32 v9, s4, v5
203 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
204 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v9
205 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v4
206 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
207 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v4
208 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
209 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v7
210 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3
211 ; GFX6-NEXT: v_and_b32_e32 v6, s4, v6
212 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
213 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
214 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
215 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
216 ; GFX6-NEXT: v_max_u32_e32 v1, v2, v6
217 ; GFX6-NEXT: v_max_u32_e32 v2, v3, v8
218 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
219 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
220 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
221 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
222 ; GFX6-NEXT: s_setpc_b64 s[30:31]
224 ; GFX8-LABEL: v_usubsat_v4i16:
226 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227 ; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
228 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp
229 ; GFX8-NEXT: v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
230 ; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp
231 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
232 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
233 ; GFX8-NEXT: s_setpc_b64 s[30:31]
235 ; GFX9-LABEL: v_usubsat_v4i16:
237 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
239 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
240 ; GFX9-NEXT: s_setpc_b64 s[30:31]
242 ; GFX10-LABEL: v_usubsat_v4i16:
244 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
246 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
247 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
248 ; GFX10-NEXT: s_setpc_b64 s[30:31]
249 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
250 %cast = bitcast <4 x i16> %result to <2 x float>
251 ret <2 x float> %cast
; <2 x i32> case: forwards %lhs and %rhs to @llvm.usub.sat.v2i32 and returns
; the result unchanged.
; Expected code per the checks below, one operation per element:
;   - GFX6 checks use v_max_u32 followed by v_sub_i32.
;   - GFX8 checks use v_sub_u32_e64 with clamp and an s[4:5] operand.
;   - GFX9 checks use v_sub_u32_e64 with clamp.
;   - GFX10 checks use v_sub_nc_u32_e64 with clamp.
254 define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
255 ; GFX6-LABEL: v_usubsat_v2i32:
257 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
259 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v3
260 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
261 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
262 ; GFX6-NEXT: s_setpc_b64 s[30:31]
264 ; GFX8-LABEL: v_usubsat_v2i32:
266 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
268 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
269 ; GFX8-NEXT: s_setpc_b64 s[30:31]
271 ; GFX9-LABEL: v_usubsat_v2i32:
273 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp
275 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp
276 ; GFX9-NEXT: s_setpc_b64 s[30:31]
278 ; GFX10-LABEL: v_usubsat_v2i32:
280 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
282 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp
283 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp
284 ; GFX10-NEXT: s_setpc_b64 s[30:31]
285 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
286 ret <2 x i32> %result
; <3 x i32> case: forwards %lhs and %rhs to @llvm.usub.sat.v3i32 and returns
; the result unchanged.
; Expected code per the checks below, one operation per element:
;   - GFX6 checks use v_max_u32 followed by v_sub_i32.
;   - GFX8 checks use v_sub_u32_e64 with clamp and an s[4:5] operand.
;   - GFX9 checks use v_sub_u32_e64 with clamp.
;   - GFX10 checks use v_sub_nc_u32_e64 with clamp.
289 define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
290 ; GFX6-LABEL: v_usubsat_v3i32:
292 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v3
294 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v4
295 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v5
296 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
297 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
298 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
299 ; GFX6-NEXT: s_setpc_b64 s[30:31]
301 ; GFX8-LABEL: v_usubsat_v3i32:
303 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
305 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
306 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
307 ; GFX8-NEXT: s_setpc_b64 s[30:31]
309 ; GFX9-LABEL: v_usubsat_v3i32:
311 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp
313 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp
314 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp
315 ; GFX9-NEXT: s_setpc_b64 s[30:31]
317 ; GFX10-LABEL: v_usubsat_v3i32:
319 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
321 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp
322 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp
323 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp
324 ; GFX10-NEXT: s_setpc_b64 s[30:31]
325 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
326 ret <3 x i32> %result
; <4 x i32> case: forwards %lhs and %rhs to @llvm.usub.sat.v4i32 and returns
; the result unchanged.
; Expected code per the checks below, one operation per element:
;   - GFX6 checks use v_max_u32 followed by v_sub_i32.
;   - GFX8 checks use v_sub_u32_e64 with clamp and an s[4:5] operand.
;   - GFX9 checks use v_sub_u32_e64 with clamp.
;   - GFX10 checks use v_sub_nc_u32_e64 with clamp.
329 define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
330 ; GFX6-LABEL: v_usubsat_v4i32:
332 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
333 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v4
334 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v5
335 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v6
336 ; GFX6-NEXT: v_max_u32_e32 v3, v3, v7
337 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
338 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
339 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
340 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
341 ; GFX6-NEXT: s_setpc_b64 s[30:31]
343 ; GFX8-LABEL: v_usubsat_v4i32:
345 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
347 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
348 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
349 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
350 ; GFX8-NEXT: s_setpc_b64 s[30:31]
352 ; GFX9-LABEL: v_usubsat_v4i32:
354 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp
356 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp
357 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp
358 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp
359 ; GFX9-NEXT: s_setpc_b64 s[30:31]
361 ; GFX10-LABEL: v_usubsat_v4i32:
363 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
365 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp
366 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp
367 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp
368 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp
369 ; GFX10-NEXT: s_setpc_b64 s[30:31]
370 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
371 ret <4 x i32> %result
; <8 x i32> case: forwards %lhs and %rhs to @llvm.usub.sat.v8i32 and returns
; the result unchanged.
; Expected code per the checks below, one operation per element (v0-v7
; against v8-v15):
;   - GFX6 checks use v_max_u32 followed by v_sub_i32.
;   - GFX8 checks use v_sub_u32_e64 with clamp and an s[4:5] operand.
;   - GFX9 checks use v_sub_u32_e64 with clamp.
;   - GFX10 checks use v_sub_nc_u32_e64 with clamp.
374 define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
375 ; GFX6-LABEL: v_usubsat_v8i32:
377 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v8
379 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v9
380 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v10
381 ; GFX6-NEXT: v_max_u32_e32 v3, v3, v11
382 ; GFX6-NEXT: v_max_u32_e32 v4, v4, v12
383 ; GFX6-NEXT: v_max_u32_e32 v5, v5, v13
384 ; GFX6-NEXT: v_max_u32_e32 v6, v6, v14
385 ; GFX6-NEXT: v_max_u32_e32 v7, v7, v15
386 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
387 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
388 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
389 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v11
390 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12
391 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v13
392 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v14
393 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v15
394 ; GFX6-NEXT: s_setpc_b64 s[30:31]
396 ; GFX8-LABEL: v_usubsat_v8i32:
398 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
399 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
400 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
401 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
402 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v11 clamp
403 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v12 clamp
404 ; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
405 ; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
406 ; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
407 ; GFX8-NEXT: s_setpc_b64 s[30:31]
409 ; GFX9-LABEL: v_usubsat_v8i32:
411 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v8 clamp
413 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v9 clamp
414 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v10 clamp
415 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v11 clamp
416 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v12 clamp
417 ; GFX9-NEXT: v_sub_u32_e64 v5, v5, v13 clamp
418 ; GFX9-NEXT: v_sub_u32_e64 v6, v6, v14 clamp
419 ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v15 clamp
420 ; GFX9-NEXT: s_setpc_b64 s[30:31]
422 ; GFX10-LABEL: v_usubsat_v8i32:
424 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
426 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v8 clamp
427 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v9 clamp
428 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v10 clamp
429 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v11 clamp
430 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v12 clamp
431 ; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v13 clamp
432 ; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v14 clamp
433 ; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v15 clamp
434 ; GFX10-NEXT: s_setpc_b64 s[30:31]
435 %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
436 ret <8 x i32> %result
; <16 x i32> case: forwards %lhs and %rhs to @llvm.usub.sat.v16i32 and
; returns the result unchanged.
; Expected code per the checks below, one operation per element (v0-v15
; against v16-v31):
;   - GFX6 checks use v_max_u32 followed by v_sub_i32.
;   - GFX8 checks use v_sub_u32_e64 with clamp and an s[4:5] operand.
;   - GFX9 checks use v_sub_u32_e64 with clamp.
;   - GFX10 checks use v_sub_nc_u32_e64 with clamp.
439 define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
440 ; GFX6-LABEL: v_usubsat_v16i32:
442 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v16
444 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v17
445 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v18
446 ; GFX6-NEXT: v_max_u32_e32 v3, v3, v19
447 ; GFX6-NEXT: v_max_u32_e32 v4, v4, v20
448 ; GFX6-NEXT: v_max_u32_e32 v5, v5, v21
449 ; GFX6-NEXT: v_max_u32_e32 v6, v6, v22
450 ; GFX6-NEXT: v_max_u32_e32 v7, v7, v23
451 ; GFX6-NEXT: v_max_u32_e32 v8, v8, v24
452 ; GFX6-NEXT: v_max_u32_e32 v9, v9, v25
453 ; GFX6-NEXT: v_max_u32_e32 v10, v10, v26
454 ; GFX6-NEXT: v_max_u32_e32 v11, v11, v27
455 ; GFX6-NEXT: v_max_u32_e32 v12, v12, v28
456 ; GFX6-NEXT: v_max_u32_e32 v13, v13, v29
457 ; GFX6-NEXT: v_max_u32_e32 v14, v14, v30
458 ; GFX6-NEXT: v_max_u32_e32 v15, v15, v31
459 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
460 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17
461 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v18
462 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v19
463 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v20
464 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v21
465 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v22
466 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v23
467 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v24
468 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v25
469 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v26
470 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v27
471 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v28
472 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v29
473 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v30
474 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v31
475 ; GFX6-NEXT: s_setpc_b64 s[30:31]
477 ; GFX8-LABEL: v_usubsat_v16i32:
479 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
480 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
481 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
482 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
483 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
484 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
485 ; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
486 ; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
487 ; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
488 ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
489 ; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
490 ; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
491 ; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
492 ; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
493 ; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
494 ; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
495 ; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v31 clamp
496 ; GFX8-NEXT: s_setpc_b64 s[30:31]
498 ; GFX9-LABEL: v_usubsat_v16i32:
500 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp
502 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp
503 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp
504 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp
505 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp
506 ; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp
507 ; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp
508 ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp
509 ; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp
510 ; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp
511 ; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp
512 ; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp
513 ; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp
514 ; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp
515 ; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp
516 ; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp
517 ; GFX9-NEXT: s_setpc_b64 s[30:31]
519 ; GFX10-LABEL: v_usubsat_v16i32:
521 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
523 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
524 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
525 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
526 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
527 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
528 ; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
529 ; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
530 ; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
531 ; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
532 ; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
533 ; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
534 ; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
535 ; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
536 ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
537 ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
538 ; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
539 ; GFX10-NEXT: s_setpc_b64 s[30:31]
540 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
541 ret <16 x i32> %result
; Scalar i64 case: forwards %lhs and %rhs to @llvm.usub.sat.i64.
; None of the four check prefixes expects a clamped subtract here. Each one
; expects the same five-step shape:
;   1. a subtract of v2 from v0 that writes vcc (vcc_lo in the GFX10 checks),
;   2. a subtract-with-borrow of v3 from v1 that consumes it,
;   3. v_cmp_gt_u64 comparing the difference in v[2:3] against v[0:1],
;   4. two v_cndmask_b32 that pick between each half of the difference and 0
;      based on that compare.
; Only the subtract mnemonics differ between targets.
545 define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
546 ; GFX6-LABEL: v_usubsat_i64:
548 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
550 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
551 ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
552 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
553 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
554 ; GFX6-NEXT: s_setpc_b64 s[30:31]
556 ; GFX8-LABEL: v_usubsat_i64:
558 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
559 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
560 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
561 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
562 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
563 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
564 ; GFX8-NEXT: s_setpc_b64 s[30:31]
566 ; GFX9-LABEL: v_usubsat_i64:
568 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
570 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
571 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
572 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
573 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
574 ; GFX9-NEXT: s_setpc_b64 s[30:31]
576 ; GFX10-LABEL: v_usubsat_i64:
578 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
579 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
580 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
581 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
582 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
583 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
584 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
585 ; GFX10-NEXT: s_setpc_b64 s[30:31]
586 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
; Declarations of the llvm.usub.sat intrinsic overloads called by the test
; functions in this file, one per tested type (i8, i16, i32, i64 and the
; i16/i32 vector widths). Every declaration references attribute group #0,
; defined on the last line.
590 declare i8 @llvm.usub.sat.i8(i8, i8) #0
591 declare i16 @llvm.usub.sat.i16(i16, i16) #0
592 declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
593 declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
594 declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
595 declare i32 @llvm.usub.sat.i32(i32, i32) #0
596 declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
597 declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
598 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
599 declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0
600 declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
601 declare i64 @llvm.usub.sat.i64(i64, i64) #0
603 attributes #0 = { nounwind readnone speculatable willreturn }