1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
; Scalar i8 unsigned saturating subtract through @llvm.usub.sat.i8.
; Lowering expected by the assertions below, per check prefix -
;   GFX6 masks both operands to 8 bits, then emits v_max_u32 followed by v_sub_i32.
;   GFX8 and GFX9 emit one clamped v_sub_u16_sdwa that reads BYTE_0 of each source.
;   GFX10 and GFX11-FAKE16 mask both operands, then emit v_sub_nc_u16 with clamp.
;   GFX11-TRUE16 masks, copies v1.l into v0.h, then subtracts the two 16-bit halves.
9 define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
10 ; GFX6-LABEL: v_usubsat_i8:
12 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
14 ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
15 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
17 ; GFX6-NEXT: s_setpc_b64 s[30:31]
19 ; GFX8-LABEL: v_usubsat_i8:
21 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
23 ; GFX8-NEXT: s_setpc_b64 s[30:31]
25 ; GFX9-LABEL: v_usubsat_i8:
27 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28 ; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
29 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31 ; GFX10-LABEL: v_usubsat_i8:
33 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
35 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
36 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
37 ; GFX10-NEXT: s_setpc_b64 s[30:31]
39 ; GFX11-TRUE16-LABEL: v_usubsat_i8:
40 ; GFX11-TRUE16: ; %bb.0:
41 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
43 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
44 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
45 ; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
46 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
48 ; GFX11-FAKE16-LABEL: v_usubsat_i8:
49 ; GFX11-FAKE16: ; %bb.0:
50 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
53 ; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
54 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
55 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
; Scalar i16 unsigned saturating subtract through @llvm.usub.sat.i16.
; Lowering expected by the assertions below, per check prefix -
;   GFX6 masks both operands to 16 bits, then emits v_max_u32 followed by v_sub_i32.
;   GFX8 and GFX9 emit a single v_sub_u16_e64 with clamp.
;   GFX10 and GFX11-FAKE16 emit a single v_sub_nc_u16 with clamp (no masking).
;   GFX11-TRUE16 copies v1.l into v0.h first and subtracts the two 16-bit halves.
59 define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
60 ; GFX6-LABEL: v_usubsat_i16:
62 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
64 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
65 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
66 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
67 ; GFX6-NEXT: s_setpc_b64 s[30:31]
69 ; GFX8-LABEL: v_usubsat_i16:
71 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
73 ; GFX8-NEXT: s_setpc_b64 s[30:31]
75 ; GFX9-LABEL: v_usubsat_i16:
77 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
79 ; GFX9-NEXT: s_setpc_b64 s[30:31]
81 ; GFX10-LABEL: v_usubsat_i16:
83 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
85 ; GFX10-NEXT: s_setpc_b64 s[30:31]
87 ; GFX11-TRUE16-LABEL: v_usubsat_i16:
88 ; GFX11-TRUE16: ; %bb.0:
89 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
91 ; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
92 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
94 ; GFX11-FAKE16-LABEL: v_usubsat_i16:
95 ; GFX11-FAKE16: ; %bb.0:
96 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97 ; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
98 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
99 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
; Pattern-recognition test. No intrinsic is called here - the IR spells out
; (ashr x, 15) & (x xor 0x8000) by hand.
; The assertions expect GFX8 and newer to select a clamped 16-bit subtract of
; 0x8000 (held in s4 on GFX8 and GFX9, an inline literal on GFX10 and GFX11),
; while GFX6 keeps the literal bfe / ashr / xor / and sequence.
103 define i16 @usubsat_as_bithack_i16(i16 %x) {
104 ; GFX6-LABEL: usubsat_as_bithack_i16:
106 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
108 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
109 ; GFX6-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
110 ; GFX6-NEXT: v_and_b32_e32 v0, v1, v0
111 ; GFX6-NEXT: s_setpc_b64 s[30:31]
113 ; GFX8-LABEL: usubsat_as_bithack_i16:
115 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GFX8-NEXT: s_movk_i32 s4, 0x8000
117 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
118 ; GFX8-NEXT: s_setpc_b64 s[30:31]
120 ; GFX9-LABEL: usubsat_as_bithack_i16:
122 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
124 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
125 ; GFX9-NEXT: s_setpc_b64 s[30:31]
127 ; GFX10-LABEL: usubsat_as_bithack_i16:
129 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
131 ; GFX10-NEXT: s_setpc_b64 s[30:31]
133 ; GFX11-TRUE16-LABEL: usubsat_as_bithack_i16:
134 ; GFX11-TRUE16: ; %bb.0:
135 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136 ; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
137 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
139 ; GFX11-FAKE16-LABEL: usubsat_as_bithack_i16:
140 ; GFX11-FAKE16: ; %bb.0:
141 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142 ; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
143 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; %signsplat replicates the sign bit of %x across all 16 bits.
144 %signsplat = ashr i16 %x, 15
; 32768 is 0x8000, so the xor toggles only the top bit of %x.
145 %flipsign = xor i16 %x, 32768
146 %result = and i16 %signsplat, %flipsign
; Variant of usubsat_as_bithack_i16 that toggles the top bit with an add of
; 0x8000 instead of an xor - (ashr x, 15) & (x + 0x8000).
; The assertions expect the same result as the xor form on GFX8 and newer (a
; clamped 16-bit subtract of 0x8000); GFX6 keeps the sequence, using v_add_i32.
150 define i16 @usubsat_as_bithack2_i16(i16 %x) {
151 ; GFX6-LABEL: usubsat_as_bithack2_i16:
153 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
155 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
156 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xffff8000, v0
157 ; GFX6-NEXT: v_and_b32_e32 v0, v1, v0
158 ; GFX6-NEXT: s_setpc_b64 s[30:31]
160 ; GFX8-LABEL: usubsat_as_bithack2_i16:
162 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163 ; GFX8-NEXT: s_movk_i32 s4, 0x8000
164 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
165 ; GFX8-NEXT: s_setpc_b64 s[30:31]
167 ; GFX9-LABEL: usubsat_as_bithack2_i16:
169 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
170 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
171 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
172 ; GFX9-NEXT: s_setpc_b64 s[30:31]
174 ; GFX10-LABEL: usubsat_as_bithack2_i16:
176 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
178 ; GFX10-NEXT: s_setpc_b64 s[30:31]
180 ; GFX11-TRUE16-LABEL: usubsat_as_bithack2_i16:
181 ; GFX11-TRUE16: ; %bb.0:
182 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183 ; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
184 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
186 ; GFX11-FAKE16-LABEL: usubsat_as_bithack2_i16:
187 ; GFX11-FAKE16: ; %bb.0:
188 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189 ; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
190 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
191 %signsplat = ashr i16 %x, 15
; Adding 0x8000 in i16 wraps, so only the top bit of %x changes.
192 %flipsign = add i16 %x, 32768
193 %result = and i16 %signsplat, %flipsign
; Same computation as usubsat_as_bithack2_i16 but with the operands of the
; final 'and' swapped (%flipsign first), checking that recognition of the
; idiom does not depend on operand order.
; GFX8 and newer are still expected to emit a clamped subtract of 0x8000; on
; GFX6 only the operand order of the final v_and_b32 differs.
197 define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
198 ; GFX6-LABEL: usubsat_as_bithack_commute_i16:
200 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
202 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
203 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xffff8000, v0
204 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
205 ; GFX6-NEXT: s_setpc_b64 s[30:31]
207 ; GFX8-LABEL: usubsat_as_bithack_commute_i16:
209 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210 ; GFX8-NEXT: s_movk_i32 s4, 0x8000
211 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
212 ; GFX8-NEXT: s_setpc_b64 s[30:31]
214 ; GFX9-LABEL: usubsat_as_bithack_commute_i16:
216 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
218 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
219 ; GFX9-NEXT: s_setpc_b64 s[30:31]
221 ; GFX10-LABEL: usubsat_as_bithack_commute_i16:
223 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
225 ; GFX10-NEXT: s_setpc_b64 s[30:31]
227 ; GFX11-TRUE16-LABEL: usubsat_as_bithack_commute_i16:
228 ; GFX11-TRUE16: ; %bb.0:
229 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230 ; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp
231 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
233 ; GFX11-FAKE16-LABEL: usubsat_as_bithack_commute_i16:
234 ; GFX11-FAKE16: ; %bb.0:
235 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236 ; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
237 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
238 %signsplat = ashr i16 %x, 15
239 %flipsign = add i16 %x, 32768
; Commuted relative to usubsat_as_bithack2_i16, which has %signsplat first.
240 %result = and i16 %flipsign, %signsplat
; Scalar i32 unsigned saturating subtract through @llvm.usub.sat.i32.
; Lowering expected by the assertions below, per check prefix -
;   GFX6 emits v_max_u32 (lhs, rhs) followed by v_sub_i32 of rhs, with no clamp.
;   GFX8 emits v_sub_u32_e64 with clamp; that form also names s[4:5]
;     (presumably the carry-out destination - confirm against the ISA manual).
;   GFX9 emits v_sub_u32_e64 with clamp.
;   GFX10PLUS (shared by the gfx1010 and gfx1100 runs) emits v_sub_nc_u32_e64 with clamp.
244 define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
245 ; GFX6-LABEL: v_usubsat_i32:
247 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1
249 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
250 ; GFX6-NEXT: s_setpc_b64 s[30:31]
252 ; GFX8-LABEL: v_usubsat_i32:
254 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
256 ; GFX8-NEXT: s_setpc_b64 s[30:31]
258 ; GFX9-LABEL: v_usubsat_i32:
260 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
261 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp
262 ; GFX9-NEXT: s_setpc_b64 s[30:31]
264 ; GFX10PLUS-LABEL: v_usubsat_i32:
265 ; GFX10PLUS: ; %bb.0:
266 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp
268 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
269 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
; <2 x i16> unsigned saturating subtract through @llvm.usub.sat.v2i16.
; Lowering expected by the assertions below, per check prefix -
;   GFX6 receives each element in its own VGPR (v0..v3), masks to 16 bits,
;     does max + sub per element, then packs the pair with shift + or.
;   GFX8 uses a clamped sdwa subtract for the high word (WORD_1), a clamped
;     e64 subtract for the low word, and ors the halves together.
;   GFX9 and GFX10PLUS emit a single packed v_pk_sub_u16 with clamp.
273 define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
274 ; GFX6-LABEL: v_usubsat_v2i16:
276 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
278 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
279 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
280 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
281 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v3
282 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
283 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
284 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
285 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
286 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
287 ; GFX6-NEXT: s_setpc_b64 s[30:31]
289 ; GFX8-LABEL: v_usubsat_v2i16:
291 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292 ; GFX8-NEXT: v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
293 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
294 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
295 ; GFX8-NEXT: s_setpc_b64 s[30:31]
297 ; GFX9-LABEL: v_usubsat_v2i16:
299 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
301 ; GFX9-NEXT: s_setpc_b64 s[30:31]
303 ; GFX10PLUS-LABEL: v_usubsat_v2i16:
304 ; GFX10PLUS: ; %bb.0:
305 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
307 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
308 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
309 ret <2 x i16> %result
; <3 x i16> (odd element count) unsigned saturating subtract through
; @llvm.usub.sat.v3i16.
; Lowering expected by the assertions below, per check prefix -
;   GFX6 handles the three elements one at a time (mask, max, sub) and
;     repacks the results with shift / or / v_alignbit_b32.
;   GFX8 uses one sdwa subtract for the high word of v0, e64 subtracts for
;     the low word of v0 and for v1, and ors the v0 halves together.
;   GFX9 and GFX10PLUS emit two packed v_pk_sub_u16 with clamp (on v0 and v1);
;     the two prefixes differ only in the order of those two instructions.
312 define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
313 ; GFX6-LABEL: v_usubsat_v3i16:
315 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
316 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v4
317 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
318 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
319 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
320 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v6
321 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v3
322 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
323 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
324 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
325 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
326 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
327 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
328 ; GFX6-NEXT: v_max_u32_e32 v1, v2, v5
329 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v1, v5
330 ; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
331 ; GFX6-NEXT: s_setpc_b64 s[30:31]
333 ; GFX8-LABEL: v_usubsat_v3i16:
335 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336 ; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
337 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp
338 ; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp
339 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
340 ; GFX8-NEXT: s_setpc_b64 s[30:31]
342 ; GFX9-LABEL: v_usubsat_v3i16:
344 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
346 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
347 ; GFX9-NEXT: s_setpc_b64 s[30:31]
349 ; GFX10PLUS-LABEL: v_usubsat_v3i16:
350 ; GFX10PLUS: ; %bb.0:
351 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
353 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
354 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
355 %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
356 ret <3 x i16> %result
; <4 x i16> unsigned saturating subtract through @llvm.usub.sat.v4i16.
; The result is bitcast to <2 x float> before being returned, so the function
; returns the packed 64-bit value as two 32-bit lanes.
; Lowering expected by the assertions below, per check prefix -
;   GFX6 handles the four elements one at a time (mask, max, sub) and packs
;     each pair with shift + or.
;   GFX8 uses a clamped sdwa subtract (high word) plus a clamped e64 subtract
;     (low word) for each of the two registers, then ors the halves.
;   GFX9 and GFX10PLUS emit two packed v_pk_sub_u16 with clamp.
359 define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
360 ; GFX6-LABEL: v_usubsat_v4i16:
362 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v5
364 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
365 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
366 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
367 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v9
368 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v4
369 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
370 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v7
371 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
372 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
373 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
374 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
375 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
376 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
377 ; GFX6-NEXT: v_max_u32_e32 v1, v2, v6
378 ; GFX6-NEXT: v_max_u32_e32 v2, v3, v8
379 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
380 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
381 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
382 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
383 ; GFX6-NEXT: s_setpc_b64 s[30:31]
385 ; GFX8-LABEL: v_usubsat_v4i16:
387 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388 ; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
389 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp
390 ; GFX8-NEXT: v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
391 ; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp
392 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
393 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
394 ; GFX8-NEXT: s_setpc_b64 s[30:31]
396 ; GFX9-LABEL: v_usubsat_v4i16:
398 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
399 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
400 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
401 ; GFX9-NEXT: s_setpc_b64 s[30:31]
403 ; GFX10PLUS-LABEL: v_usubsat_v4i16:
404 ; GFX10PLUS: ; %bb.0:
405 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
407 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
408 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
409 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
410 %cast = bitcast <4 x i16> %result to <2 x float>
411 ret <2 x float> %cast
; <2 x i32> unsigned saturating subtract through @llvm.usub.sat.v2i32.
; The assertions expect the vector to be handled element by element - lhs in
; v0..v1, rhs in v2..v3. GFX6 emits a v_max_u32 + v_sub_i32 pair per element;
; GFX8, GFX9 and GFX10PLUS emit one clamped subtract per element.
414 define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
415 ; GFX6-LABEL: v_usubsat_v2i32:
417 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
418 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
419 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v3
420 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
421 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
422 ; GFX6-NEXT: s_setpc_b64 s[30:31]
424 ; GFX8-LABEL: v_usubsat_v2i32:
426 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
428 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
429 ; GFX8-NEXT: s_setpc_b64 s[30:31]
431 ; GFX9-LABEL: v_usubsat_v2i32:
433 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp
435 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp
436 ; GFX9-NEXT: s_setpc_b64 s[30:31]
438 ; GFX10PLUS-LABEL: v_usubsat_v2i32:
439 ; GFX10PLUS: ; %bb.0:
440 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp
442 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp
443 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
444 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
445 ret <2 x i32> %result
; <3 x i32> unsigned saturating subtract through @llvm.usub.sat.v3i32.
; The assertions expect the vector to be handled element by element - lhs in
; v0..v2, rhs in v3..v5. GFX6 emits a v_max_u32 + v_sub_i32 pair per element;
; GFX8, GFX9 and GFX10PLUS emit one clamped subtract per element.
448 define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
449 ; GFX6-LABEL: v_usubsat_v3i32:
451 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
452 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v3
453 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v4
454 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v5
455 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
456 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
457 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
458 ; GFX6-NEXT: s_setpc_b64 s[30:31]
460 ; GFX8-LABEL: v_usubsat_v3i32:
462 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
464 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
465 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
466 ; GFX8-NEXT: s_setpc_b64 s[30:31]
468 ; GFX9-LABEL: v_usubsat_v3i32:
470 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
471 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp
472 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp
473 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp
474 ; GFX9-NEXT: s_setpc_b64 s[30:31]
476 ; GFX10PLUS-LABEL: v_usubsat_v3i32:
477 ; GFX10PLUS: ; %bb.0:
478 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp
480 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp
481 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp
482 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
483 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
484 ret <3 x i32> %result
; <4 x i32> unsigned saturating subtract through @llvm.usub.sat.v4i32.
; The assertions expect the vector to be handled element by element - lhs in
; v0..v3, rhs in v4..v7. GFX6 emits a v_max_u32 + v_sub_i32 pair per element;
; GFX8, GFX9 and GFX10PLUS emit one clamped subtract per element.
487 define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
488 ; GFX6-LABEL: v_usubsat_v4i32:
490 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v4
492 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v5
493 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v6
494 ; GFX6-NEXT: v_max_u32_e32 v3, v3, v7
495 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
496 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
497 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
498 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
499 ; GFX6-NEXT: s_setpc_b64 s[30:31]
501 ; GFX8-LABEL: v_usubsat_v4i32:
503 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
504 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
505 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
506 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
507 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
508 ; GFX8-NEXT: s_setpc_b64 s[30:31]
510 ; GFX9-LABEL: v_usubsat_v4i32:
512 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
513 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp
514 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp
515 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp
516 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp
517 ; GFX9-NEXT: s_setpc_b64 s[30:31]
519 ; GFX10PLUS-LABEL: v_usubsat_v4i32:
520 ; GFX10PLUS: ; %bb.0:
521 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp
523 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp
524 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp
525 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp
526 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
527 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
528 ret <4 x i32> %result
; <8 x i32> unsigned saturating subtract through @llvm.usub.sat.v8i32.
; The assertions expect the vector to be handled element by element - lhs in
; v0..v7, rhs in v8..v15. GFX6 emits a v_max_u32 + v_sub_i32 pair per element;
; GFX8, GFX9 and GFX10PLUS emit one clamped subtract per element.
531 define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
532 ; GFX6-LABEL: v_usubsat_v8i32:
534 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v8
536 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v9
537 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v10
538 ; GFX6-NEXT: v_max_u32_e32 v3, v3, v11
539 ; GFX6-NEXT: v_max_u32_e32 v4, v4, v12
540 ; GFX6-NEXT: v_max_u32_e32 v5, v5, v13
541 ; GFX6-NEXT: v_max_u32_e32 v6, v6, v14
542 ; GFX6-NEXT: v_max_u32_e32 v7, v7, v15
543 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
544 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
545 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
546 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v11
547 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12
548 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v13
549 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v14
550 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v15
551 ; GFX6-NEXT: s_setpc_b64 s[30:31]
553 ; GFX8-LABEL: v_usubsat_v8i32:
555 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
557 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
558 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
559 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v11 clamp
560 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v12 clamp
561 ; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
562 ; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
563 ; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
564 ; GFX8-NEXT: s_setpc_b64 s[30:31]
566 ; GFX9-LABEL: v_usubsat_v8i32:
568 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v8 clamp
570 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v9 clamp
571 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v10 clamp
572 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v11 clamp
573 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v12 clamp
574 ; GFX9-NEXT: v_sub_u32_e64 v5, v5, v13 clamp
575 ; GFX9-NEXT: v_sub_u32_e64 v6, v6, v14 clamp
576 ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v15 clamp
577 ; GFX9-NEXT: s_setpc_b64 s[30:31]
579 ; GFX10PLUS-LABEL: v_usubsat_v8i32:
580 ; GFX10PLUS: ; %bb.0:
581 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v8 clamp
583 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v9 clamp
584 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v10 clamp
585 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v11 clamp
586 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, v4, v12 clamp
587 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v5, v5, v13 clamp
588 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v6, v6, v14 clamp
589 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v7, v7, v15 clamp
590 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
591 %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
592 ret <8 x i32> %result
; <16 x i32> unsigned saturating subtract through @llvm.usub.sat.v16i32.
; This is the one case in the file where an argument does not arrive in a
; VGPR. The assertions show rhs elements 0..14 in v16..v30 and the last rhs
; element being loaded from memory addressed through s32 -
;   GFX6, GFX8, GFX9 and GFX10 use buffer_load_dword with s[0:3];
;   GFX11 uses scratch_load_b32.
; That differing load is why GFX10 and GFX11 have separate check prefixes
; here instead of sharing GFX10PLUS. Every target waits on vmcnt(0) before the
; final element is computed. GFX6, GFX8 and GFX9 reuse v16 for the loaded
; value after element 0 has been computed; GFX10 and GFX11 load into v31.
595 define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
596 ; GFX6-LABEL: v_usubsat_v16i32:
598 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
599 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v16
600 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
601 ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
602 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v17
603 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v18
604 ; GFX6-NEXT: v_max_u32_e32 v3, v3, v19
605 ; GFX6-NEXT: v_max_u32_e32 v4, v4, v20
606 ; GFX6-NEXT: v_max_u32_e32 v5, v5, v21
607 ; GFX6-NEXT: v_max_u32_e32 v6, v6, v22
608 ; GFX6-NEXT: v_max_u32_e32 v7, v7, v23
609 ; GFX6-NEXT: v_max_u32_e32 v8, v8, v24
610 ; GFX6-NEXT: v_max_u32_e32 v9, v9, v25
611 ; GFX6-NEXT: v_max_u32_e32 v10, v10, v26
612 ; GFX6-NEXT: v_max_u32_e32 v11, v11, v27
613 ; GFX6-NEXT: v_max_u32_e32 v12, v12, v28
614 ; GFX6-NEXT: v_max_u32_e32 v13, v13, v29
615 ; GFX6-NEXT: v_max_u32_e32 v14, v14, v30
616 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17
617 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v18
618 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v19
619 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v20
620 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v21
621 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v22
622 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v23
623 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v24
624 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v25
625 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v26
626 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v27
627 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v28
628 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v29
629 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v30
630 ; GFX6-NEXT: s_waitcnt vmcnt(0)
631 ; GFX6-NEXT: v_max_u32_e32 v15, v15, v16
632 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
633 ; GFX6-NEXT: s_setpc_b64 s[30:31]
635 ; GFX8-LABEL: v_usubsat_v16i32:
637 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
639 ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
640 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
641 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
642 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
643 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
644 ; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
645 ; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
646 ; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
647 ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
648 ; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
649 ; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
650 ; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
651 ; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
652 ; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
653 ; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
654 ; GFX8-NEXT: s_waitcnt vmcnt(0)
655 ; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
656 ; GFX8-NEXT: s_setpc_b64 s[30:31]
658 ; GFX9-LABEL: v_usubsat_v16i32:
660 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
661 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp
662 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
663 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp
664 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp
665 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp
666 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp
667 ; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp
668 ; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp
669 ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp
670 ; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp
671 ; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp
672 ; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp
673 ; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp
674 ; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp
675 ; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp
676 ; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp
677 ; GFX9-NEXT: s_waitcnt vmcnt(0)
678 ; GFX9-NEXT: v_sub_u32_e64 v15, v15, v16 clamp
679 ; GFX9-NEXT: s_setpc_b64 s[30:31]
681 ; GFX10-LABEL: v_usubsat_v16i32:
683 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
684 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
685 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
686 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
687 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
688 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
689 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
690 ; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
691 ; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
692 ; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
693 ; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
694 ; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
695 ; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
696 ; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
697 ; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
698 ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
699 ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
700 ; GFX10-NEXT: s_waitcnt vmcnt(0)
701 ; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
702 ; GFX10-NEXT: s_setpc_b64 s[30:31]
704 ; GFX11-LABEL: v_usubsat_v16i32:
706 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
707 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
708 ; GFX11-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
709 ; GFX11-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
710 ; GFX11-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
711 ; GFX11-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
712 ; GFX11-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
713 ; GFX11-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
714 ; GFX11-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
715 ; GFX11-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
716 ; GFX11-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
717 ; GFX11-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
718 ; GFX11-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
719 ; GFX11-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
720 ; GFX11-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
721 ; GFX11-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
722 ; GFX11-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
723 ; GFX11-NEXT: s_waitcnt vmcnt(0)
724 ; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
725 ; GFX11-NEXT: s_setpc_b64 s[30:31]
726 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
727 ret <16 x i32> %result
; Scalar i64 unsigned saturating subtract through @llvm.usub.sat.i64.
; No target in the assertions below uses a clamped instruction for i64.
; Every prefix expects the same shape -
;   1. a 64-bit subtract built from a low-half subtract and a high-half
;      subtract-with-borrow,
;   2. an unsigned compare testing whether the difference in v[2:3] is
;      greater than the original lhs in v[0:1],
;   3. two v_cndmask_b32 that choose between 0 and the difference halves
;      based on that compare result.
; The prefixes differ only in mnemonic spelling and in using vcc (GFX6, GFX8,
; GFX9) versus vcc_lo (GFX10PLUS) as the condition register.
731 define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
732 ; GFX6-LABEL: v_usubsat_i64:
734 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
736 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
737 ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
738 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
739 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
740 ; GFX6-NEXT: s_setpc_b64 s[30:31]
742 ; GFX8-LABEL: v_usubsat_i64:
744 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
745 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
746 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
747 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
748 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
749 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
750 ; GFX8-NEXT: s_setpc_b64 s[30:31]
752 ; GFX9-LABEL: v_usubsat_i64:
754 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
756 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
757 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
758 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
759 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
760 ; GFX9-NEXT: s_setpc_b64 s[30:31]
762 ; GFX10PLUS-LABEL: v_usubsat_i64:
763 ; GFX10PLUS: ; %bb.0:
764 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
765 ; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
766 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
767 ; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
768 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
769 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
770 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
771 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
; Declarations of every llvm.usub.sat overload called by the tests in this
; file, one per scalar or vector type exercised. They all share attribute
; group #0.
775 declare i8 @llvm.usub.sat.i8(i8, i8) #0
776 declare i16 @llvm.usub.sat.i16(i16, i16) #0
777 declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
778 declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
779 declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
780 declare i32 @llvm.usub.sat.i32(i32, i32) #0
781 declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
782 declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
783 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
784 declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0
785 declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
786 declare i64 @llvm.usub.sat.i64(i64, i64) #0
; memory(none) is the current spelling of the legacy 'readnone' function
; attribute (the function neither reads nor writes memory); the meaning of
; the attribute group is unchanged.
788 attributes #0 = { nounwind speculatable willreturn memory(none) }