1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; Unsigned saturating subtract of two i7 values through llvm.usub.sat.i7,
; taking plain (non-inreg) arguments. In the expected output below, the
; tahiti run shifts both operands left by 25, subtracts the unsigned minimum
; of the two from the lhs and shifts the result back right by 25; the other
; runs shift by 9 and use a 16-bit subtract with the clamp modifier.
8 define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
9 ; GFX6-LABEL: v_usubsat_i7:
11 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0
13 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1
14 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
15 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
16 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0
17 ; GFX6-NEXT: s_setpc_b64 s[30:31]
19 ; GFX8-LABEL: v_usubsat_i7:
21 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0
23 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1
24 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
25 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
26 ; GFX8-NEXT: s_setpc_b64 s[30:31]
28 ; GFX9-LABEL: v_usubsat_i7:
30 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
33 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
34 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
35 ; GFX9-NEXT: s_setpc_b64 s[30:31]
37 ; GFX10PLUS-LABEL: v_usubsat_i7:
39 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
41 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
42 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
43 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
44 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
45 %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
; Same i7 llvm.usub.sat.i7 operation, but as an amdgpu_ps function whose
; arguments are marked inreg. The expected tahiti output works entirely on
; s0/s1 (shift by 25, s_min_u32, s_sub_i32); the remaining runs perform the
; clamped 16-bit subtract into v0 and copy the result back to s0 with
; v_readfirstlane_b32.
50 define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
51 ; GFX6-LABEL: s_usubsat_i7:
53 ; GFX6-NEXT: s_lshl_b32 s0, s0, 25
54 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25
55 ; GFX6-NEXT: s_min_u32 s1, s0, s1
56 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
57 ; GFX6-NEXT: s_lshr_b32 s0, s0, 25
58 ; GFX6-NEXT: ; return to shader part epilog
60 ; GFX8-LABEL: s_usubsat_i7:
62 ; GFX8-NEXT: s_lshl_b32 s1, s1, 9
63 ; GFX8-NEXT: s_lshl_b32 s0, s0, 9
64 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
65 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
66 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
67 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
68 ; GFX8-NEXT: ; return to shader part epilog
70 ; GFX9-LABEL: s_usubsat_i7:
72 ; GFX9-NEXT: s_lshl_b32 s1, s1, 9
73 ; GFX9-NEXT: s_lshl_b32 s0, s0, 9
74 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
75 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
76 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
77 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
78 ; GFX9-NEXT: ; return to shader part epilog
80 ; GFX10PLUS-LABEL: s_usubsat_i7:
82 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
83 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
84 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
85 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
86 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
87 ; GFX10PLUS-NEXT: ; return to shader part epilog
88 %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
; Unsigned saturating subtract of two i8 values through llvm.usub.sat.i8,
; taking plain (non-inreg) arguments. The expected output has the same shape
; as the i7 case with different shift amounts: 24 for the 32-bit min/sub
; sequence of the tahiti run, 8 for the 16-bit clamped subtract of the others.
92 define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
93 ; GFX6-LABEL: v_usubsat_i8:
95 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
97 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
98 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
99 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
100 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
101 ; GFX6-NEXT: s_setpc_b64 s[30:31]
103 ; GFX8-LABEL: v_usubsat_i8:
105 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
107 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
108 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
109 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
110 ; GFX8-NEXT: s_setpc_b64 s[30:31]
112 ; GFX9-LABEL: v_usubsat_i8:
114 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
116 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
117 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
118 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
119 ; GFX9-NEXT: s_setpc_b64 s[30:31]
121 ; GFX10PLUS-LABEL: v_usubsat_i8:
122 ; GFX10PLUS: ; %bb.0:
123 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
125 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
126 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
127 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
128 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
129 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
; Same i8 llvm.usub.sat.i8 operation as an amdgpu_ps function with inreg
; arguments. The expected tahiti output stays on s0/s1 (shift by 24,
; s_min_u32, s_sub_i32); the other runs do the clamped 16-bit subtract into
; v0 and return it to s0 with v_readfirstlane_b32.
133 define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
134 ; GFX6-LABEL: s_usubsat_i8:
136 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
137 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
138 ; GFX6-NEXT: s_min_u32 s1, s0, s1
139 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
140 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24
141 ; GFX6-NEXT: ; return to shader part epilog
143 ; GFX8-LABEL: s_usubsat_i8:
145 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
146 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
147 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
148 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
149 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
150 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
151 ; GFX8-NEXT: ; return to shader part epilog
153 ; GFX9-LABEL: s_usubsat_i8:
155 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
156 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
157 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
158 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
159 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
160 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
161 ; GFX9-NEXT: ; return to shader part epilog
163 ; GFX10PLUS-LABEL: s_usubsat_i8:
164 ; GFX10PLUS: ; %bb.0:
165 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
166 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
167 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
168 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
169 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
170 ; GFX10PLUS-NEXT: ; return to shader part epilog
171 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
; <2 x i8> unsigned saturating subtract with plain (non-inreg) arguments.
; The vectors are passed and returned as i16 and bitcast on both sides of the
; llvm.usub.sat.v2i8 call. The expected tahiti and fiji output handles each
; byte separately (two min/sub pairs, resp. two clamped 16-bit subtracts);
; the gfx900 and later runs use a single v_pk_sub_u16 with clamp. The
; gfx1010 and gfx1100 runs are checked under separate prefixes here because
; their expected result repacking differs (SDWA and/or versus plain
; shift/and/or).
174 define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
175 ; GFX6-LABEL: v_usubsat_v2i8:
177 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
179 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
180 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
181 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
182 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
183 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
184 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
185 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
186 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
187 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
188 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
189 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
190 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
191 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
192 ; GFX6-NEXT: s_setpc_b64 s[30:31]
194 ; GFX8-LABEL: v_usubsat_v2i8:
196 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
198 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
199 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
200 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
201 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
202 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
203 ; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp
204 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
205 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
206 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
207 ; GFX8-NEXT: s_setpc_b64 s[30:31]
209 ; GFX9-LABEL: v_usubsat_v2i8:
211 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
213 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
214 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
215 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
216 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
217 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
218 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
219 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
220 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
221 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
222 ; GFX9-NEXT: s_movk_i32 s4, 0xff
223 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
224 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
225 ; GFX9-NEXT: s_setpc_b64 s[30:31]
227 ; GFX10-LABEL: v_usubsat_v2i8:
229 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
231 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
232 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
233 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
234 ; GFX10-NEXT: s_movk_i32 s4, 0xff
235 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
236 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
237 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
238 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
239 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
240 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
241 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
242 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
243 ; GFX10-NEXT: s_setpc_b64 s[30:31]
245 ; GFX11-LABEL: v_usubsat_v2i8:
247 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
249 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
250 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
251 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
252 ; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
253 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
254 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
255 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
256 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
257 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
258 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
259 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
260 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
261 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
262 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
263 ; GFX11-NEXT: s_setpc_b64 s[30:31]
264 %lhs = bitcast i16 %lhs.arg to <2 x i8>
265 %rhs = bitcast i16 %rhs.arg to <2 x i8>
266 %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
267 %cast.result = bitcast <2 x i8> %result to i16
; <2 x i8> unsigned saturating subtract as an amdgpu_ps function with inreg
; i16 arguments, bitcast to and from <2 x i8> around the llvm.usub.sat.v2i8
; call. The expected tahiti output is entirely scalar. The fiji run uses two
; clamped 16-bit VALU subtracts; the gfx900 and later runs pack the shifted
; bytes with s_pack_ll_b32_b16 and use one v_pk_sub_u16 with clamp. Every
; run except tahiti moves the result back to s0 with v_readfirstlane_b32.
271 define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
272 ; GFX6-LABEL: s_usubsat_v2i8:
274 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
275 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8
276 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
277 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
278 ; GFX6-NEXT: s_min_u32 s1, s0, s1
279 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
280 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
281 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
282 ; GFX6-NEXT: s_min_u32 s2, s1, s2
283 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
284 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24
285 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24
286 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
287 ; GFX6-NEXT: s_or_b32 s0, s0, s1
288 ; GFX6-NEXT: ; return to shader part epilog
290 ; GFX8-LABEL: s_usubsat_v2i8:
292 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8
293 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
294 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
295 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
296 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
297 ; GFX8-NEXT: s_lshl_b32 s1, s3, 8
298 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
299 ; GFX8-NEXT: s_lshl_b32 s0, s2, 8
300 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
301 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
302 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
303 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
304 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
305 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
306 ; GFX8-NEXT: ; return to shader part epilog
308 ; GFX9-LABEL: s_usubsat_v2i8:
310 ; GFX9-NEXT: s_lshr_b32 s2, s0, 8
311 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
312 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
313 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
314 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
315 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
316 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
317 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
318 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16
319 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
320 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
321 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
322 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
323 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
324 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
325 ; GFX9-NEXT: s_movk_i32 s0, 0xff
326 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
327 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
328 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
329 ; GFX9-NEXT: ; return to shader part epilog
331 ; GFX10-LABEL: s_usubsat_v2i8:
333 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
334 ; GFX10-NEXT: s_lshr_b32 s3, s1, 8
335 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
336 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
337 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
338 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
339 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
340 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8
341 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
342 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
343 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
344 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
345 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
346 ; GFX10-NEXT: s_movk_i32 s0, 0xff
347 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
348 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
349 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
350 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
351 ; GFX10-NEXT: ; return to shader part epilog
353 ; GFX11-LABEL: s_usubsat_v2i8:
355 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
356 ; GFX11-NEXT: s_lshr_b32 s3, s1, 8
357 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
358 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
359 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
360 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
361 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
362 ; GFX11-NEXT: s_lshl_b32 s2, s2, 8
363 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
364 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8
365 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
366 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
367 ; GFX11-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
368 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
369 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
370 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
371 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
372 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
373 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
374 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
375 ; GFX11-NEXT: ; return to shader part epilog
376 %lhs = bitcast i16 %lhs.arg to <2 x i8>
377 %rhs = bitcast i16 %rhs.arg to <2 x i8>
378 %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
379 %cast.result = bitcast <2 x i8> %result to i16
; <4 x i8> unsigned saturating subtract with plain (non-inreg) arguments.
; The vectors are passed and returned as i32 and bitcast on both sides of the
; llvm.usub.sat.v4i8 call. The expected tahiti output contains four 32-bit
; min/sub pairs and the fiji output four clamped 16-bit subtracts (one per
; byte); the gfx900 and later runs use two v_pk_sub_u16 with clamp, each
; covering two bytes, and then reassemble the four result bytes into v0.
383 define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
384 ; GFX6-LABEL: v_usubsat_v4i8:
386 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
388 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
389 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
390 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
391 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
392 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1
393 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
394 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
395 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
396 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
397 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
398 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5
399 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
400 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
401 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
402 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6
403 ; GFX6-NEXT: v_min_u32_e32 v3, v2, v3
404 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
405 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
406 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7
407 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4
408 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
409 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2
410 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
411 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3
412 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 24
413 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
414 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
415 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3
416 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
417 ; GFX6-NEXT: s_setpc_b64 s[30:31]
419 ; GFX8-LABEL: v_usubsat_v4i8:
421 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
422 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
423 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
424 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
425 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
426 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
427 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
428 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
429 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
430 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
431 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
432 ; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp
433 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4
434 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6
435 ; GFX8-NEXT: v_sub_u16_e64 v2, v2, v3 clamp
436 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5
437 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7
438 ; GFX8-NEXT: v_sub_u16_e64 v3, v3, v4 clamp
439 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
440 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
441 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
442 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
443 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
444 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
445 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
446 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
447 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
448 ; GFX8-NEXT: s_setpc_b64 s[30:31]
450 ; GFX9-LABEL: v_usubsat_v4i8:
452 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
454 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
455 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
456 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
457 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
458 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
459 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
460 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
461 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
462 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
463 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
464 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
465 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
466 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
467 ; GFX9-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
468 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
469 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
470 ; GFX9-NEXT: v_mov_b32_e32 v2, 8
471 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
472 ; GFX9-NEXT: s_movk_i32 s4, 0xff
473 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
474 ; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
475 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0
476 ; GFX9-NEXT: v_mov_b32_e32 v3, 24
477 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
478 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
479 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
480 ; GFX9-NEXT: s_setpc_b64 s[30:31]
482 ; GFX10-LABEL: v_usubsat_v4i8:
484 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
486 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
487 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
488 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
489 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
490 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
491 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
492 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
493 ; GFX10-NEXT: v_mov_b32_e32 v4, 24
494 ; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
495 ; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
496 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
497 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
498 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
499 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
500 ; GFX10-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
501 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
502 ; GFX10-NEXT: v_mov_b32_e32 v1, 8
503 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1]
504 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
505 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
506 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0
507 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
508 ; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1
509 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
510 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
511 ; GFX10-NEXT: s_setpc_b64 s[30:31]
513 ; GFX11-LABEL: v_usubsat_v4i8:
515 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
516 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
517 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
518 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
519 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
520 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
521 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
522 ; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
523 ; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
524 ; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
525 ; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
526 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
527 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
528 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
529 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
530 ; GFX11-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
531 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
532 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
533 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
534 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
535 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
536 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
537 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
538 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
539 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
540 ; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2
541 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
542 ; GFX11-NEXT: s_setpc_b64 s[30:31]
543 %lhs = bitcast i32 %lhs.arg to <4 x i8>
544 %rhs = bitcast i32 %rhs.arg to <4 x i8>
545 %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
546 %cast.result = bitcast <4 x i8> %result to i32
; <4 x i8> unsigned saturating subtract as an amdgpu_ps function with inreg
; i32 arguments, bitcast to and from <4 x i8> around the llvm.usub.sat.v4i8
; call. The expected tahiti output does the four min/sub pairs on SGPRs and
; only the final byte merge (v_alignbit_b32 / v_or_b32) on v0. The fiji run
; uses four clamped 16-bit VALU subtracts, and the gfx900 and later runs use
; two v_pk_sub_u16 with clamp on operands packed via s_pack_*_b32_b16. Every
; run ends by copying v0 to s0 with v_readfirstlane_b32.
550 define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
551 ; GFX6-LABEL: s_usubsat_v4i8:
553 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
554 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16
555 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24
556 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8
557 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16
558 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24
559 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
560 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
561 ; GFX6-NEXT: s_min_u32 s1, s0, s1
562 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
563 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
564 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24
565 ; GFX6-NEXT: s_min_u32 s2, s1, s2
566 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
567 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
568 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24
569 ; GFX6-NEXT: s_min_u32 s3, s2, s3
570 ; GFX6-NEXT: s_sub_i32 s2, s2, s3
571 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24
572 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24
573 ; GFX6-NEXT: s_min_u32 s4, s3, s4
574 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24
575 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24
576 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
577 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
578 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24
579 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
580 ; GFX6-NEXT: s_lshl_b32 s0, s2, 16
581 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
582 ; GFX6-NEXT: s_lshl_b32 s0, s3, 24
583 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
584 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
585 ; GFX6-NEXT: ; return to shader part epilog
587 ; GFX8-LABEL: s_usubsat_v4i8:
589 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8
590 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16
591 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24
592 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
593 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
594 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16
595 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24
596 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
597 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
598 ; GFX8-NEXT: s_lshl_b32 s1, s5, 8
599 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
600 ; GFX8-NEXT: s_lshl_b32 s0, s2, 8
601 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
602 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
603 ; GFX8-NEXT: s_lshl_b32 s1, s6, 8
604 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
605 ; GFX8-NEXT: s_lshl_b32 s0, s3, 8
606 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
607 ; GFX8-NEXT: s_lshl_b32 s1, s7, 8
608 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
609 ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
610 ; GFX8-NEXT: s_lshl_b32 s0, s4, 8
611 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
612 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
614 ; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
615 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
618 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
620 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
621 ; GFX8-NEXT: ; return to shader part epilog
623 ; GFX9-LABEL: s_usubsat_v4i8:
625 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8
626 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
627 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24
628 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
629 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
630 ; GFX9-NEXT: s_lshr_b32 s6, s0, 16
631 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
632 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
633 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8
634 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
635 ; GFX9-NEXT: s_lshr_b32 s6, s3, 16
636 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16
637 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24
638 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
639 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
640 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
641 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
642 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16
643 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
644 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
645 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
646 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
647 ; GFX9-NEXT: s_lshr_b32 s6, s4, 16
648 ; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
649 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
650 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
651 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
652 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
653 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
654 ; GFX9-NEXT: s_mov_b32 s2, 8
655 ; GFX9-NEXT: v_pk_sub_u16 v1, s3, v1 clamp
656 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
657 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
658 ; GFX9-NEXT: s_movk_i32 s0, 0xff
659 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
660 ; GFX9-NEXT: s_mov_b32 s5, 24
661 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
662 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
663 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
664 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
665 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
666 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
667 ; GFX9-NEXT: ; return to shader part epilog
669 ; GFX10-LABEL: s_usubsat_v4i8:
671 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
672 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
673 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24
674 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8
675 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
676 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
677 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16
678 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24
679 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
680 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16
681 ; GFX10-NEXT: s_lshr_b32 s5, s2, 16
682 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
683 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
684 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
685 ; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
686 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
687 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
688 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
689 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16
690 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16
691 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
692 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
693 ; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
694 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
695 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
696 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
697 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
698 ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp
699 ; GFX10-NEXT: s_mov_b32 s0, 8
700 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
701 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
702 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
703 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
704 ; GFX10-NEXT: s_mov_b32 s0, 24
705 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
706 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
707 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
708 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
709 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
710 ; GFX10-NEXT: ; return to shader part epilog
712 ; GFX11-LABEL: s_usubsat_v4i8:
714 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
715 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24
716 ; GFX11-NEXT: s_lshr_b32 s4, s1, 8
717 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24
718 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
719 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
720 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
721 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
722 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
723 ; GFX11-NEXT: s_lshr_b32 s5, s3, 16
724 ; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
725 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
726 ; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
727 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8
728 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
729 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
730 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16
731 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16
732 ; GFX11-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
733 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
734 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
735 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
736 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8
737 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
738 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
739 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
740 ; GFX11-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
741 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
742 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
743 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
744 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
745 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
746 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2
747 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
748 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
749 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
750 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
751 ; GFX11-NEXT: ; return to shader part epilog
752 %lhs = bitcast i32 %lhs.arg to <4 x i8>
753 %rhs = bitcast i32 %rhs.arg to <4 x i8>
754 %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
755 %cast.result = bitcast <4 x i8> %result to i32
; i24 unsigned saturating subtract via llvm.usub.sat.i24 (no explicit calling
; convention on the define). Every check body shifts both operands left by 8
; before subtracting and shifts the result right by 8 afterwards. The GFX6 body
; uses v_min_u32 followed by v_sub_i32; the other bodies use one subtract
; carrying the clamp modifier.
759 define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) {
760 ; GFX6-LABEL: v_usubsat_i24:
762 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
763 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
764 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
765 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
766 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
767 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0
768 ; GFX6-NEXT: s_setpc_b64 s[30:31]
770 ; GFX8-LABEL: v_usubsat_i24:
772 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
773 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0
774 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
775 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
776 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
777 ; GFX8-NEXT: s_setpc_b64 s[30:31]
779 ; GFX9-LABEL: v_usubsat_i24:
781 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
782 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
783 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
784 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp
785 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
786 ; GFX9-NEXT: s_setpc_b64 s[30:31]
788 ; GFX10PLUS-LABEL: v_usubsat_i24:
789 ; GFX10PLUS: ; %bb.0:
790 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
791 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 8, v0
792 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 8, v1
793 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp
794 ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
795 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
796 %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
; i24 unsigned saturating subtract in an amdgpu_ps function with both operands
; marked inreg. The GFX6 body stays on scalar instructions (s_lshl_b32,
; s_min_u32, s_sub_i32, s_lshr_b32). The other bodies perform the clamped
; subtract with a vector instruction and copy the result into s0 with
; v_readfirstlane_b32.
800 define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
801 ; GFX6-LABEL: s_usubsat_i24:
803 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8
804 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
805 ; GFX6-NEXT: s_min_u32 s1, s0, s1
806 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
807 ; GFX6-NEXT: s_lshr_b32 s0, s0, 8
808 ; GFX6-NEXT: ; return to shader part epilog
810 ; GFX8-LABEL: s_usubsat_i24:
812 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
813 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
814 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
815 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
816 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
817 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
818 ; GFX8-NEXT: ; return to shader part epilog
820 ; GFX9-LABEL: s_usubsat_i24:
822 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
823 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
824 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
825 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
826 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
827 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
828 ; GFX9-NEXT: ; return to shader part epilog
830 ; GFX10PLUS-LABEL: s_usubsat_i24:
831 ; GFX10PLUS: ; %bb.0:
832 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
833 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
834 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp
835 ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
836 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
837 ; GFX10PLUS-NEXT: ; return to shader part epilog
838 %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
; i32 unsigned saturating subtract via llvm.usub.sat.i32, operands in v0/v1.
; The GFX6 body is v_min_u32 followed by v_sub_i32; the other bodies are a
; single subtract with the clamp modifier. The GFX8 form names an extra SGPR
; pair operand (s[4:5]).
; NOTE(review): presumably that pair is the carry-out destination - confirm
; against the ISA documentation.
842 define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
843 ; GFX6-LABEL: v_usubsat_i32:
845 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
846 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
847 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
848 ; GFX6-NEXT: s_setpc_b64 s[30:31]
850 ; GFX8-LABEL: v_usubsat_i32:
852 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
854 ; GFX8-NEXT: s_setpc_b64 s[30:31]
856 ; GFX9-LABEL: v_usubsat_i32:
858 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
859 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp
860 ; GFX9-NEXT: s_setpc_b64 s[30:31]
862 ; GFX10PLUS-LABEL: v_usubsat_i32:
863 ; GFX10PLUS: ; %bb.0:
864 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
865 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp
866 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
867 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
; i32 unsigned saturating subtract in an amdgpu_ps function with both operands
; marked inreg (s0 and s1 in the checks). The GFX6 body is purely scalar
; (s_min_u32, s_sub_i32). The GFX8 and GFX9 bodies first copy s1 into v0, then
; do the clamped vector subtract; the GFX10PLUS body feeds s0 and s1 straight
; into the subtract. All non-GFX6 bodies return the value in s0 through
; v_readfirstlane_b32.
871 define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
872 ; GFX6-LABEL: s_usubsat_i32:
874 ; GFX6-NEXT: s_min_u32 s1, s0, s1
875 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
876 ; GFX6-NEXT: ; return to shader part epilog
878 ; GFX8-LABEL: s_usubsat_i32:
880 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
881 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
882 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
883 ; GFX8-NEXT: ; return to shader part epilog
885 ; GFX9-LABEL: s_usubsat_i32:
887 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
888 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
889 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
890 ; GFX9-NEXT: ; return to shader part epilog
892 ; GFX10PLUS-LABEL: s_usubsat_i32:
893 ; GFX10PLUS: ; %bb.0:
894 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp
895 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
896 ; GFX10PLUS-NEXT: ; return to shader part epilog
897 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
; Mixed-operand i32 unsigned saturating subtract: %lhs is marked inreg and
; %rhs is not. The checks read the left operand from s0 and the right operand
; from v0, and leave the result in v0. The IR bitcasts the i32 result to float
; to match the float return type.
901 define amdgpu_ps float @usubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
902 ; GFX6-LABEL: usubsat_i32_sv:
904 ; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
905 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
906 ; GFX6-NEXT: ; return to shader part epilog
908 ; GFX8-LABEL: usubsat_i32_sv:
910 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
911 ; GFX8-NEXT: ; return to shader part epilog
913 ; GFX9-LABEL: usubsat_i32_sv:
915 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
916 ; GFX9-NEXT: ; return to shader part epilog
918 ; GFX10PLUS-LABEL: usubsat_i32_sv:
919 ; GFX10PLUS: ; %bb.0:
920 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, v0 clamp
921 ; GFX10PLUS-NEXT: ; return to shader part epilog
922 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
923 %cast = bitcast i32 %result to float
; Mixed-operand i32 unsigned saturating subtract with the operand kinds
; swapped relative to usubsat_i32_sv: %rhs is marked inreg and %lhs is not.
; The checks read the left operand from v0 and the right operand from s0, and
; leave the result in v0. The IR bitcasts the i32 result to float to match the
; float return type.
927 define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
928 ; GFX6-LABEL: usubsat_i32_vs:
930 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v0
931 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
932 ; GFX6-NEXT: ; return to shader part epilog
934 ; GFX8-LABEL: usubsat_i32_vs:
936 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], v0, s0 clamp
937 ; GFX8-NEXT: ; return to shader part epilog
939 ; GFX9-LABEL: usubsat_i32_vs:
941 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, s0 clamp
942 ; GFX9-NEXT: ; return to shader part epilog
944 ; GFX10PLUS-LABEL: usubsat_i32_vs:
945 ; GFX10PLUS: ; %bb.0:
946 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, s0 clamp
947 ; GFX10PLUS-NEXT: ; return to shader part epilog
948 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
949 %cast = bitcast i32 %result to float
; <2 x i32> unsigned saturating subtract via llvm.usub.sat.v2i32. The checks
; handle the two elements independently: v0 with v2 and v1 with v3. The GFX6
; body uses a v_min_u32 / v_sub_i32 pair per element; the other bodies use one
; clamped subtract per element.
953 define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
954 ; GFX6-LABEL: v_usubsat_v2i32:
956 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
957 ; GFX6-NEXT: v_min_u32_e32 v2, v0, v2
958 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
959 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v3
960 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
961 ; GFX6-NEXT: s_setpc_b64 s[30:31]
963 ; GFX8-LABEL: v_usubsat_v2i32:
965 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
967 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
968 ; GFX8-NEXT: s_setpc_b64 s[30:31]
970 ; GFX9-LABEL: v_usubsat_v2i32:
972 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
973 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp
974 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp
975 ; GFX9-NEXT: s_setpc_b64 s[30:31]
977 ; GFX10PLUS-LABEL: v_usubsat_v2i32:
978 ; GFX10PLUS: ; %bb.0:
979 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
980 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp
981 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp
982 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
983 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
984 ret <2 x i32> %result
; <2 x i32> unsigned saturating subtract in an amdgpu_ps function with both
; vector operands marked inreg. In the checks the left elements are in s0-s1
; and the right elements in s2-s3. The GFX6 body is purely scalar (s_min_u32,
; s_sub_i32). The GFX8 and GFX9 bodies copy the right elements into VGPRs
; before the clamped subtracts; the GFX10PLUS body uses SGPRs for both
; subtract operands. Non-GFX6 results return to s0-s1 via v_readfirstlane_b32.
987 define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
988 ; GFX6-LABEL: s_usubsat_v2i32:
990 ; GFX6-NEXT: s_min_u32 s2, s0, s2
991 ; GFX6-NEXT: s_sub_i32 s0, s0, s2
992 ; GFX6-NEXT: s_min_u32 s2, s1, s3
993 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
994 ; GFX6-NEXT: ; return to shader part epilog
996 ; GFX8-LABEL: s_usubsat_v2i32:
998 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
999 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1000 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], s0, v0 clamp
1001 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1002 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1003 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1004 ; GFX8-NEXT: ; return to shader part epilog
1006 ; GFX9-LABEL: s_usubsat_v2i32:
1008 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1009 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1010 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1011 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1012 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1013 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1014 ; GFX9-NEXT: ; return to shader part epilog
1016 ; GFX10PLUS-LABEL: s_usubsat_v2i32:
1017 ; GFX10PLUS: ; %bb.0:
1018 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s2 clamp
1019 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s3 clamp
1020 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1021 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1022 ; GFX10PLUS-NEXT: ; return to shader part epilog
1023 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1024 ret <2 x i32> %result
; <3 x i32> unsigned saturating subtract via llvm.usub.sat.v3i32. The checks
; pair v0-v2 with v3-v5, one element at a time. The GFX6 body uses a
; v_min_u32 / v_sub_i32 pair per element; the other bodies use one clamped
; subtract per element.
1027 define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1028 ; GFX6-LABEL: v_usubsat_v3i32:
1030 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1031 ; GFX6-NEXT: v_min_u32_e32 v3, v0, v3
1032 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
1033 ; GFX6-NEXT: v_min_u32_e32 v3, v1, v4
1034 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
1035 ; GFX6-NEXT: v_min_u32_e32 v3, v2, v5
1036 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
1037 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1039 ; GFX8-LABEL: v_usubsat_v3i32:
1041 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1042 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
1043 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
1044 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
1045 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1047 ; GFX9-LABEL: v_usubsat_v3i32:
1049 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1050 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp
1051 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp
1052 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp
1053 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1055 ; GFX10PLUS-LABEL: v_usubsat_v3i32:
1056 ; GFX10PLUS: ; %bb.0:
1057 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1058 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp
1059 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp
1060 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp
1061 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1062 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1063 ret <3 x i32> %result
; <3 x i32> unsigned saturating subtract in an amdgpu_ps function with both
; vector operands marked inreg. In the checks the left elements are in s0-s2
; and the right elements in s3-s5. The GFX6 body is purely scalar (s_min_u32,
; s_sub_i32). The GFX8 and GFX9 bodies copy the right elements into v0-v2
; before the clamped subtracts; the GFX10PLUS body uses SGPRs for both
; subtract operands. Non-GFX6 results return to s0-s2 via v_readfirstlane_b32.
1066 define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1067 ; GFX6-LABEL: s_usubsat_v3i32:
1069 ; GFX6-NEXT: s_min_u32 s3, s0, s3
1070 ; GFX6-NEXT: s_sub_i32 s0, s0, s3
1071 ; GFX6-NEXT: s_min_u32 s3, s1, s4
1072 ; GFX6-NEXT: s_sub_i32 s1, s1, s3
1073 ; GFX6-NEXT: s_min_u32 s3, s2, s5
1074 ; GFX6-NEXT: s_sub_i32 s2, s2, s3
1075 ; GFX6-NEXT: ; return to shader part epilog
1077 ; GFX8-LABEL: s_usubsat_v3i32:
1079 ; GFX8-NEXT: v_mov_b32_e32 v0, s3
1080 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
1081 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
1082 ; GFX8-NEXT: v_sub_u32_e64 v0, s[6:7], s0, v0 clamp
1083 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1084 ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1085 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1086 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1087 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1088 ; GFX8-NEXT: ; return to shader part epilog
1090 ; GFX9-LABEL: s_usubsat_v3i32:
1092 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
1093 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1094 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
1095 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1096 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1097 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1098 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1099 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1100 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1101 ; GFX9-NEXT: ; return to shader part epilog
1103 ; GFX10PLUS-LABEL: s_usubsat_v3i32:
1104 ; GFX10PLUS: ; %bb.0:
1105 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s3 clamp
1106 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s4 clamp
1107 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s5 clamp
1108 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1109 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1110 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1111 ; GFX10PLUS-NEXT: ; return to shader part epilog
1112 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1113 ret <3 x i32> %result
; <4 x i32> unsigned saturating subtract via llvm.usub.sat.v4i32. The checks
; pair v0-v3 with v4-v7, one element at a time. The GFX6 body uses a
; v_min_u32 / v_sub_i32 pair per element; the other bodies use one clamped
; subtract per element.
1116 define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1117 ; GFX6-LABEL: v_usubsat_v4i32:
1119 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1120 ; GFX6-NEXT: v_min_u32_e32 v4, v0, v4
1121 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
1122 ; GFX6-NEXT: v_min_u32_e32 v4, v1, v5
1123 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
1124 ; GFX6-NEXT: v_min_u32_e32 v4, v2, v6
1125 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
1126 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v7
1127 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
1128 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1130 ; GFX8-LABEL: v_usubsat_v4i32:
1132 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1133 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
1134 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
1135 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
1136 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
1137 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1139 ; GFX9-LABEL: v_usubsat_v4i32:
1141 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1142 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp
1143 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp
1144 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp
1145 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp
1146 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1148 ; GFX10PLUS-LABEL: v_usubsat_v4i32:
1149 ; GFX10PLUS: ; %bb.0:
1150 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1151 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp
1152 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp
1153 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp
1154 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp
1155 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1156 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1157 ret <4 x i32> %result
; <4 x i32> unsigned saturating subtract in an amdgpu_ps function with both
; vector operands marked inreg. In the checks the left elements are in s0-s3
; and the right elements in s4-s7. The GFX6 body is purely scalar (s_min_u32,
; s_sub_i32). The GFX8 and GFX9 bodies copy the right elements into v0-v3
; before the clamped subtracts; the GFX10PLUS body uses SGPRs for both
; subtract operands. Non-GFX6 results return to s0-s3 via v_readfirstlane_b32.
1160 define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1161 ; GFX6-LABEL: s_usubsat_v4i32:
1163 ; GFX6-NEXT: s_min_u32 s4, s0, s4
1164 ; GFX6-NEXT: s_sub_i32 s0, s0, s4
1165 ; GFX6-NEXT: s_min_u32 s4, s1, s5
1166 ; GFX6-NEXT: s_sub_i32 s1, s1, s4
1167 ; GFX6-NEXT: s_min_u32 s4, s2, s6
1168 ; GFX6-NEXT: s_sub_i32 s2, s2, s4
1169 ; GFX6-NEXT: s_min_u32 s4, s3, s7
1170 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
1171 ; GFX6-NEXT: ; return to shader part epilog
1173 ; GFX8-LABEL: s_usubsat_v4i32:
1175 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1176 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1177 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
1178 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
1179 ; GFX8-NEXT: v_sub_u32_e64 v0, s[8:9], s0, v0 clamp
1180 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1181 ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1182 ; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp
1183 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1184 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1185 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1186 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1187 ; GFX8-NEXT: ; return to shader part epilog
1189 ; GFX9-LABEL: s_usubsat_v4i32:
1191 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1192 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1193 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1194 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1195 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1196 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1197 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1198 ; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp
1199 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1200 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1201 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1202 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1203 ; GFX9-NEXT: ; return to shader part epilog
1205 ; GFX10PLUS-LABEL: s_usubsat_v4i32:
1206 ; GFX10PLUS: ; %bb.0:
1207 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s4 clamp
1208 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s5 clamp
1209 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s6 clamp
1210 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s7 clamp
1211 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1212 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1213 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1214 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1215 ; GFX10PLUS-NEXT: ; return to shader part epilog
1216 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1217 ret <4 x i32> %result
; <5 x i32> unsigned saturating subtract via llvm.usub.sat.v5i32. The checks
; pair v0-v4 with v5-v9, one element at a time. The GFX6 body uses a
; v_min_u32 / v_sub_i32 pair per element; the other bodies use one clamped
; subtract per element.
1220 define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1221 ; GFX6-LABEL: v_usubsat_v5i32:
1223 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1224 ; GFX6-NEXT: v_min_u32_e32 v5, v0, v5
1225 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
1226 ; GFX6-NEXT: v_min_u32_e32 v5, v1, v6
1227 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
1228 ; GFX6-NEXT: v_min_u32_e32 v5, v2, v7
1229 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
1230 ; GFX6-NEXT: v_min_u32_e32 v5, v3, v8
1231 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
1232 ; GFX6-NEXT: v_min_u32_e32 v5, v4, v9
1233 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
1234 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1236 ; GFX8-LABEL: v_usubsat_v5i32:
1238 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1239 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v5 clamp
1240 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v6 clamp
1241 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v7 clamp
1242 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v8 clamp
1243 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v9 clamp
1244 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1246 ; GFX9-LABEL: v_usubsat_v5i32:
1248 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1249 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v5 clamp
1250 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v6 clamp
1251 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v7 clamp
1252 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v8 clamp
1253 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v9 clamp
1254 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1256 ; GFX10PLUS-LABEL: v_usubsat_v5i32:
1257 ; GFX10PLUS: ; %bb.0:
1258 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1259 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v5 clamp
1260 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v6 clamp
1261 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v7 clamp
1262 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v8 clamp
1263 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, v4, v9 clamp
1264 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1265 %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1266 ret <5 x i32> %result
; <5 x i32> unsigned saturating subtract in an amdgpu_ps function with both
; vector operands marked inreg. In the checks the left elements are in s0-s4
; and the right elements in s5-s9. The GFX6 body is purely scalar (s_min_u32,
; s_sub_i32). The GFX8 and GFX9 bodies copy the right elements into v0-v4
; before the clamped subtracts; the GFX10PLUS body uses SGPRs for both
; subtract operands. Non-GFX6 results return to s0-s4 via v_readfirstlane_b32.
1269 define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1270 ; GFX6-LABEL: s_usubsat_v5i32:
1272 ; GFX6-NEXT: s_min_u32 s5, s0, s5
1273 ; GFX6-NEXT: s_sub_i32 s0, s0, s5
1274 ; GFX6-NEXT: s_min_u32 s5, s1, s6
1275 ; GFX6-NEXT: s_sub_i32 s1, s1, s5
1276 ; GFX6-NEXT: s_min_u32 s5, s2, s7
1277 ; GFX6-NEXT: s_sub_i32 s2, s2, s5
1278 ; GFX6-NEXT: s_min_u32 s5, s3, s8
1279 ; GFX6-NEXT: s_sub_i32 s3, s3, s5
1280 ; GFX6-NEXT: s_min_u32 s5, s4, s9
1281 ; GFX6-NEXT: s_sub_i32 s4, s4, s5
1282 ; GFX6-NEXT: ; return to shader part epilog
1284 ; GFX8-LABEL: s_usubsat_v5i32:
1286 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1287 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1288 ; GFX8-NEXT: v_mov_b32_e32 v2, s7
1289 ; GFX8-NEXT: v_mov_b32_e32 v3, s8
1290 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
1291 ; GFX8-NEXT: v_sub_u32_e64 v0, s[10:11], s0, v0 clamp
1292 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1293 ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1294 ; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp
1295 ; GFX8-NEXT: v_sub_u32_e64 v4, s[0:1], s4, v4 clamp
1296 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1297 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1298 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1299 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1300 ; GFX8-NEXT: v_readfirstlane_b32 s4, v4
1301 ; GFX8-NEXT: ; return to shader part epilog
1303 ; GFX9-LABEL: s_usubsat_v5i32:
1305 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
1306 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1307 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
1308 ; GFX9-NEXT: v_mov_b32_e32 v3, s8
1309 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
1310 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1311 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1312 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1313 ; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp
1314 ; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp
1315 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1316 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1317 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1318 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1319 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1320 ; GFX9-NEXT: ; return to shader part epilog
1322 ; GFX10PLUS-LABEL: s_usubsat_v5i32:
1323 ; GFX10PLUS: ; %bb.0:
1324 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp
1325 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp
1326 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp
1327 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp
1328 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, s4, s9 clamp
1329 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1330 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1331 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1332 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1333 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
1334 ; GFX10PLUS-NEXT: ; return to shader part epilog
1335 %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1336 ret <5 x i32> %result
; <16 x i32> unsigned saturating subtract via llvm.usub.sat.v16i32. For the
; first fifteen elements the checks pair v0-v14 with v16-v30. The right-hand
; value for the last element (the subtract writing v15) is loaded from memory
; addressed through s32 (buffer_load_dword, or scratch_load_b32 in the GFX11
; body), and an s_waitcnt vmcnt(0) precedes its first use. GFX10 and GFX11
; have separate check bodies here because the load instruction differs.
; NOTE(review): presumably the final argument dword is passed on the stack -
; confirm against the AMDGPU calling convention documentation.
1339 define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1340 ; GFX6-LABEL: v_usubsat_v16i32:
1342 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1343 ; GFX6-NEXT: v_min_u32_e32 v16, v0, v16
1344 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
1345 ; GFX6-NEXT: v_min_u32_e32 v16, v1, v17
1346 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16
1347 ; GFX6-NEXT: v_min_u32_e32 v16, v2, v18
1348 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16
1349 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19
1350 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16
1351 ; GFX6-NEXT: v_min_u32_e32 v16, v4, v20
1352 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16
1353 ; GFX6-NEXT: v_min_u32_e32 v16, v5, v21
1354 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16
1355 ; GFX6-NEXT: v_min_u32_e32 v16, v6, v22
1356 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16
1357 ; GFX6-NEXT: v_min_u32_e32 v16, v7, v23
1358 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16
1359 ; GFX6-NEXT: v_min_u32_e32 v16, v8, v24
1360 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16
1361 ; GFX6-NEXT: v_min_u32_e32 v16, v9, v25
1362 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16
1363 ; GFX6-NEXT: v_min_u32_e32 v16, v10, v26
1364 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16
1365 ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
1366 ; GFX6-NEXT: v_min_u32_e32 v17, v11, v27
1367 ; GFX6-NEXT: v_min_u32_e32 v18, v12, v28
1368 ; GFX6-NEXT: v_min_u32_e32 v19, v13, v29
1369 ; GFX6-NEXT: v_min_u32_e32 v20, v14, v30
1370 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17
1371 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v18
1372 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v19
1373 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v20
1374 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1375 ; GFX6-NEXT: v_min_u32_e32 v16, v15, v16
1376 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
1377 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1379 ; GFX8-LABEL: v_usubsat_v16i32:
1381 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1382 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
1383 ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
1384 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
1385 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
1386 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
1387 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
1388 ; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
1389 ; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
1390 ; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
1391 ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
1392 ; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
1393 ; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
1394 ; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
1395 ; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
1396 ; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
1397 ; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
1398 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1399 ; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
1400 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1402 ; GFX9-LABEL: v_usubsat_v16i32:
1404 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1405 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp
1406 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
1407 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp
1408 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp
1409 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp
1410 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp
1411 ; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp
1412 ; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp
1413 ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp
1414 ; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp
1415 ; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp
1416 ; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp
1417 ; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp
1418 ; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp
1419 ; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp
1420 ; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp
1421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1422 ; GFX9-NEXT: v_sub_u32_e64 v15, v15, v16 clamp
1423 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1425 ; GFX10-LABEL: v_usubsat_v16i32:
1427 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1428 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
1429 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
1430 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
1431 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
1432 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
1433 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
1434 ; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
1435 ; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
1436 ; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
1437 ; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
1438 ; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
1439 ; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
1440 ; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
1441 ; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
1442 ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
1443 ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
1444 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1445 ; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
1446 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1448 ; GFX11-LABEL: v_usubsat_v16i32:
1450 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1451 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
1452 ; GFX11-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
1453 ; GFX11-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
1454 ; GFX11-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
1455 ; GFX11-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
1456 ; GFX11-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
1457 ; GFX11-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
1458 ; GFX11-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
1459 ; GFX11-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
1460 ; GFX11-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
1461 ; GFX11-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
1462 ; GFX11-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
1463 ; GFX11-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
1464 ; GFX11-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
1465 ; GFX11-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
1466 ; GFX11-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
1467 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1468 ; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
1469 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1470 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1471 ret <16 x i32> %result
1474 define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
1475 ; GFX6-LABEL: s_usubsat_v16i32:
1477 ; GFX6-NEXT: s_min_u32 s16, s0, s16
1478 ; GFX6-NEXT: s_sub_i32 s0, s0, s16
1479 ; GFX6-NEXT: s_min_u32 s16, s1, s17
1480 ; GFX6-NEXT: s_sub_i32 s1, s1, s16
1481 ; GFX6-NEXT: s_min_u32 s16, s2, s18
1482 ; GFX6-NEXT: s_sub_i32 s2, s2, s16
1483 ; GFX6-NEXT: s_min_u32 s16, s3, s19
1484 ; GFX6-NEXT: s_sub_i32 s3, s3, s16
1485 ; GFX6-NEXT: s_min_u32 s16, s4, s20
1486 ; GFX6-NEXT: s_sub_i32 s4, s4, s16
1487 ; GFX6-NEXT: s_min_u32 s16, s5, s21
1488 ; GFX6-NEXT: s_sub_i32 s5, s5, s16
1489 ; GFX6-NEXT: s_min_u32 s16, s6, s22
1490 ; GFX6-NEXT: s_sub_i32 s6, s6, s16
1491 ; GFX6-NEXT: s_min_u32 s16, s7, s23
1492 ; GFX6-NEXT: s_sub_i32 s7, s7, s16
1493 ; GFX6-NEXT: s_min_u32 s16, s8, s24
1494 ; GFX6-NEXT: s_sub_i32 s8, s8, s16
1495 ; GFX6-NEXT: s_min_u32 s16, s9, s25
1496 ; GFX6-NEXT: s_sub_i32 s9, s9, s16
1497 ; GFX6-NEXT: s_min_u32 s16, s10, s26
1498 ; GFX6-NEXT: s_sub_i32 s10, s10, s16
1499 ; GFX6-NEXT: s_min_u32 s16, s11, s27
1500 ; GFX6-NEXT: s_sub_i32 s11, s11, s16
1501 ; GFX6-NEXT: s_min_u32 s16, s12, s28
1502 ; GFX6-NEXT: s_sub_i32 s12, s12, s16
1503 ; GFX6-NEXT: s_min_u32 s16, s13, s29
1504 ; GFX6-NEXT: s_sub_i32 s13, s13, s16
1505 ; GFX6-NEXT: s_min_u32 s16, s14, s30
1506 ; GFX6-NEXT: s_sub_i32 s14, s14, s16
1507 ; GFX6-NEXT: s_min_u32 s16, s15, s31
1508 ; GFX6-NEXT: s_sub_i32 s15, s15, s16
1509 ; GFX6-NEXT: ; return to shader part epilog
1511 ; GFX8-LABEL: s_usubsat_v16i32:
1513 ; GFX8-NEXT: v_mov_b32_e32 v0, s16
1514 ; GFX8-NEXT: v_mov_b32_e32 v1, s17
1515 ; GFX8-NEXT: v_mov_b32_e32 v2, s18
1516 ; GFX8-NEXT: v_mov_b32_e32 v3, s19
1517 ; GFX8-NEXT: v_mov_b32_e32 v4, s20
1518 ; GFX8-NEXT: v_mov_b32_e32 v5, s21
1519 ; GFX8-NEXT: v_mov_b32_e32 v6, s22
1520 ; GFX8-NEXT: v_mov_b32_e32 v7, s23
1521 ; GFX8-NEXT: v_mov_b32_e32 v8, s24
1522 ; GFX8-NEXT: v_mov_b32_e32 v9, s25
1523 ; GFX8-NEXT: v_mov_b32_e32 v10, s26
1524 ; GFX8-NEXT: v_mov_b32_e32 v11, s27
1525 ; GFX8-NEXT: v_mov_b32_e32 v12, s28
1526 ; GFX8-NEXT: v_mov_b32_e32 v13, s29
1527 ; GFX8-NEXT: v_mov_b32_e32 v14, s30
1528 ; GFX8-NEXT: v_mov_b32_e32 v15, s31
1529 ; GFX8-NEXT: v_sub_u32_e64 v0, s[32:33], s0, v0 clamp
1530 ; GFX8-NEXT: v_sub_u32_e64 v1, s[16:17], s1, v1 clamp
1531 ; GFX8-NEXT: v_sub_u32_e64 v2, s[16:17], s2, v2 clamp
1532 ; GFX8-NEXT: v_sub_u32_e64 v3, s[2:3], s3, v3 clamp
1533 ; GFX8-NEXT: v_sub_u32_e64 v4, s[2:3], s4, v4 clamp
1534 ; GFX8-NEXT: v_sub_u32_e64 v5, s[2:3], s5, v5 clamp
1535 ; GFX8-NEXT: v_sub_u32_e64 v6, s[2:3], s6, v6 clamp
1536 ; GFX8-NEXT: v_sub_u32_e64 v7, s[2:3], s7, v7 clamp
1537 ; GFX8-NEXT: v_sub_u32_e64 v8, s[2:3], s8, v8 clamp
1538 ; GFX8-NEXT: v_sub_u32_e64 v9, s[2:3], s9, v9 clamp
1539 ; GFX8-NEXT: v_sub_u32_e64 v10, s[2:3], s10, v10 clamp
1540 ; GFX8-NEXT: v_sub_u32_e64 v11, s[2:3], s11, v11 clamp
1541 ; GFX8-NEXT: v_sub_u32_e64 v12, s[2:3], s12, v12 clamp
1542 ; GFX8-NEXT: v_sub_u32_e64 v13, s[2:3], s13, v13 clamp
1543 ; GFX8-NEXT: v_sub_u32_e64 v14, s[2:3], s14, v14 clamp
1544 ; GFX8-NEXT: v_sub_u32_e64 v15, s[2:3], s15, v15 clamp
1545 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1546 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1547 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1548 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1549 ; GFX8-NEXT: v_readfirstlane_b32 s4, v4
1550 ; GFX8-NEXT: v_readfirstlane_b32 s5, v5
1551 ; GFX8-NEXT: v_readfirstlane_b32 s6, v6
1552 ; GFX8-NEXT: v_readfirstlane_b32 s7, v7
1553 ; GFX8-NEXT: v_readfirstlane_b32 s8, v8
1554 ; GFX8-NEXT: v_readfirstlane_b32 s9, v9
1555 ; GFX8-NEXT: v_readfirstlane_b32 s10, v10
1556 ; GFX8-NEXT: v_readfirstlane_b32 s11, v11
1557 ; GFX8-NEXT: v_readfirstlane_b32 s12, v12
1558 ; GFX8-NEXT: v_readfirstlane_b32 s13, v13
1559 ; GFX8-NEXT: v_readfirstlane_b32 s14, v14
1560 ; GFX8-NEXT: v_readfirstlane_b32 s15, v15
1561 ; GFX8-NEXT: ; return to shader part epilog
1563 ; GFX9-LABEL: s_usubsat_v16i32:
1565 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
1566 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
1567 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
1568 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
1569 ; GFX9-NEXT: v_mov_b32_e32 v4, s20
1570 ; GFX9-NEXT: v_mov_b32_e32 v5, s21
1571 ; GFX9-NEXT: v_mov_b32_e32 v6, s22
1572 ; GFX9-NEXT: v_mov_b32_e32 v7, s23
1573 ; GFX9-NEXT: v_mov_b32_e32 v8, s24
1574 ; GFX9-NEXT: v_mov_b32_e32 v9, s25
1575 ; GFX9-NEXT: v_mov_b32_e32 v10, s26
1576 ; GFX9-NEXT: v_mov_b32_e32 v11, s27
1577 ; GFX9-NEXT: v_mov_b32_e32 v12, s28
1578 ; GFX9-NEXT: v_mov_b32_e32 v13, s29
1579 ; GFX9-NEXT: v_mov_b32_e32 v14, s30
1580 ; GFX9-NEXT: v_mov_b32_e32 v15, s31
1581 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1582 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1583 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1584 ; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp
1585 ; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp
1586 ; GFX9-NEXT: v_sub_u32_e64 v5, s5, v5 clamp
1587 ; GFX9-NEXT: v_sub_u32_e64 v6, s6, v6 clamp
1588 ; GFX9-NEXT: v_sub_u32_e64 v7, s7, v7 clamp
1589 ; GFX9-NEXT: v_sub_u32_e64 v8, s8, v8 clamp
1590 ; GFX9-NEXT: v_sub_u32_e64 v9, s9, v9 clamp
1591 ; GFX9-NEXT: v_sub_u32_e64 v10, s10, v10 clamp
1592 ; GFX9-NEXT: v_sub_u32_e64 v11, s11, v11 clamp
1593 ; GFX9-NEXT: v_sub_u32_e64 v12, s12, v12 clamp
1594 ; GFX9-NEXT: v_sub_u32_e64 v13, s13, v13 clamp
1595 ; GFX9-NEXT: v_sub_u32_e64 v14, s14, v14 clamp
1596 ; GFX9-NEXT: v_sub_u32_e64 v15, s15, v15 clamp
1597 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1598 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1599 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1600 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1601 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1602 ; GFX9-NEXT: v_readfirstlane_b32 s5, v5
1603 ; GFX9-NEXT: v_readfirstlane_b32 s6, v6
1604 ; GFX9-NEXT: v_readfirstlane_b32 s7, v7
1605 ; GFX9-NEXT: v_readfirstlane_b32 s8, v8
1606 ; GFX9-NEXT: v_readfirstlane_b32 s9, v9
1607 ; GFX9-NEXT: v_readfirstlane_b32 s10, v10
1608 ; GFX9-NEXT: v_readfirstlane_b32 s11, v11
1609 ; GFX9-NEXT: v_readfirstlane_b32 s12, v12
1610 ; GFX9-NEXT: v_readfirstlane_b32 s13, v13
1611 ; GFX9-NEXT: v_readfirstlane_b32 s14, v14
1612 ; GFX9-NEXT: v_readfirstlane_b32 s15, v15
1613 ; GFX9-NEXT: ; return to shader part epilog
1615 ; GFX10PLUS-LABEL: s_usubsat_v16i32:
1616 ; GFX10PLUS: ; %bb.0:
1617 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp
1618 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp
1619 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp
1620 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp
1621 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, s4, s20 clamp
1622 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v5, s5, s21 clamp
1623 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v6, s6, s22 clamp
1624 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v7, s7, s23 clamp
1625 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v8, s8, s24 clamp
1626 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v9, s9, s25 clamp
1627 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v10, s10, s26 clamp
1628 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v11, s11, s27 clamp
1629 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v12, s12, s28 clamp
1630 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v13, s13, s29 clamp
1631 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v14, s14, s30 clamp
1632 ; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v15, s15, s31 clamp
1633 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1634 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1635 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1636 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1637 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
1638 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5
1639 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6
1640 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7
1641 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8
1642 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9
1643 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10
1644 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11
1645 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12
1646 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13
1647 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14
1648 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15
1649 ; GFX10PLUS-NEXT: ; return to shader part epilog
1650 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1651 ret <16 x i32> %result
; i16 usub.sat with both operands passed as plain (non-inreg) arguments to a
; function using the default calling convention.
; The GFX6 checks expect both operands shifted left by 16, a v_min_u32 of the
; two followed by a 32-bit subtract, and the result shifted back right by 16.
; The GFX8, GFX9 and GFX10PLUS checks expect one 16-bit subtract carrying the
; clamp modifier.
1654 define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
1655 ; GFX6-LABEL: v_usubsat_i16:
1657 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1658 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1659 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1660 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
1661 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1662 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1663 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1665 ; GFX8-LABEL: v_usubsat_i16:
1667 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1668 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
1669 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1671 ; GFX9-LABEL: v_usubsat_i16:
1673 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1674 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
1675 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1677 ; GFX10PLUS-LABEL: v_usubsat_i16:
1678 ; GFX10PLUS: ; %bb.0:
1679 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1680 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
1681 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1682 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
; i16 usub.sat in an amdgpu_ps function where both operands are marked inreg
; (the checked code reads them from s0 and s1).
; The GFX6 checks expect an all-scalar sequence (s_lshl_b32 on both operands,
; s_min_u32, s_sub_i32, s_lshr_b32).
; The GFX8 and GFX9 checks expect the rhs copied from s1 into v0 before a
; clamped v_sub_u16, whereas the GFX10PLUS checks expect s0 and s1 used
; directly as sources. All three then move the result back into s0 with
; v_readfirstlane_b32.
1686 define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
1687 ; GFX6-LABEL: s_usubsat_i16:
1689 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1690 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
1691 ; GFX6-NEXT: s_min_u32 s1, s0, s1
1692 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
1693 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
1694 ; GFX6-NEXT: ; return to shader part epilog
1696 ; GFX8-LABEL: s_usubsat_i16:
1698 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
1699 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1700 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1701 ; GFX8-NEXT: ; return to shader part epilog
1703 ; GFX9-LABEL: s_usubsat_i16:
1705 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1706 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1707 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1708 ; GFX9-NEXT: ; return to shader part epilog
1710 ; GFX10PLUS-LABEL: s_usubsat_i16:
1711 ; GFX10PLUS: ; %bb.0:
1712 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
1713 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1714 ; GFX10PLUS-NEXT: ; return to shader part epilog
1715 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
; i16 usub.sat in an amdgpu_ps function with an inreg lhs (read from s0) and a
; plain rhs (read from v0). The i16 result is bitcast to half.
; The GFX6 checks expect both operands shifted left by 16, v_min_u32 of the
; two, a subtract from s0, and a shift back right by 16.
; The GFX8, GFX9 and GFX10PLUS checks expect one clamped 16-bit subtract that
; takes s0 as its first source, with no v_readfirstlane_b32 afterwards.
1719 define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
1720 ; GFX6-LABEL: usubsat_i16_sv:
1722 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1723 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1724 ; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
1725 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
1726 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1727 ; GFX6-NEXT: ; return to shader part epilog
1729 ; GFX8-LABEL: usubsat_i16_sv:
1731 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1732 ; GFX8-NEXT: ; return to shader part epilog
1734 ; GFX9-LABEL: usubsat_i16_sv:
1736 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1737 ; GFX9-NEXT: ; return to shader part epilog
1739 ; GFX10PLUS-LABEL: usubsat_i16_sv:
1740 ; GFX10PLUS: ; %bb.0:
1741 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
1742 ; GFX10PLUS-NEXT: ; return to shader part epilog
1743 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1744 %cast = bitcast i16 %result to half
; i16 usub.sat in an amdgpu_ps function with a plain lhs (read from v0) and an
; inreg rhs (read from s0). The i16 result is bitcast to half.
; The GFX6 checks expect both operands shifted left by 16, v_min_u32 of the
; two placed in v1, v1 subtracted from v0, and a shift back right by 16.
; The GFX8, GFX9 and GFX10PLUS checks expect one clamped 16-bit subtract that
; takes s0 as its second source.
1748 define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
1749 ; GFX6-LABEL: usubsat_i16_vs:
1751 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1752 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1753 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v0
1754 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1755 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1756 ; GFX6-NEXT: ; return to shader part epilog
1758 ; GFX8-LABEL: usubsat_i16_vs:
1760 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, s0 clamp
1761 ; GFX8-NEXT: ; return to shader part epilog
1763 ; GFX9-LABEL: usubsat_i16_vs:
1765 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp
1766 ; GFX9-NEXT: ; return to shader part epilog
1768 ; GFX10PLUS-LABEL: usubsat_i16_vs:
1769 ; GFX10PLUS: ; %bb.0:
1770 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
1771 ; GFX10PLUS-NEXT: ; return to shader part epilog
1772 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1773 %cast = bitcast i16 %result to half
; <2 x i16> usub.sat with both operands passed as plain (non-inreg) arguments
; to a function using the default calling convention.
; The GFX6 checks expect each element handled in its own register (lhs in v0
; and v1, rhs in v2 and v3) with the shift left by 16, v_min_u32, subtract,
; shift right by 16 sequence applied per element.
; The GFX8 checks expect the low half computed by v_sub_u16_e64 and the high
; half by the SDWA form selecting WORD_1, combined with v_or_b32.
; The GFX9 and GFX10PLUS checks expect a single clamped v_pk_sub_u16.
1777 define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
1778 ; GFX6-LABEL: v_usubsat_v2i16:
1780 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1781 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1782 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1783 ; GFX6-NEXT: v_min_u32_e32 v2, v0, v2
1784 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
1785 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1786 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1787 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
1788 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
1789 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1790 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1791 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1793 ; GFX8-LABEL: v_usubsat_v2i16:
1795 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1796 ; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp
1797 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1798 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
1799 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1801 ; GFX9-LABEL: v_usubsat_v2i16:
1803 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1804 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
1805 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1807 ; GFX10PLUS-LABEL: v_usubsat_v2i16:
1808 ; GFX10PLUS: ; %bb.0:
1809 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1810 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
1811 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1812 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1813 ret <2 x i16> %result
; <2 x i16> usub.sat in an amdgpu_ps function with both operands marked inreg.
; The vector result is bitcast to i32.
; The GFX6 checks expect per-element scalar code (lhs in s0 and s1, rhs in s2
; and s3), then v_alignbit_b32 to pack the two halves and v_readfirstlane_b32
; to return the packed value in s0.
; The GFX8 checks expect the high halves extracted with s_lshr_b32 by 16, the
; needed operands copied into v0..v2, a v_sub_u16_e64 plus an SDWA subtract
; writing WORD_1, v_or_b32, and v_readfirstlane_b32.
; The GFX9 checks expect the rhs copied from s1 into v0 ahead of a clamped
; v_pk_sub_u16, and the GFX10PLUS checks expect s0 and s1 used directly. Both
; finish with v_readfirstlane_b32.
1816 define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
1817 ; GFX6-LABEL: s_usubsat_v2i16:
1819 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1820 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
1821 ; GFX6-NEXT: s_min_u32 s2, s0, s2
1822 ; GFX6-NEXT: s_sub_i32 s0, s0, s2
1823 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
1824 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16
1825 ; GFX6-NEXT: s_min_u32 s2, s1, s2
1826 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
1827 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
1828 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
1829 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
1830 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1831 ; GFX6-NEXT: ; return to shader part epilog
1833 ; GFX8-LABEL: s_usubsat_v2i16:
1835 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
1836 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
1837 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
1838 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1839 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
1840 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1841 ; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1842 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
1843 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1844 ; GFX8-NEXT: ; return to shader part epilog
1846 ; GFX9-LABEL: s_usubsat_v2i16:
1848 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1849 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
1850 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1851 ; GFX9-NEXT: ; return to shader part epilog
1853 ; GFX10PLUS-LABEL: s_usubsat_v2i16:
1854 ; GFX10PLUS: ; %bb.0:
1855 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
1856 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1857 ; GFX10PLUS-NEXT: ; return to shader part epilog
1858 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1859 %cast = bitcast <2 x i16> %result to i32
; <2 x i16> usub.sat in an amdgpu_ps function with an inreg lhs and a plain
; rhs. The vector result is bitcast to float.
; The GFX6 checks expect per-element code (lhs elements in s0 and s1, rhs
; elements in v0 and v1) followed by v_alignbit_b32 to pack both halves.
; The GFX8 checks expect the lhs high half extracted with s_lshr_b32 and
; copied into v2, a v_sub_u16_e64 for the low half, an SDWA subtract reading
; WORD_1 of the rhs for the high half, and v_or_b32 to merge them.
; The GFX9 and GFX10PLUS checks expect one clamped v_pk_sub_u16 with s0 as the
; first source.
1863 define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
1864 ; GFX6-LABEL: usubsat_v2i16_sv:
1866 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1867 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1868 ; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
1869 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
1870 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
1871 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1872 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v1
1873 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
1874 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1875 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
1876 ; GFX6-NEXT: ; return to shader part epilog
1878 ; GFX8-LABEL: usubsat_v2i16_sv:
1880 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
1881 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1882 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp
1883 ; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1884 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
1885 ; GFX8-NEXT: ; return to shader part epilog
1887 ; GFX9-LABEL: usubsat_v2i16_sv:
1889 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
1890 ; GFX9-NEXT: ; return to shader part epilog
1892 ; GFX10PLUS-LABEL: usubsat_v2i16_sv:
1893 ; GFX10PLUS: ; %bb.0:
1894 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
1895 ; GFX10PLUS-NEXT: ; return to shader part epilog
1896 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1897 %cast = bitcast <2 x i16> %result to float
; <2 x i16> usub.sat in an amdgpu_ps function with a plain lhs and an inreg
; rhs. The vector result is bitcast to float.
; The GFX6 checks expect per-element code (lhs elements in v0 and v1, rhs
; elements in s0 and s1) followed by v_alignbit_b32 to pack both halves.
; The GFX8 checks expect the rhs high half extracted with s_lshr_b32 and
; copied into v2, a v_sub_u16_e64 for the low half, an SDWA subtract reading
; WORD_1 of the lhs for the high half, and v_or_b32 to merge them.
; The GFX9 and GFX10PLUS checks expect one clamped v_pk_sub_u16 with s0 as the
; second source.
1901 define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
1902 ; GFX6-LABEL: usubsat_v2i16_vs:
1904 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1905 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1906 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v0
1907 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1908 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
1909 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
1910 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v1
1911 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
1912 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1913 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
1914 ; GFX6-NEXT: ; return to shader part epilog
1916 ; GFX8-LABEL: usubsat_v2i16_vs:
1918 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
1919 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1920 ; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp
1921 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1922 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
1923 ; GFX8-NEXT: ; return to shader part epilog
1925 ; GFX9-LABEL: usubsat_v2i16_vs:
1927 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 clamp
1928 ; GFX9-NEXT: ; return to shader part epilog
1930 ; GFX10PLUS-LABEL: usubsat_v2i16_vs:
1931 ; GFX10PLUS: ; %bb.0:
1932 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, s0 clamp
1933 ; GFX10PLUS-NEXT: ; return to shader part epilog
1934 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1935 %cast = bitcast <2 x i16> %result to float
1939 ; FIXME: The v3i16 tests below are disabled pending v3i16 insert/extract handling.
1940 ; define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
1941 ; %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1942 ; ret <3 x i16> %result
1945 ; define amdgpu_ps <3 x i16> @s_usubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
1946 ; %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1947 ; ret <3 x i16> %result
; <4 x i16> usub.sat with both operands passed as plain (non-inreg) arguments
; to a function using the default calling convention. The vector result is
; bitcast to <2 x float>.
; The GFX6 checks expect each of the four elements handled in its own register
; (lhs in v0..v3, rhs in v4..v7) with the shift, v_min_u32, subtract sequence,
; then two v_alignbit_b32 to pack the results into v0 and v1.
; The GFX8 checks expect, per 32-bit register pair, a v_sub_u16_e64 for the
; low half and an SDWA subtract on WORD_1 for the high half, merged by
; v_or_b32.
; The GFX9 and GFX10PLUS checks expect two clamped v_pk_sub_u16.
1950 define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
1951 ; GFX6-LABEL: v_usubsat_v4i16:
1953 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1954 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1955 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1956 ; GFX6-NEXT: v_min_u32_e32 v4, v0, v4
1957 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
1958 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1959 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
1960 ; GFX6-NEXT: v_min_u32_e32 v4, v1, v4
1961 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
1962 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1963 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
1964 ; GFX6-NEXT: v_min_u32_e32 v4, v2, v4
1965 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
1966 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1967 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
1968 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4
1969 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
1970 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1971 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1972 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
1973 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
1974 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1976 ; GFX8-LABEL: v_usubsat_v4i16:
1978 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1979 ; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp
1980 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1981 ; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp
1982 ; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1983 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
1984 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
1985 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1987 ; GFX9-LABEL: v_usubsat_v4i16:
1989 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1990 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
1991 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
1992 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1994 ; GFX10PLUS-LABEL: v_usubsat_v4i16:
1995 ; GFX10PLUS: ; %bb.0:
1996 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1997 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
1998 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
1999 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2000 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2001 %cast = bitcast <4 x i16> %result to <2 x float>
2002 ret <2 x float> %cast
; <4 x i16> usub.sat in an amdgpu_ps function with both operands marked inreg.
; The vector result is bitcast to <2 x i32>.
; The GFX6 checks expect per-element scalar code (lhs in s0..s3, rhs in
; s4..s7), two v_alignbit_b32 to pack the halves, and two
; v_readfirstlane_b32 to return the packed values in s0 and s1.
; The GFX8 checks expect the high halves extracted with s_lshr_b32 by 16,
; operands copied into VGPRs, v_sub_u16_e64 plus SDWA subtracts writing
; WORD_1, v_or_b32 merges, and two v_readfirstlane_b32.
; The GFX9 checks expect the rhs copied from s2 and s3 into v0 and v1 ahead of
; two clamped v_pk_sub_u16, and the GFX10PLUS checks expect the SGPRs used
; directly. Both finish with two v_readfirstlane_b32.
2005 define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
2006 ; GFX6-LABEL: s_usubsat_v4i16:
2008 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2009 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2010 ; GFX6-NEXT: s_min_u32 s4, s0, s4
2011 ; GFX6-NEXT: s_sub_i32 s0, s0, s4
2012 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2013 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16
2014 ; GFX6-NEXT: s_min_u32 s4, s1, s4
2015 ; GFX6-NEXT: s_sub_i32 s1, s1, s4
2016 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2017 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16
2018 ; GFX6-NEXT: s_min_u32 s4, s2, s4
2019 ; GFX6-NEXT: s_sub_i32 s2, s2, s4
2020 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2021 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16
2022 ; GFX6-NEXT: s_min_u32 s4, s3, s4
2023 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
2024 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2025 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2026 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2027 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2028 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2029 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2030 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2031 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2032 ; GFX6-NEXT: ; return to shader part epilog
2034 ; GFX8-LABEL: s_usubsat_v4i16:
2036 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16
2037 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16
2038 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
2039 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16
2040 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
2041 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2042 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
2043 ; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2044 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
2045 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
2046 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
2047 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
2048 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
2049 ; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2050 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2051 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
2052 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2053 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2054 ; GFX8-NEXT: ; return to shader part epilog
2056 ; GFX9-LABEL: s_usubsat_v4i16:
2058 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2059 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2060 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
2061 ; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp
2062 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2063 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2064 ; GFX9-NEXT: ; return to shader part epilog
2066 ; GFX10PLUS-LABEL: s_usubsat_v4i16:
2067 ; GFX10PLUS: ; %bb.0:
2068 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s2 clamp
2069 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, s1, s3 clamp
2070 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2071 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2072 ; GFX10PLUS-NEXT: ; return to shader part epilog
2073 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2074 %cast = bitcast <4 x i16> %result to <2 x i32>
2079 ; define <5 x i16> @v_usubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
2080 ; %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2081 ; ret <5 x i16> %result
2084 ; define amdgpu_ps <5 x i16> @s_usubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
2085 ; %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2086 ; ret <5 x i16> %result
; <6 x i16> usub.sat with both operands passed as plain (non-inreg) arguments
; to a function using the default calling convention. The vector result is
; bitcast to <3 x float>.
; The GFX6 checks expect each of the six elements handled in its own register
; (lhs in v0..v5, rhs in v6..v11) with the shift, v_min_u32, subtract
; sequence, then three v_alignbit_b32 to pack the results into v0..v2.
; The GFX8 checks expect, per 32-bit register pair, a v_sub_u16_e64 for the
; low half and an SDWA subtract on WORD_1 for the high half, merged by
; v_or_b32.
; The GFX9 and GFX10PLUS checks expect three clamped v_pk_sub_u16.
2089 define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
2090 ; GFX6-LABEL: v_usubsat_v6i16:
2092 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2093 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2094 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2095 ; GFX6-NEXT: v_min_u32_e32 v6, v0, v6
2096 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
2097 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2098 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7
2099 ; GFX6-NEXT: v_min_u32_e32 v6, v1, v6
2100 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
2101 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2102 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8
2103 ; GFX6-NEXT: v_min_u32_e32 v6, v2, v6
2104 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
2105 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2106 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9
2107 ; GFX6-NEXT: v_min_u32_e32 v6, v3, v6
2108 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
2109 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2110 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10
2111 ; GFX6-NEXT: v_min_u32_e32 v6, v4, v6
2112 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
2113 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2114 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11
2115 ; GFX6-NEXT: v_min_u32_e32 v6, v5, v6
2116 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6
2117 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2118 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2119 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2120 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2121 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
2122 ; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16
2123 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2125 ; GFX8-LABEL: v_usubsat_v6i16:
2127 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128 ; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp
2129 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2130 ; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp
2131 ; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2132 ; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp
2133 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2134 ; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
2135 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
2136 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
2137 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2139 ; GFX9-LABEL: v_usubsat_v6i16:
2141 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2142 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v3 clamp
2143 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v4 clamp
2144 ; GFX9-NEXT: v_pk_sub_u16 v2, v2, v5 clamp
2145 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2147 ; GFX10PLUS-LABEL: v_usubsat_v6i16:
2148 ; GFX10PLUS: ; %bb.0:
2149 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2150 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v3 clamp
2151 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v4 clamp
2152 ; GFX10PLUS-NEXT: v_pk_sub_u16 v2, v2, v5 clamp
2153 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2154 %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2155 %cast = bitcast <6 x i16> %result to <3 x float>
2156 ret <3 x float> %cast
; <6 x i16> usub.sat in an amdgpu_ps function with both operands marked inreg.
; The vector result is bitcast to <3 x i32>.
; The GFX6 checks expect per-element scalar code (lhs in s0..s5, rhs in
; s6..s11), three v_alignbit_b32 to pack the halves, and three
; v_readfirstlane_b32 to return the packed values in s0..s2.
; The GFX8 checks expect the high halves extracted with s_lshr_b32 by 16,
; operands copied into VGPRs, v_sub_u16_e64 plus SDWA subtracts writing
; WORD_1, v_or_b32 merges, and three v_readfirstlane_b32.
; The GFX9 checks expect the rhs copied from s3..s5 into v0..v2 ahead of three
; clamped v_pk_sub_u16, and the GFX10PLUS checks expect the SGPRs used
; directly. Both finish with three v_readfirstlane_b32.
2159 define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
2160 ; GFX6-LABEL: s_usubsat_v6i16:
2162 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2163 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
2164 ; GFX6-NEXT: s_min_u32 s6, s0, s6
2165 ; GFX6-NEXT: s_sub_i32 s0, s0, s6
2166 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2167 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16
2168 ; GFX6-NEXT: s_min_u32 s6, s1, s6
2169 ; GFX6-NEXT: s_sub_i32 s1, s1, s6
2170 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2171 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16
2172 ; GFX6-NEXT: s_min_u32 s6, s2, s6
2173 ; GFX6-NEXT: s_sub_i32 s2, s2, s6
2174 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2175 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16
2176 ; GFX6-NEXT: s_min_u32 s6, s3, s6
2177 ; GFX6-NEXT: s_sub_i32 s3, s3, s6
2178 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2179 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16
2180 ; GFX6-NEXT: s_min_u32 s6, s4, s6
2181 ; GFX6-NEXT: s_sub_i32 s4, s4, s6
2182 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
2183 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16
2184 ; GFX6-NEXT: s_min_u32 s6, s5, s6
2185 ; GFX6-NEXT: s_sub_i32 s5, s5, s6
2186 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2187 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2188 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16
2189 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2190 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2191 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2192 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2193 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2194 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
2195 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2196 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2197 ; GFX6-NEXT: v_readfirstlane_b32 s2, v2
2198 ; GFX6-NEXT: ; return to shader part epilog
2200 ; GFX8-LABEL: s_usubsat_v6i16:
2202 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16
2203 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16
2204 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16
2205 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16
2206 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16
2207 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16
2208 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
2209 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
2210 ; GFX8-NEXT: v_mov_b32_e32 v3, s10
2211 ; GFX8-NEXT: v_mov_b32_e32 v4, s7
2212 ; GFX8-NEXT: v_mov_b32_e32 v0, s3
2213 ; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2214 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2215 ; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2216 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
2217 ; GFX8-NEXT: v_mov_b32_e32 v5, s11
2218 ; GFX8-NEXT: v_mov_b32_e32 v6, s8
2219 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
2220 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
2221 ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
2222 ; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2223 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2224 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
2225 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
2226 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2227 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2228 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
2229 ; GFX8-NEXT: ; return to shader part epilog
2231 ; GFX9-LABEL: s_usubsat_v6i16:
2233 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
2234 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2235 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
2236 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
2237 ; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp
2238 ; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp
2239 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2240 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2241 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2242 ; GFX9-NEXT: ; return to shader part epilog
2244 ; GFX10PLUS-LABEL: s_usubsat_v6i16:
2245 ; GFX10PLUS: ; %bb.0:
2246 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s3 clamp
2247 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, s1, s4 clamp
2248 ; GFX10PLUS-NEXT: v_pk_sub_u16 v2, s2, s5 clamp
2249 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2250 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2251 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
2252 ; GFX10PLUS-NEXT: ; return to shader part epilog
2253 %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2254 %cast = bitcast <6 x i16> %result to <3 x i32>
; <8 x i16> usub.sat with both operands passed as plain (non-inreg) arguments
; to a function using the default calling convention. The vector result is
; bitcast to <4 x float>.
; The GFX6 checks expect each of the eight elements handled in its own
; register (lhs in v0..v7, rhs in v8..v15) with the shift, v_min_u32, subtract
; sequence, then four v_alignbit_b32 to pack the results into v0..v3.
; The GFX8 checks expect, per 32-bit register pair, a v_sub_u16_e64 for the
; low half and an SDWA subtract on WORD_1 for the high half, merged by
; v_or_b32.
; The GFX9 and GFX10PLUS checks expect four clamped v_pk_sub_u16.
2258 define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
2259 ; GFX6-LABEL: v_usubsat_v8i16:
2261 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2262 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2263 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
2264 ; GFX6-NEXT: v_min_u32_e32 v8, v0, v8
2265 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
2266 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2267 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9
2268 ; GFX6-NEXT: v_min_u32_e32 v8, v1, v8
2269 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
2270 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2271 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10
2272 ; GFX6-NEXT: v_min_u32_e32 v8, v2, v8
2273 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
2274 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2275 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11
2276 ; GFX6-NEXT: v_min_u32_e32 v8, v3, v8
2277 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8
2278 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2279 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12
2280 ; GFX6-NEXT: v_min_u32_e32 v8, v4, v8
2281 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
2282 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2283 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13
2284 ; GFX6-NEXT: v_min_u32_e32 v8, v5, v8
2285 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
2286 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2287 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14
2288 ; GFX6-NEXT: v_min_u32_e32 v8, v6, v8
2289 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
2290 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
2291 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15
2292 ; GFX6-NEXT: v_min_u32_e32 v8, v7, v8
2293 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8
2294 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2295 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2296 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2297 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2298 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2299 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
2300 ; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16
2301 ; GFX6-NEXT: v_alignbit_b32 v3, v7, v6, 16
2302 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2304 ; GFX8-LABEL: v_usubsat_v8i16:
2306 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2307 ; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp
2308 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2309 ; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp
2310 ; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2311 ; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp
2312 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2313 ; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp
2314 ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2315 ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
2316 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
2317 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
2318 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
2319 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2321 ; GFX9-LABEL: v_usubsat_v8i16:
2323 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2324 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v4 clamp
2325 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v5 clamp
2326 ; GFX9-NEXT: v_pk_sub_u16 v2, v2, v6 clamp
2327 ; GFX9-NEXT: v_pk_sub_u16 v3, v3, v7 clamp
2328 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2330 ; GFX10PLUS-LABEL: v_usubsat_v8i16:
2331 ; GFX10PLUS: ; %bb.0:
2332 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2333 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v4 clamp
2334 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v5 clamp
2335 ; GFX10PLUS-NEXT: v_pk_sub_u16 v2, v2, v6 clamp
2336 ; GFX10PLUS-NEXT: v_pk_sub_u16 v3, v3, v7 clamp
2337 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2338 %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2339 %cast = bitcast <8 x i16> %result to <4 x float>
2340 ret <4 x float> %cast
; Unsigned saturating subtract of two <8 x i16> vectors in an amdgpu_ps
; function with both operands marked inreg; the checks read the operands from
; SGPRs. The IR result is bitcast to <4 x i32>.
; Expected code per the checks below:
;  - GFX6 shifts each element left by 16, takes s_min_u32 of the two operands
;    and subtracts that minimum, then repacks pairs with v_alignbit_b32.
;  - GFX8 uses clamped v_sub_u16 (SDWA form for the high halves) and ORs the
;    halves together.
;  - GFX9 and GFX10PLUS use v_pk_sub_u16 with clamp.
; All targets finish by copying the packed results back with
; v_readfirstlane_b32.
2343 define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
2344 ; GFX6-LABEL: s_usubsat_v8i16:
2346 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2347 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16
2348 ; GFX6-NEXT: s_min_u32 s8, s0, s8
2349 ; GFX6-NEXT: s_sub_i32 s0, s0, s8
2350 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2351 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16
2352 ; GFX6-NEXT: s_min_u32 s8, s1, s8
2353 ; GFX6-NEXT: s_sub_i32 s1, s1, s8
2354 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2355 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16
2356 ; GFX6-NEXT: s_min_u32 s8, s2, s8
2357 ; GFX6-NEXT: s_sub_i32 s2, s2, s8
2358 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2359 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16
2360 ; GFX6-NEXT: s_min_u32 s8, s3, s8
2361 ; GFX6-NEXT: s_sub_i32 s3, s3, s8
2362 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2363 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16
2364 ; GFX6-NEXT: s_min_u32 s8, s4, s8
2365 ; GFX6-NEXT: s_sub_i32 s4, s4, s8
2366 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
2367 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16
2368 ; GFX6-NEXT: s_min_u32 s8, s5, s8
2369 ; GFX6-NEXT: s_sub_i32 s5, s5, s8
2370 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
2371 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16
2372 ; GFX6-NEXT: s_min_u32 s8, s6, s8
2373 ; GFX6-NEXT: s_sub_i32 s6, s6, s8
2374 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16
2375 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16
2376 ; GFX6-NEXT: s_min_u32 s8, s7, s8
2377 ; GFX6-NEXT: s_sub_i32 s7, s7, s8
2378 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2379 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2380 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16
2381 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16
2382 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2383 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2384 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2385 ; GFX6-NEXT: v_mov_b32_e32 v3, s6
2386 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2387 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2388 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
2389 ; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16
2390 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2391 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2392 ; GFX6-NEXT: v_readfirstlane_b32 s2, v2
2393 ; GFX6-NEXT: v_readfirstlane_b32 s3, v3
2394 ; GFX6-NEXT: ; return to shader part epilog
2396 ; GFX8-LABEL: s_usubsat_v8i16:
2398 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16
2399 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16
2400 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16
2401 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16
2402 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16
2403 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16
2404 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16
2405 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16
2406 ; GFX8-NEXT: v_mov_b32_e32 v1, s12
2407 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
2408 ; GFX8-NEXT: v_mov_b32_e32 v3, s13
2409 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
2410 ; GFX8-NEXT: v_mov_b32_e32 v5, s14
2411 ; GFX8-NEXT: v_mov_b32_e32 v6, s10
2412 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2413 ; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2414 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
2415 ; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2416 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
2417 ; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2418 ; GFX8-NEXT: v_mov_b32_e32 v6, s7
2419 ; GFX8-NEXT: v_mov_b32_e32 v7, s15
2420 ; GFX8-NEXT: v_mov_b32_e32 v8, s11
2421 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
2422 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
2423 ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
2424 ; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp
2425 ; GFX8-NEXT: v_sub_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2426 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2427 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
2428 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
2429 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v7
2430 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2431 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2432 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
2433 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
2434 ; GFX8-NEXT: ; return to shader part epilog
2436 ; GFX9-LABEL: s_usubsat_v8i16:
2438 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2439 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2440 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
2441 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
2442 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
2443 ; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp
2444 ; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp
2445 ; GFX9-NEXT: v_pk_sub_u16 v3, s3, v3 clamp
2446 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2447 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2448 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2449 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
2450 ; GFX9-NEXT: ; return to shader part epilog
2452 ; GFX10PLUS-LABEL: s_usubsat_v8i16:
2453 ; GFX10PLUS: ; %bb.0:
2454 ; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s4 clamp
2455 ; GFX10PLUS-NEXT: v_pk_sub_u16 v1, s1, s5 clamp
2456 ; GFX10PLUS-NEXT: v_pk_sub_u16 v2, s2, s6 clamp
2457 ; GFX10PLUS-NEXT: v_pk_sub_u16 v3, s3, s7 clamp
2458 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2459 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2460 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
2461 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
2462 ; GFX10PLUS-NEXT: ; return to shader part epilog
2463 %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2464 %cast = bitcast <8 x i16> %result to <4 x i32>
; i48 unsigned saturating subtract with the default calling convention; the
; checks subtract v[2:3] from v[0:1].
; Expected code per the checks below:
;  - GFX6 masks both high dwords to 16 bits, does a 32-bit sub plus subb, and
;    selects 0 when the high result dword differs from its own low 16 bits.
;  - GFX8 and newer shift both operands left by 16, do a 64-bit subtract with
;    borrow, select 0 using the final carry-out, then shift right by 16.
2468 define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
2469 ; GFX6-LABEL: v_usubsat_i48:
2471 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2472 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2473 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
2474 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
2475 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
2476 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2477 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
2478 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2479 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2480 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2481 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2482 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2483 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2484 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2486 ; GFX8-LABEL: v_usubsat_i48:
2488 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2489 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2490 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
2491 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
2492 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
2493 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2494 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2495 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2496 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2498 ; GFX9-LABEL: v_usubsat_i48:
2500 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2501 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2502 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
2503 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
2504 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
2505 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2506 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2507 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2508 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2510 ; GFX10PLUS-LABEL: v_usubsat_i48:
2511 ; GFX10PLUS: ; %bb.0:
2512 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2513 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2514 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
2515 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
2516 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2517 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2518 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2519 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2520 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2521 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
; i48 unsigned saturating subtract in an amdgpu_ps function with both operands
; inreg; the checks subtract s[2:3] from s[0:1].
; Expected code per the checks below:
;  - GFX6 masks the high dwords to 16 bits, subtracts with s_sub_u32 and
;    s_subb_u32, and s_cselect_b64 picks 0 when the high result dword differs
;    from its own low 16 bits.
;  - GFX8 and newer shift both operands left by 16, subtract, pick 0 with
;    s_cselect_b64 right after the s_subb_u32, then shift right by 16.
2525 define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
2526 ; GFX6-LABEL: s_usubsat_i48:
2528 ; GFX6-NEXT: s_sub_u32 s0, s0, s2
2529 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0
2530 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2531 ; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
2532 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0
2533 ; GFX6-NEXT: s_subb_u32 s2, s1, s3
2534 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
2535 ; GFX6-NEXT: s_cmp_lg_u32 s2, s1
2536 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0
2537 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16
2538 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
2539 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2540 ; GFX6-NEXT: s_or_b32 s0, s0, s3
2541 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0
2542 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2543 ; GFX6-NEXT: ; return to shader part epilog
2545 ; GFX8-LABEL: s_usubsat_i48:
2547 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2548 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
2549 ; GFX8-NEXT: s_sub_u32 s0, s0, s2
2550 ; GFX8-NEXT: s_subb_u32 s1, s1, s3
2551 ; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2552 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
2553 ; GFX8-NEXT: ; return to shader part epilog
2555 ; GFX9-LABEL: s_usubsat_i48:
2557 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2558 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
2559 ; GFX9-NEXT: s_sub_u32 s0, s0, s2
2560 ; GFX9-NEXT: s_subb_u32 s1, s1, s3
2561 ; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2562 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
2563 ; GFX9-NEXT: ; return to shader part epilog
2565 ; GFX10PLUS-LABEL: s_usubsat_i48:
2566 ; GFX10PLUS: ; %bb.0:
2567 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2568 ; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
2569 ; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s2
2570 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3
2571 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2572 ; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
2573 ; GFX10PLUS-NEXT: ; return to shader part epilog
2574 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
; i48 unsigned saturating subtract with an inreg %lhs and a non-inreg %rhs
; (the checks subtract v[0:1] from s[0:1]). The result is zero-extended to i64
; and bitcast to <2 x float>.
; GFX6, GFX8 and GFX9 copy s1 into v2 before the subtract-with-borrow, while
; the GFX10PLUS checks use s1 directly as an operand.
2578 define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
2579 ; GFX6-LABEL: usubsat_i48_sv:
2581 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2582 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2583 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2584 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
2585 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
2586 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2587 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
2588 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2589 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2590 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2591 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2592 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2593 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2594 ; GFX6-NEXT: ; return to shader part epilog
2596 ; GFX8-LABEL: usubsat_i48_sv:
2598 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2599 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2600 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2601 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
2602 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
2603 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2604 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2605 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2606 ; GFX8-NEXT: ; return to shader part epilog
2608 ; GFX9-LABEL: usubsat_i48_sv:
2610 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2611 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2612 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2613 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
2614 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2615 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2616 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2617 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2618 ; GFX9-NEXT: ; return to shader part epilog
2620 ; GFX10PLUS-LABEL: usubsat_i48_sv:
2621 ; GFX10PLUS: ; %bb.0:
2622 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2623 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2624 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
2625 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2626 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2627 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2628 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2629 ; GFX10PLUS-NEXT: ; return to shader part epilog
2630 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2631 %ext.result = zext i48 %result to i64
2632 %cast = bitcast i64 %ext.result to <2 x float>
2633 ret <2 x float> %cast
; i48 unsigned saturating subtract with a non-inreg %lhs and an inreg %rhs
; (the checks subtract s[0:1] from v[0:1]). The result is zero-extended to i64
; and bitcast to <2 x float>.
; With the SGPR value as the subtrahend, the checks expect a v_subrev form for
; the low dword on GFX6, GFX8 and GFX9, and v_subrev_co_ci_u32 for the high
; dword on GFX10PLUS.
2636 define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
2637 ; GFX6-LABEL: usubsat_i48_vs:
2639 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2640 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2641 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2642 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
2643 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
2644 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2645 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
2646 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2647 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2648 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2649 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2650 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2651 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2652 ; GFX6-NEXT: ; return to shader part epilog
2654 ; GFX8-LABEL: usubsat_i48_vs:
2656 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2657 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2658 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2659 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
2660 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
2661 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2662 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2663 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2664 ; GFX8-NEXT: ; return to shader part epilog
2666 ; GFX9-LABEL: usubsat_i48_vs:
2668 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2669 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2670 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2671 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
2672 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
2673 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2674 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2675 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2676 ; GFX9-NEXT: ; return to shader part epilog
2678 ; GFX10PLUS-LABEL: usubsat_i48_vs:
2679 ; GFX10PLUS: ; %bb.0:
2680 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2681 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2682 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
2683 ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2684 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2685 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2686 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2687 ; GFX10PLUS-NEXT: ; return to shader part epilog
2688 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2689 %ext.result = zext i48 %result to i64
2690 %cast = bitcast i64 %ext.result to <2 x float>
2691 ret <2 x float> %cast
; i64 unsigned saturating subtract with the default calling convention; the
; checks subtract v[2:3] from v[0:1].
; Every check prefix expects a low-dword subtract, a high-dword
; subtract-with-borrow, and two v_cndmask_b32 that select 0 on the final
; carry-out.
2694 define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
2695 ; GFX6-LABEL: v_usubsat_i64:
2697 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2698 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
2699 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
2700 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2701 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2702 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2704 ; GFX8-LABEL: v_usubsat_i64:
2706 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2707 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
2708 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
2709 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2710 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2711 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2713 ; GFX9-LABEL: v_usubsat_i64:
2715 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2716 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
2717 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
2718 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2719 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2720 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2722 ; GFX10PLUS-LABEL: v_usubsat_i64:
2723 ; GFX10PLUS: ; %bb.0:
2724 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2725 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
2726 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2727 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2728 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2729 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2730 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
; i64 unsigned saturating subtract in an amdgpu_ps function with both operands
; inreg. All check prefixes expect the same three scalar instructions:
; s_sub_u32, s_subb_u32, then s_cselect_b64 choosing 0 or the difference.
2734 define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
2735 ; GFX6-LABEL: s_usubsat_i64:
2737 ; GFX6-NEXT: s_sub_u32 s0, s0, s2
2738 ; GFX6-NEXT: s_subb_u32 s1, s1, s3
2739 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2740 ; GFX6-NEXT: ; return to shader part epilog
2742 ; GFX8-LABEL: s_usubsat_i64:
2744 ; GFX8-NEXT: s_sub_u32 s0, s0, s2
2745 ; GFX8-NEXT: s_subb_u32 s1, s1, s3
2746 ; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2747 ; GFX8-NEXT: ; return to shader part epilog
2749 ; GFX9-LABEL: s_usubsat_i64:
2751 ; GFX9-NEXT: s_sub_u32 s0, s0, s2
2752 ; GFX9-NEXT: s_subb_u32 s1, s1, s3
2753 ; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2754 ; GFX9-NEXT: ; return to shader part epilog
2756 ; GFX10PLUS-LABEL: s_usubsat_i64:
2757 ; GFX10PLUS: ; %bb.0:
2758 ; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s2
2759 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3
2760 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2761 ; GFX10PLUS-NEXT: ; return to shader part epilog
2762 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
; i64 unsigned saturating subtract with an inreg %lhs and a non-inreg %rhs
; (the checks subtract v[0:1] from s[0:1]); the result is bitcast to
; <2 x float>.
; GFX6, GFX8 and GFX9 copy s1 into v2 before the subtract-with-borrow, while
; the GFX10PLUS checks use s1 directly as an operand.
2766 define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
2767 ; GFX6-LABEL: usubsat_i64_sv:
2769 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2770 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
2771 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
2772 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2773 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2774 ; GFX6-NEXT: ; return to shader part epilog
2776 ; GFX8-LABEL: usubsat_i64_sv:
2778 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2779 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
2780 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
2781 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2782 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2783 ; GFX8-NEXT: ; return to shader part epilog
2785 ; GFX9-LABEL: usubsat_i64_sv:
2787 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2788 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
2789 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2790 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2791 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2792 ; GFX9-NEXT: ; return to shader part epilog
2794 ; GFX10PLUS-LABEL: usubsat_i64_sv:
2795 ; GFX10PLUS: ; %bb.0:
2796 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
2797 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2798 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2799 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2800 ; GFX10PLUS-NEXT: ; return to shader part epilog
2801 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2802 %cast = bitcast i64 %result to <2 x float>
2803 ret <2 x float> %cast
; i64 unsigned saturating subtract with a non-inreg %lhs and an inreg %rhs
; (the checks subtract s[0:1] from v[0:1]); the result is bitcast to
; <2 x float>.
; With the SGPR value as the subtrahend, the checks expect a v_subrev form for
; the low dword on GFX6, GFX8 and GFX9, and v_subrev_co_ci_u32 for the high
; dword on GFX10PLUS.
2806 define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
2807 ; GFX6-LABEL: usubsat_i64_vs:
2809 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2810 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
2811 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
2812 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2813 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2814 ; GFX6-NEXT: ; return to shader part epilog
2816 ; GFX8-LABEL: usubsat_i64_vs:
2818 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2819 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
2820 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
2821 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2822 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2823 ; GFX8-NEXT: ; return to shader part epilog
2825 ; GFX9-LABEL: usubsat_i64_vs:
2827 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2828 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
2829 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
2830 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2831 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2832 ; GFX9-NEXT: ; return to shader part epilog
2834 ; GFX10PLUS-LABEL: usubsat_i64_vs:
2835 ; GFX10PLUS: ; %bb.0:
2836 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
2837 ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2838 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2839 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2840 ; GFX10PLUS-NEXT: ; return to shader part epilog
2841 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2842 %cast = bitcast i64 %result to <2 x float>
2843 ret <2 x float> %cast
; <2 x i64> unsigned saturating subtract with the default calling convention.
; GFX6, GFX8 and GFX9 handle the two elements one after the other through vcc.
; GFX10 and GFX11 have separate checks here: both interleave the two elements
; and keep the second element's carry in an SGPR, which is s4 in the GFX10
; checks and s0 in the GFX11 checks.
2846 define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
2847 ; GFX6-LABEL: v_usubsat_v2i64:
2849 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2850 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
2851 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
2852 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2853 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2854 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
2855 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
2856 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2857 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2858 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2860 ; GFX8-LABEL: v_usubsat_v2i64:
2862 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2863 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
2864 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
2865 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2866 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2867 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
2868 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
2869 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2870 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2871 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2873 ; GFX9-LABEL: v_usubsat_v2i64:
2875 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2876 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
2877 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
2878 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
2879 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
2880 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6
2881 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
2882 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2883 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2884 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2886 ; GFX10-LABEL: v_usubsat_v2i64:
2888 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2889 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
2890 ; GFX10-NEXT: v_sub_co_u32 v2, s4, v2, v6
2891 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
2892 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s4, v3, v7, s4
2893 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2894 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2895 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4
2896 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4
2897 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2899 ; GFX11-LABEL: v_usubsat_v2i64:
2901 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2902 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
2903 ; GFX11-NEXT: v_sub_co_u32 v2, s0, v2, v6
2904 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
2905 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, s0, v3, v7, s0
2906 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
2907 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
2908 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0
2909 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0
2910 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2911 %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2912 ret <2 x i64> %result
; <2 x i64> unsigned saturating subtract in an amdgpu_ps function with both
; operands inreg. All check prefixes expect the same sequence, once per
; element: s_sub_u32, s_subb_u32, then s_cselect_b64 choosing 0 or the
; difference.
2915 define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
2916 ; GFX6-LABEL: s_usubsat_v2i64:
2918 ; GFX6-NEXT: s_sub_u32 s0, s0, s4
2919 ; GFX6-NEXT: s_subb_u32 s1, s1, s5
2920 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2921 ; GFX6-NEXT: s_sub_u32 s2, s2, s6
2922 ; GFX6-NEXT: s_subb_u32 s3, s3, s7
2923 ; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2924 ; GFX6-NEXT: ; return to shader part epilog
2926 ; GFX8-LABEL: s_usubsat_v2i64:
2928 ; GFX8-NEXT: s_sub_u32 s0, s0, s4
2929 ; GFX8-NEXT: s_subb_u32 s1, s1, s5
2930 ; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2931 ; GFX8-NEXT: s_sub_u32 s2, s2, s6
2932 ; GFX8-NEXT: s_subb_u32 s3, s3, s7
2933 ; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2934 ; GFX8-NEXT: ; return to shader part epilog
2936 ; GFX9-LABEL: s_usubsat_v2i64:
2938 ; GFX9-NEXT: s_sub_u32 s0, s0, s4
2939 ; GFX9-NEXT: s_subb_u32 s1, s1, s5
2940 ; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2941 ; GFX9-NEXT: s_sub_u32 s2, s2, s6
2942 ; GFX9-NEXT: s_subb_u32 s3, s3, s7
2943 ; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2944 ; GFX9-NEXT: ; return to shader part epilog
2946 ; GFX10PLUS-LABEL: s_usubsat_v2i64:
2947 ; GFX10PLUS: ; %bb.0:
2948 ; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4
2949 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5
2950 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2951 ; GFX10PLUS-NEXT: s_sub_u32 s2, s2, s6
2952 ; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7
2953 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2954 ; GFX10PLUS-NEXT: ; return to shader part epilog
2955 %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2956 ret <2 x i64> %result
; i128 unsigned saturating subtract in an amdgpu_ps function with both operands
; inreg. All check prefixes expect one s_sub_u32 followed by three s_subb_u32,
; then two s_cselect_b64 that choose 0 or the difference for each 64-bit half.
2959 define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
2960 ; GFX6-LABEL: s_usubsat_i128:
2962 ; GFX6-NEXT: s_sub_u32 s0, s0, s4
2963 ; GFX6-NEXT: s_subb_u32 s1, s1, s5
2964 ; GFX6-NEXT: s_subb_u32 s2, s2, s6
2965 ; GFX6-NEXT: s_subb_u32 s3, s3, s7
2966 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2967 ; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2968 ; GFX6-NEXT: ; return to shader part epilog
2970 ; GFX8-LABEL: s_usubsat_i128:
2972 ; GFX8-NEXT: s_sub_u32 s0, s0, s4
2973 ; GFX8-NEXT: s_subb_u32 s1, s1, s5
2974 ; GFX8-NEXT: s_subb_u32 s2, s2, s6
2975 ; GFX8-NEXT: s_subb_u32 s3, s3, s7
2976 ; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2977 ; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2978 ; GFX8-NEXT: ; return to shader part epilog
2980 ; GFX9-LABEL: s_usubsat_i128:
2982 ; GFX9-NEXT: s_sub_u32 s0, s0, s4
2983 ; GFX9-NEXT: s_subb_u32 s1, s1, s5
2984 ; GFX9-NEXT: s_subb_u32 s2, s2, s6
2985 ; GFX9-NEXT: s_subb_u32 s3, s3, s7
2986 ; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2987 ; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2988 ; GFX9-NEXT: ; return to shader part epilog
2990 ; GFX10PLUS-LABEL: s_usubsat_i128:
2991 ; GFX10PLUS: ; %bb.0:
2992 ; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4
2993 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5
2994 ; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s6
2995 ; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7
2996 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
2997 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
2998 ; GFX10PLUS-NEXT: ; return to shader part epilog
2999 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
; i128 unsigned saturating subtract with an inreg %lhs and a non-inreg %rhs
; (the checks subtract v[0:3] from s[0:3]); the result is bitcast to
; <4 x float>.
; GFX6, GFX8 and GFX9 copy s1, s2 and s3 into VGPRs before each
; subtract-with-borrow, while the GFX10PLUS checks use the SGPRs directly.
; All targets select 0 for every dword on the final carry-out.
3003 define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
3004 ; GFX6-LABEL: usubsat_i128_sv:
3006 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
3007 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
3008 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
3009 ; GFX6-NEXT: v_mov_b32_e32 v4, s2
3010 ; GFX6-NEXT: v_mov_b32_e32 v5, s3
3011 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc
3012 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
3013 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3014 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3015 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3016 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3017 ; GFX6-NEXT: ; return to shader part epilog
3019 ; GFX8-LABEL: usubsat_i128_sv:
3021 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
3022 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
3023 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
3024 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
3025 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
3026 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc
3027 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
3028 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3029 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3030 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3031 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3032 ; GFX8-NEXT: ; return to shader part epilog
3034 ; GFX9-LABEL: usubsat_i128_sv:
3036 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
3037 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
3038 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
3039 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
3040 ; GFX9-NEXT: v_mov_b32_e32 v5, s3
3041 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v2, vcc
3042 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
3043 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3044 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3045 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3046 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3047 ; GFX9-NEXT: ; return to shader part epilog
3049 ; GFX10PLUS-LABEL: usubsat_i128_sv:
3050 ; GFX10PLUS: ; %bb.0:
3051 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
3052 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3053 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3054 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3055 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3056 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3057 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3058 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3059 ; GFX10PLUS-NEXT: ; return to shader part epilog
3060 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
3061 %cast = bitcast i128 %result to <4 x float>
3062 ret <4 x float> %cast
; i128 unsigned saturating subtract with a non-inreg %lhs and an inreg %rhs
; (the checks subtract s[0:3] from v[0:3]); the result is bitcast to
; <4 x float>.
; With the SGPR value as the subtrahend, the checks expect a v_subrev form for
; the low dword on GFX6, GFX8 and GFX9 (the upper dwords go through VGPR
; copies), and v_subrev_co_ci_u32 for the three upper dwords on GFX10PLUS.
; All targets select 0 for every dword on the final carry-out.
3065 define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
3066 ; GFX6-LABEL: usubsat_i128_vs:
3068 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
3069 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
3070 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
3071 ; GFX6-NEXT: v_mov_b32_e32 v4, s2
3072 ; GFX6-NEXT: v_mov_b32_e32 v5, s3
3073 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc
3074 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
3075 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3076 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3077 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3078 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3079 ; GFX6-NEXT: ; return to shader part epilog
3081 ; GFX8-LABEL: usubsat_i128_vs:
3083 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
3084 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
3085 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
3086 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
3087 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
3088 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc
3089 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
3090 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3091 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3092 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3093 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3094 ; GFX8-NEXT: ; return to shader part epilog
3096 ; GFX9-LABEL: usubsat_i128_vs:
3098 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
3099 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
3100 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
3101 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
3102 ; GFX9-NEXT: v_mov_b32_e32 v5, s3
3103 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
3104 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
3105 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3106 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3107 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3108 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3109 ; GFX9-NEXT: ; return to shader part epilog
3111 ; GFX10PLUS-LABEL: usubsat_i128_vs:
3112 ; GFX10PLUS: ; %bb.0:
3113 ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
3114 ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3115 ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3116 ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3117 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3118 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3119 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3120 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3121 ; GFX10PLUS-NEXT: ; return to shader part epilog
3122 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
3123 %cast = bitcast i128 %result to <4 x float>
3124 ret <4 x float> %cast
; Unsigned saturating subtract on <2 x i128> with VGPR operands (default
; calling convention). Every target's expected code handles each 128-bit
; element as a 4-dword borrow chain (one sub, three sub-with-borrow) that
; subtracts v8-v15 from v0-v7, then uses the chain's final borrow to select
; between each result dword and 0 via v_cndmask.
; On GFX6, GFX8 and GFX9 the two elements are processed back to back through
; vcc. The GFX10 and GFX11 expectations interleave the two chains, keeping the
; second element's borrow in an SGPR; they differ from each other only in that
; register (s4 versus s0).
3127 define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
3128 ; GFX6-LABEL: v_usubsat_v2i128:
3130 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3131 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
3132 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
3133 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc
3134 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
3135 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3136 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3137 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3138 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3139 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12
3140 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc
3141 ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc
3142 ; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc
3143 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
3144 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
3145 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
3146 ; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
3147 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3149 ; GFX8-LABEL: v_usubsat_v2i128:
3151 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3152 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
3153 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
3154 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc
3155 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
3156 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3157 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3158 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3159 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3160 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v12
3161 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc
3162 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc
3163 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc
3164 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
3165 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
3166 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
3167 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
3168 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3170 ; GFX9-LABEL: v_usubsat_v2i128:
3172 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3173 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8
3174 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v9, vcc
3175 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
3176 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc
3177 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
3178 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3179 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3180 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3181 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v12
3182 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v13, vcc
3183 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v14, vcc
3184 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v15, vcc
3185 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
3186 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
3187 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
3188 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
3189 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3191 ; GFX10-LABEL: v_usubsat_v2i128:
3193 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3194 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8
3195 ; GFX10-NEXT: v_sub_co_u32 v4, s4, v4, v12
3196 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3197 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v5, v13, s4
3198 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3199 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v6, s4, v6, v14, s4
3200 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3201 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s4, v7, v15, s4
3202 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3203 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3204 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3205 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3206 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4
3207 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s4
3208 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, s4
3209 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 0, s4
3210 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3212 ; GFX11-LABEL: v_usubsat_v2i128:
3214 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3215 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8
3216 ; GFX11-NEXT: v_sub_co_u32 v4, s0, v4, v12
3217 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3218 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, s0, v5, v13, s0
3219 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3220 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v6, s0, v6, v14, s0
3221 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3222 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, s0, v7, v15, s0
3223 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
3224 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
3225 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
3226 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
3227 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0
3228 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 0, s0
3229 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 0, s0
3230 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, 0, s0
3231 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3232 %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3233 ret <2 x i128> %result
; Unsigned saturating subtract on <2 x i128> with both operands marked inreg
; in an amdgpu_ps function. The expected code is the same for every checked
; target: per 128-bit element, an s_sub_u32 / s_subb_u32 borrow chain subtracts
; s8-s15 from s0-s7, and two s_cselect_b64 then choose between 0 and each
; 64-bit half of the result.
3236 define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
3237 ; GFX6-LABEL: s_usubsat_v2i128:
3239 ; GFX6-NEXT: s_sub_u32 s0, s0, s8
3240 ; GFX6-NEXT: s_subb_u32 s1, s1, s9
3241 ; GFX6-NEXT: s_subb_u32 s2, s2, s10
3242 ; GFX6-NEXT: s_subb_u32 s3, s3, s11
3243 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
3244 ; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
3245 ; GFX6-NEXT: s_sub_u32 s4, s4, s12
3246 ; GFX6-NEXT: s_subb_u32 s5, s5, s13
3247 ; GFX6-NEXT: s_subb_u32 s6, s6, s14
3248 ; GFX6-NEXT: s_subb_u32 s7, s7, s15
3249 ; GFX6-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
3250 ; GFX6-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
3251 ; GFX6-NEXT: ; return to shader part epilog
3253 ; GFX8-LABEL: s_usubsat_v2i128:
3255 ; GFX8-NEXT: s_sub_u32 s0, s0, s8
3256 ; GFX8-NEXT: s_subb_u32 s1, s1, s9
3257 ; GFX8-NEXT: s_subb_u32 s2, s2, s10
3258 ; GFX8-NEXT: s_subb_u32 s3, s3, s11
3259 ; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
3260 ; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
3261 ; GFX8-NEXT: s_sub_u32 s4, s4, s12
3262 ; GFX8-NEXT: s_subb_u32 s5, s5, s13
3263 ; GFX8-NEXT: s_subb_u32 s6, s6, s14
3264 ; GFX8-NEXT: s_subb_u32 s7, s7, s15
3265 ; GFX8-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
3266 ; GFX8-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
3267 ; GFX8-NEXT: ; return to shader part epilog
3269 ; GFX9-LABEL: s_usubsat_v2i128:
3271 ; GFX9-NEXT: s_sub_u32 s0, s0, s8
3272 ; GFX9-NEXT: s_subb_u32 s1, s1, s9
3273 ; GFX9-NEXT: s_subb_u32 s2, s2, s10
3274 ; GFX9-NEXT: s_subb_u32 s3, s3, s11
3275 ; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
3276 ; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
3277 ; GFX9-NEXT: s_sub_u32 s4, s4, s12
3278 ; GFX9-NEXT: s_subb_u32 s5, s5, s13
3279 ; GFX9-NEXT: s_subb_u32 s6, s6, s14
3280 ; GFX9-NEXT: s_subb_u32 s7, s7, s15
3281 ; GFX9-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
3282 ; GFX9-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
3283 ; GFX9-NEXT: ; return to shader part epilog
3285 ; GFX10PLUS-LABEL: s_usubsat_v2i128:
3286 ; GFX10PLUS: ; %bb.0:
3287 ; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s8
3288 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s9
3289 ; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s10
3290 ; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s11
3291 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
3292 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
3293 ; GFX10PLUS-NEXT: s_sub_u32 s4, s4, s12
3294 ; GFX10PLUS-NEXT: s_subb_u32 s5, s5, s13
3295 ; GFX10PLUS-NEXT: s_subb_u32 s6, s6, s14
3296 ; GFX10PLUS-NEXT: s_subb_u32 s7, s7, s15
3297 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
3298 ; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
3299 ; GFX10PLUS-NEXT: ; return to shader part epilog
3300 %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3301 ret <2 x i128> %result
; Declarations of the llvm.usub.sat intrinsic, one per scalar or vector type
; overload. Each uses attribute group #0, defined at the end of this list.
3304 declare i7 @llvm.usub.sat.i7(i7, i7) #0
3305 declare i8 @llvm.usub.sat.i8(i8, i8) #0
3306 declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) #0
3307 declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) #0
3309 declare i16 @llvm.usub.sat.i16(i16, i16) #0
3310 declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
3311 declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
3312 declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
3313 declare <5 x i16> @llvm.usub.sat.v5i16(<5 x i16>, <5 x i16>) #0
3314 declare <6 x i16> @llvm.usub.sat.v6i16(<6 x i16>, <6 x i16>) #0
3315 declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) #0
3317 declare i24 @llvm.usub.sat.i24(i24, i24) #0
3319 declare i32 @llvm.usub.sat.i32(i32, i32) #0
3320 declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
3321 declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
3322 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
3323 declare <5 x i32> @llvm.usub.sat.v5i32(<5 x i32>, <5 x i32>) #0
3324 declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
3326 declare i48 @llvm.usub.sat.i48(i48, i48) #0
3328 declare i64 @llvm.usub.sat.i64(i64, i64) #0
3329 declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) #0
3331 declare i128 @llvm.usub.sat.i128(i128, i128) #0
3332 declare <2 x i128> @llvm.usub.sat.v2i128(<2 x i128>, <2 x i128>) #0
3334 attributes #0 = { nounwind readnone speculatable willreturn }