1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
; Unsigned saturating subtract of two i7 values via llvm.usub.sat.i7, with
; both operands passed as ordinary (non-inreg) arguments.
; The expected GFX6 output shifts both operands left by 25, computes
; lhs - umin(lhs, rhs), and shifts the result back right by 25.
; The expected GFX8, GFX9 and GFX10 output shifts by 9 with 16-bit ops around
; a single subtract carrying the clamp modifier.
7 define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
8 ; GFX6-LABEL: v_usubsat_i7:
10 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0
12 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1
13 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
14 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
15 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0
16 ; GFX6-NEXT: s_setpc_b64 s[30:31]
18 ; GFX8-LABEL: v_usubsat_i7:
20 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0
22 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1
23 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
24 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
25 ; GFX8-NEXT: s_setpc_b64 s[30:31]
27 ; GFX9-LABEL: v_usubsat_i7:
29 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
31 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
32 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
33 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
34 ; GFX9-NEXT: s_setpc_b64 s[30:31]
36 ; GFX10-LABEL: v_usubsat_i7:
38 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
40 ; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
41 ; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
42 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
43 ; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
44 ; GFX10-NEXT: s_setpc_b64 s[30:31]
45 %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
; Unsigned saturating subtract of two i7 values via llvm.usub.sat.i7 in an
; amdgpu_ps function whose operands are both marked inreg.
; The expected GFX6 output is entirely scalar: shift left by 25, s_min_u32,
; s_sub_i32, shift right by 25.
; The expected GFX8, GFX9 and GFX10 output does the left shifts on scalar
; registers, performs the clamped subtract and the right shift with vector
; instructions, and moves the result back to s0 with v_readfirstlane_b32.
49 define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
50 ; GFX6-LABEL: s_usubsat_i7:
52 ; GFX6-NEXT: s_lshl_b32 s0, s0, 25
53 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25
54 ; GFX6-NEXT: s_min_u32 s1, s0, s1
55 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
56 ; GFX6-NEXT: s_lshr_b32 s0, s0, 25
57 ; GFX6-NEXT: ; return to shader part epilog
59 ; GFX8-LABEL: s_usubsat_i7:
61 ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000
62 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2
63 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2
64 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
65 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
66 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
67 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
68 ; GFX8-NEXT: ; return to shader part epilog
70 ; GFX9-LABEL: s_usubsat_i7:
72 ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
73 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
74 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2
75 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
76 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
77 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
78 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
79 ; GFX9-NEXT: ; return to shader part epilog
81 ; GFX10-LABEL: s_usubsat_i7:
83 ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000
84 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
85 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
86 ; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
87 ; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
88 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
89 ; GFX10-NEXT: ; return to shader part epilog
90 %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
; Unsigned saturating subtract of two i8 values via llvm.usub.sat.i8, with
; both operands passed as ordinary (non-inreg) arguments.
; The expected GFX6 output shifts both operands left by 24, computes
; lhs - umin(lhs, rhs), and shifts the result back right by 24.
; The expected GFX8, GFX9 and GFX10 output shifts by 8 with 16-bit ops around
; a single subtract carrying the clamp modifier.
94 define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
95 ; GFX6-LABEL: v_usubsat_i8:
97 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
99 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
100 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
101 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
102 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
103 ; GFX6-NEXT: s_setpc_b64 s[30:31]
105 ; GFX8-LABEL: v_usubsat_i8:
107 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
109 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
110 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
111 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
112 ; GFX8-NEXT: s_setpc_b64 s[30:31]
114 ; GFX9-LABEL: v_usubsat_i8:
116 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
118 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
119 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
120 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
121 ; GFX9-NEXT: s_setpc_b64 s[30:31]
123 ; GFX10-LABEL: v_usubsat_i8:
125 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
127 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
128 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
129 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
130 ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
131 ; GFX10-NEXT: s_setpc_b64 s[30:31]
132 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
; Unsigned saturating subtract of two i8 values via llvm.usub.sat.i8 in an
; amdgpu_ps function whose operands are both marked inreg.
; The expected GFX6 output is entirely scalar: shift left by 24, s_min_u32,
; s_sub_i32, shift right by 24.
; The expected GFX8, GFX9 and GFX10 output does the left shifts on scalar
; registers, performs the clamped subtract and the right shift with vector
; instructions, and moves the result back to s0 with v_readfirstlane_b32.
136 define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
137 ; GFX6-LABEL: s_usubsat_i8:
139 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
140 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
141 ; GFX6-NEXT: s_min_u32 s1, s0, s1
142 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
143 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24
144 ; GFX6-NEXT: ; return to shader part epilog
146 ; GFX8-LABEL: s_usubsat_i8:
148 ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000
149 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2
150 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2
151 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
152 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
153 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
154 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
155 ; GFX8-NEXT: ; return to shader part epilog
157 ; GFX9-LABEL: s_usubsat_i8:
159 ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
160 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
161 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2
162 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
163 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
164 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
165 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
166 ; GFX9-NEXT: ; return to shader part epilog
168 ; GFX10-LABEL: s_usubsat_i8:
170 ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
171 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
172 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
173 ; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
174 ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
175 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
176 ; GFX10-NEXT: ; return to shader part epilog
177 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
; Unsigned saturating subtract of <2 x i8> vectors via llvm.usub.sat.v2i8,
; with both operands passed as ordinary (non-inreg) i16 arguments.
; The expected GFX6 output handles each byte separately with the
; shift-by-24 / v_min_u32 / v_sub_i32 sequence and ORs the two results together.
; The expected GFX8 output uses two v_sub_u16 instructions with clamp.
; The expected GFX9 and GFX10 output packs both bytes into 16-bit halves and
; uses a single v_pk_sub_u16 with clamp.
181 define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
182 ; GFX6-LABEL: v_usubsat_v2i8:
184 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
186 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
187 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
188 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
189 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
190 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
191 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
192 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
193 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
194 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
195 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
196 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
197 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
198 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
199 ; GFX6-NEXT: s_setpc_b64 s[30:31]
201 ; GFX8-LABEL: v_usubsat_v2i8:
203 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
205 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
206 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
207 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
208 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
209 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
210 ; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp
211 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
212 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
213 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
214 ; GFX8-NEXT: s_setpc_b64 s[30:31]
216 ; GFX9-LABEL: v_usubsat_v2i8:
218 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219 ; GFX9-NEXT: s_mov_b32 s4, 8
220 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
221 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
222 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
223 ; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2
224 ; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3
225 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
226 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
227 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
228 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
229 ; GFX9-NEXT: s_movk_i32 s4, 0xff
230 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
231 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
232 ; GFX9-NEXT: s_setpc_b64 s[30:31]
234 ; GFX10-LABEL: v_usubsat_v2i8:
236 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
238 ; GFX10-NEXT: s_mov_b32 s4, 8
239 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
240 ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
241 ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
242 ; GFX10-NEXT: s_movk_i32 s4, 0xff
243 ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
244 ; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4
245 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
246 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
247 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
248 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
249 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
250 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
251 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; The i16 arguments are reinterpreted as <2 x i8> for the intrinsic call and
; the vector result is reinterpreted back to i16.
252 %lhs = bitcast i16 %lhs.arg to <2 x i8>
253 %rhs = bitcast i16 %rhs.arg to <2 x i8>
254 %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
255 %cast.result = bitcast <2 x i8> %result to i16
; Unsigned saturating subtract of <2 x i8> vectors via llvm.usub.sat.v2i8 in
; an amdgpu_ps function whose i16 operands are both marked inreg.
; The expected GFX6 output is entirely scalar, running the shift-by-24 /
; s_min_u32 / s_sub_i32 sequence once per byte and ORing the results.
; The expected GFX8 output uses two v_sub_u16 instructions with clamp.
; The expected GFX9 and GFX10 output packs the bytes with s_pack_ll_b32_b16
; and uses a single v_pk_sub_u16 with clamp.
; On GFX8, GFX9 and GFX10 the result is moved back to s0 with
; v_readfirstlane_b32.
259 define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
260 ; GFX6-LABEL: s_usubsat_v2i8:
262 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
263 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8
264 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
265 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
266 ; GFX6-NEXT: s_min_u32 s1, s0, s1
267 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
268 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
269 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
270 ; GFX6-NEXT: s_min_u32 s2, s1, s2
271 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
272 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24
273 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24
274 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
275 ; GFX6-NEXT: s_or_b32 s0, s0, s1
276 ; GFX6-NEXT: ; return to shader part epilog
278 ; GFX8-LABEL: s_usubsat_v2i8:
280 ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000
281 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8
282 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4
283 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
284 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4
285 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
286 ; GFX8-NEXT: s_lshl_b32 s1, s3, s4
287 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
288 ; GFX8-NEXT: s_lshl_b32 s0, s2, s4
289 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
290 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
291 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
292 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
293 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
294 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
295 ; GFX8-NEXT: ; return to shader part epilog
297 ; GFX9-LABEL: s_usubsat_v2i8:
299 ; GFX9-NEXT: s_lshr_b32 s2, s0, 8
300 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
301 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
302 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
303 ; GFX9-NEXT: s_mov_b32 s2, 0x80008
304 ; GFX9-NEXT: s_lshr_b32 s3, s0, 16
305 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2
306 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8
307 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
308 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
309 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
310 ; GFX9-NEXT: s_lshl_b32 s2, s3, 8
311 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
312 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
313 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
314 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
315 ; GFX9-NEXT: s_movk_i32 s0, 0xff
316 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
317 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
318 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
319 ; GFX9-NEXT: ; return to shader part epilog
321 ; GFX10-LABEL: s_usubsat_v2i8:
323 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
324 ; GFX10-NEXT: s_lshr_b32 s3, s1, 8
325 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
326 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
327 ; GFX10-NEXT: s_mov_b32 s2, 0x80008
328 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
329 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16
330 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
331 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
332 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
333 ; GFX10-NEXT: s_lshl_b32 s2, s4, 8
334 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
335 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
336 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
337 ; GFX10-NEXT: s_movk_i32 s0, 0xff
338 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
339 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
340 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
341 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
342 ; GFX10-NEXT: ; return to shader part epilog
; The i16 arguments are reinterpreted as <2 x i8> for the intrinsic call and
; the vector result is reinterpreted back to i16.
343 %lhs = bitcast i16 %lhs.arg to <2 x i8>
344 %rhs = bitcast i16 %rhs.arg to <2 x i8>
345 %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
346 %cast.result = bitcast <2 x i8> %result to i16
; Unsigned saturating subtract of <4 x i8> vectors via llvm.usub.sat.v4i8,
; with both operands passed as ordinary (non-inreg) i32 arguments.
; The expected GFX6 output runs the shift-by-24 / v_min_u32 / v_sub_i32
; sequence once per byte and reassembles the four results into one dword.
; The expected GFX8 output uses four v_sub_u16 instructions with clamp.
; The expected GFX9 and GFX10 output packs byte pairs into 16-bit halves and
; uses two v_pk_sub_u16 instructions with clamp, then merges with v_or3_b32.
350 define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
351 ; GFX6-LABEL: v_usubsat_v4i8:
353 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
355 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
356 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
357 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
358 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
359 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1
360 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
361 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
362 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
363 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
364 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
365 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5
366 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
367 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
368 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
369 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6
370 ; GFX6-NEXT: v_min_u32_e32 v3, v2, v3
371 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
372 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
373 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7
374 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4
375 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
376 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2
377 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
378 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3
379 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 24
380 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
381 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
382 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3
383 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
384 ; GFX6-NEXT: s_setpc_b64 s[30:31]
386 ; GFX8-LABEL: v_usubsat_v4i8:
388 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
390 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
391 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
392 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
393 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
394 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
395 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
396 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
397 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
398 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
399 ; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp
400 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4
401 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6
402 ; GFX8-NEXT: v_sub_u16_e64 v2, v2, v3 clamp
403 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5
404 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7
405 ; GFX8-NEXT: v_sub_u16_e64 v3, v3, v4 clamp
406 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
407 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
408 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
409 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
410 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
411 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
412 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
413 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
414 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
415 ; GFX8-NEXT: s_setpc_b64 s[30:31]
417 ; GFX9-LABEL: v_usubsat_v4i8:
419 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420 ; GFX9-NEXT: s_mov_b32 s4, 8
421 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
422 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0
423 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff
424 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
425 ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
426 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
427 ; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2
428 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
429 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
430 ; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2
431 ; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5
432 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7
433 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
434 ; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3
435 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
436 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
437 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
438 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
439 ; GFX9-NEXT: v_pk_sub_u16 v1, v2, v3 clamp
440 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
441 ; GFX9-NEXT: v_mov_b32_e32 v2, 8
442 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
443 ; GFX9-NEXT: s_movk_i32 s4, 0xff
444 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
445 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
446 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v1
447 ; GFX9-NEXT: v_mov_b32_e32 v3, 24
448 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
449 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
450 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
451 ; GFX9-NEXT: s_setpc_b64 s[30:31]
453 ; GFX10-LABEL: v_usubsat_v4i8:
455 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
456 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
457 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
458 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
459 ; GFX10-NEXT: s_mov_b32 s4, 8
460 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
461 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
462 ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
463 ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
464 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
465 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
466 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
467 ; GFX10-NEXT: s_movk_i32 s4, 0xff
468 ; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2
469 ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6
470 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
471 ; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5
472 ; GFX10-NEXT: v_mov_b32_e32 v4, 24
473 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
474 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
475 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
476 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
477 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
478 ; GFX10-NEXT: v_pk_sub_u16 v1, v2, v3 clamp
479 ; GFX10-NEXT: v_mov_b32_e32 v2, 8
480 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
481 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
482 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
483 ; GFX10-NEXT: v_and_b32_e32 v3, s4, v1
484 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
485 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2
486 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
487 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
488 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; The i32 arguments are reinterpreted as <4 x i8> for the intrinsic call and
; the vector result is reinterpreted back to i32.
489 %lhs = bitcast i32 %lhs.arg to <4 x i8>
490 %rhs = bitcast i32 %rhs.arg to <4 x i8>
491 %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
492 %cast.result = bitcast <4 x i8> %result to i32
; Unsigned saturating subtract of <4 x i8> vectors via llvm.usub.sat.v4i8 in
; an amdgpu_ps function whose i32 operands are both marked inreg.
; The expected GFX6 output runs the scalar shift-by-24 / s_min_u32 / s_sub_i32
; sequence once per byte, then reassembles the dword with vector instructions.
; The expected GFX8 output uses four v_sub_u16 instructions with clamp.
; The expected GFX9 and GFX10 output packs byte pairs with s_pack_ll_b32_b16
; and uses two v_pk_sub_u16 instructions with clamp.
; On every target checked here the result is moved back to s0 with
; v_readfirstlane_b32.
496 define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
497 ; GFX6-LABEL: s_usubsat_v4i8:
499 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
500 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16
501 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24
502 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8
503 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16
504 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24
505 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
506 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
507 ; GFX6-NEXT: s_min_u32 s1, s0, s1
508 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
509 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
510 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24
511 ; GFX6-NEXT: s_min_u32 s2, s1, s2
512 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
513 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
514 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24
515 ; GFX6-NEXT: s_min_u32 s3, s2, s3
516 ; GFX6-NEXT: s_sub_i32 s2, s2, s3
517 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24
518 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24
519 ; GFX6-NEXT: s_min_u32 s4, s3, s4
520 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24
521 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24
522 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
523 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
524 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24
525 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
526 ; GFX6-NEXT: s_lshl_b32 s0, s2, 16
527 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
528 ; GFX6-NEXT: s_lshl_b32 s0, s3, 24
529 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
530 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
531 ; GFX6-NEXT: ; return to shader part epilog
533 ; GFX8-LABEL: s_usubsat_v4i8:
535 ; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000
536 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8
537 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16
538 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24
539 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8
540 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
541 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16
542 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24
543 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8
544 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
545 ; GFX8-NEXT: s_lshl_b32 s1, s5, s8
546 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
547 ; GFX8-NEXT: s_lshl_b32 s0, s2, s8
548 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
549 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
550 ; GFX8-NEXT: s_lshl_b32 s1, s6, s8
551 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
552 ; GFX8-NEXT: s_lshl_b32 s0, s3, s8
553 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
554 ; GFX8-NEXT: s_lshl_b32 s1, s7, s8
555 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
556 ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
557 ; GFX8-NEXT: s_lshl_b32 s0, s4, s8
558 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
559 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
560 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
561 ; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
562 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
563 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
564 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
565 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
566 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
567 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
568 ; GFX8-NEXT: ; return to shader part epilog
570 ; GFX9-LABEL: s_usubsat_v4i8:
572 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8
573 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
574 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24
575 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
576 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
577 ; GFX9-NEXT: s_mov_b32 s4, 0x80008
578 ; GFX9-NEXT: s_lshr_b32 s6, s0, 16
579 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8
580 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4
581 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
582 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16
583 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24
584 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
585 ; GFX9-NEXT: s_lshr_b32 s6, s3, 16
586 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
587 ; GFX9-NEXT: s_lshl_b32 s3, s3, s4
588 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
589 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16
590 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
591 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9
592 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4
593 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8
594 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
595 ; GFX9-NEXT: s_lshr_b32 s7, s6, 16
596 ; GFX9-NEXT: s_lshl_b32 s4, s6, s4
597 ; GFX9-NEXT: s_lshl_b32 s6, s7, 8
598 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
599 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
600 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
601 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
602 ; GFX9-NEXT: s_mov_b32 s2, 8
603 ; GFX9-NEXT: v_pk_sub_u16 v1, s3, v1 clamp
604 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
605 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
606 ; GFX9-NEXT: s_movk_i32 s0, 0xff
607 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
608 ; GFX9-NEXT: s_mov_b32 s5, 24
609 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
610 ; GFX9-NEXT: v_and_b32_e32 v2, s0, v1
611 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
612 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
613 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
614 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
615 ; GFX9-NEXT: ; return to shader part epilog
617 ; GFX10-LABEL: s_usubsat_v4i8:
619 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
620 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
621 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24
622 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
623 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
624 ; GFX10-NEXT: s_mov_b32 s3, 0x80008
625 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16
626 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8
627 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16
628 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24
629 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3
630 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
631 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
632 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
633 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
634 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16
635 ; GFX10-NEXT: s_lshr_b32 s5, s1, 16
636 ; GFX10-NEXT: s_lshr_b32 s6, s4, 16
637 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3
638 ; GFX10-NEXT: s_lshl_b32 s8, s8, 8
639 ; GFX10-NEXT: s_lshl_b32 s1, s1, s3
640 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
641 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3
642 ; GFX10-NEXT: s_lshl_b32 s4, s6, 8
643 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8
644 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
645 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
646 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
647 ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp
648 ; GFX10-NEXT: s_mov_b32 s0, 8
649 ; GFX10-NEXT: s_movk_i32 s1, 0xff
650 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
651 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
652 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
653 ; GFX10-NEXT: v_and_b32_e32 v3, s1, v1
654 ; GFX10-NEXT: s_mov_b32 s0, 24
655 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
656 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
657 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
658 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
659 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
660 ; GFX10-NEXT: ; return to shader part epilog
; The i32 arguments are reinterpreted as <4 x i8> for the intrinsic call and
; the vector result is reinterpreted back to i32.
661 %lhs = bitcast i32 %lhs.arg to <4 x i8>
662 %rhs = bitcast i32 %rhs.arg to <4 x i8>
663 %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
664 %cast.result = bitcast <4 x i8> %result to i32
; Unsigned saturating subtract of two i24 values via llvm.usub.sat.i24, with
; both operands passed as ordinary (non-inreg) arguments.
; Every expected output shifts both operands left by 8 and the result right
; by 8 using 32-bit shifts.
; The expected GFX6 output computes lhs - umin(lhs, rhs) in between; the
; expected GFX8, GFX9 and GFX10 output uses a single 32-bit subtract with clamp.
668 define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) {
669 ; GFX6-LABEL: v_usubsat_i24:
671 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
672 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
673 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
674 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
675 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
676 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0
677 ; GFX6-NEXT: s_setpc_b64 s[30:31]
679 ; GFX8-LABEL: v_usubsat_i24:
681 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
682 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0
683 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
684 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
685 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
686 ; GFX8-NEXT: s_setpc_b64 s[30:31]
688 ; GFX9-LABEL: v_usubsat_i24:
690 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
691 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
692 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
693 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp
694 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
695 ; GFX9-NEXT: s_setpc_b64 s[30:31]
697 ; GFX10-LABEL: v_usubsat_i24:
699 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
700 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
701 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
702 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
703 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp
704 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
705 ; GFX10-NEXT: s_setpc_b64 s[30:31]
706 %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
; Unsigned saturating subtract of two i24 values via llvm.usub.sat.i24 in an
; amdgpu_ps function whose operands are both marked inreg.
; The expected GFX6 output is entirely scalar: shift left by 8, s_min_u32,
; s_sub_i32, shift right by 8.
; The expected GFX8, GFX9 and GFX10 output does the left shifts on scalar
; registers, performs a 32-bit clamped subtract and the right shift with
; vector instructions, and moves the result back to s0 with
; v_readfirstlane_b32.
710 define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
711 ; GFX6-LABEL: s_usubsat_i24:
713 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8
714 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
715 ; GFX6-NEXT: s_min_u32 s1, s0, s1
716 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
717 ; GFX6-NEXT: s_lshr_b32 s0, s0, 8
718 ; GFX6-NEXT: ; return to shader part epilog
720 ; GFX8-LABEL: s_usubsat_i24:
722 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
723 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
724 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
725 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
726 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
727 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
728 ; GFX8-NEXT: ; return to shader part epilog
730 ; GFX9-LABEL: s_usubsat_i24:
732 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
733 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
734 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
735 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
736 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
737 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
738 ; GFX9-NEXT: ; return to shader part epilog
740 ; GFX10-LABEL: s_usubsat_i24:
742 ; GFX10-NEXT: s_lshl_b32 s0, s0, 8
743 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8
744 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp
745 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
746 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
747 ; GFX10-NEXT: ; return to shader part epilog
748 %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
; Unsigned saturating subtract of two i32 values via llvm.usub.sat.i32, with
; both operands passed as ordinary (non-inreg) arguments.
; No shifts appear in any expected output for this width.
; The expected GFX6 output is v_min_u32 followed by v_sub_i32, i.e.
; lhs - umin(lhs, rhs); the expected GFX8, GFX9 and GFX10 output is a single
; 32-bit subtract with clamp.
752 define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
753 ; GFX6-LABEL: v_usubsat_i32:
755 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
756 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
757 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
758 ; GFX6-NEXT: s_setpc_b64 s[30:31]
760 ; GFX8-LABEL: v_usubsat_i32:
762 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
763 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
764 ; GFX8-NEXT: s_setpc_b64 s[30:31]
766 ; GFX9-LABEL: v_usubsat_i32:
768 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
769 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp
770 ; GFX9-NEXT: s_setpc_b64 s[30:31]
772 ; GFX10-LABEL: v_usubsat_i32:
774 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
775 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
776 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp
777 ; GFX10-NEXT: s_setpc_b64 s[30:31]
778 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
; Unsigned saturating subtract on i32 with both operands passed inreg under the
; amdgpu_ps calling convention (operands arrive in s0 and s1).
; On GFX6 the expected code stays on the scalar unit (s_min_u32 + s_sub_i32).
; On GFX8, GFX9 and GFX10 a clamped VALU subtract is expected, with the result
; copied back to s0 by v_readfirstlane_b32. GFX8 and GFX9 first copy s1 into
; v0, while GFX10 uses both SGPRs directly as source operands.
782 define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
783 ; GFX6-LABEL: s_usubsat_i32:
785 ; GFX6-NEXT: s_min_u32 s1, s0, s1
786 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
787 ; GFX6-NEXT: ; return to shader part epilog
789 ; GFX8-LABEL: s_usubsat_i32:
791 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
792 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
793 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
794 ; GFX8-NEXT: ; return to shader part epilog
796 ; GFX9-LABEL: s_usubsat_i32:
798 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
799 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
800 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
801 ; GFX9-NEXT: ; return to shader part epilog
803 ; GFX10-LABEL: s_usubsat_i32:
805 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp
806 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
807 ; GFX10-NEXT: ; return to shader part epilog
808 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
; Mixed-operand case under amdgpu_ps where lhs is inreg (arrives in s0) and rhs
; is not (arrives in v0). The i32 result is bitcast to float so the function
; returns float.
; On GFX6 the expected code is v_min_u32 followed by v_sub_i32, with s0 used
; directly as a source operand. On GFX8, GFX9 and GFX10 a single clamped VALU
; subtract of v0 from s0 is expected.
812 define amdgpu_ps float @usubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
813 ; GFX6-LABEL: usubsat_i32_sv:
815 ; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
816 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
817 ; GFX6-NEXT: ; return to shader part epilog
819 ; GFX8-LABEL: usubsat_i32_sv:
821 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
822 ; GFX8-NEXT: ; return to shader part epilog
824 ; GFX9-LABEL: usubsat_i32_sv:
826 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
827 ; GFX9-NEXT: ; return to shader part epilog
829 ; GFX10-LABEL: usubsat_i32_sv:
831 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, v0 clamp
832 ; GFX10-NEXT: ; return to shader part epilog
833 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
834 %cast = bitcast i32 %result to float
; Mixed-operand case under amdgpu_ps where lhs is not inreg (arrives in v0) and
; rhs is inreg (arrives in s0). The i32 result is bitcast to float so the
; function returns float.
; On GFX6 the expected code is v_min_u32 (with s0 as the first source operand)
; followed by v_sub_i32. On GFX8, GFX9 and GFX10 a single clamped VALU subtract
; of s0 from v0 is expected.
838 define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
839 ; GFX6-LABEL: usubsat_i32_vs:
841 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v0
842 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
843 ; GFX6-NEXT: ; return to shader part epilog
845 ; GFX8-LABEL: usubsat_i32_vs:
847 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], v0, s0 clamp
848 ; GFX8-NEXT: ; return to shader part epilog
850 ; GFX9-LABEL: usubsat_i32_vs:
852 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, s0 clamp
853 ; GFX9-NEXT: ; return to shader part epilog
855 ; GFX10-LABEL: usubsat_i32_vs:
857 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, s0 clamp
858 ; GFX10-NEXT: ; return to shader part epilog
859 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
860 %cast = bitcast i32 %result to float
; Unsigned saturating subtract on <2 x i32> with the default calling convention
; (lhs in v0-v1, rhs in v2-v3, result in v0-v1).
; The expected code handles each element separately. On GFX6 every element is a
; v_min_u32 / v_sub_i32 pair. On GFX8, GFX9 and GFX10 every element is one
; clamped VALU subtract.
864 define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
865 ; GFX6-LABEL: v_usubsat_v2i32:
867 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
868 ; GFX6-NEXT: v_min_u32_e32 v2, v0, v2
869 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
870 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v3
871 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
872 ; GFX6-NEXT: s_setpc_b64 s[30:31]
874 ; GFX8-LABEL: v_usubsat_v2i32:
876 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
878 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
879 ; GFX8-NEXT: s_setpc_b64 s[30:31]
881 ; GFX9-LABEL: v_usubsat_v2i32:
883 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
884 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp
885 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp
886 ; GFX9-NEXT: s_setpc_b64 s[30:31]
888 ; GFX10-LABEL: v_usubsat_v2i32:
890 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
892 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp
893 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp
894 ; GFX10-NEXT: s_setpc_b64 s[30:31]
895 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
896 ret <2 x i32> %result
; Unsigned saturating subtract on <2 x i32> with both operands passed inreg
; under amdgpu_ps (lhs in s0-s1, rhs in s2-s3, result in s0-s1).
; On GFX6 each element is an s_min_u32 / s_sub_i32 pair on the scalar unit.
; On GFX8, GFX9 and GFX10 each element is a clamped VALU subtract followed by a
; v_readfirstlane_b32 back into the result SGPR. GFX8 and GFX9 first copy the
; rhs SGPRs into VGPRs, while GFX10 uses two SGPR source operands directly.
899 define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
900 ; GFX6-LABEL: s_usubsat_v2i32:
902 ; GFX6-NEXT: s_min_u32 s2, s0, s2
903 ; GFX6-NEXT: s_sub_i32 s0, s0, s2
904 ; GFX6-NEXT: s_min_u32 s2, s1, s3
905 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
906 ; GFX6-NEXT: ; return to shader part epilog
908 ; GFX8-LABEL: s_usubsat_v2i32:
910 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
911 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
912 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], s0, v0 clamp
913 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
914 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
915 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
916 ; GFX8-NEXT: ; return to shader part epilog
918 ; GFX9-LABEL: s_usubsat_v2i32:
920 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
921 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
922 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
923 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
924 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
925 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
926 ; GFX9-NEXT: ; return to shader part epilog
928 ; GFX10-LABEL: s_usubsat_v2i32:
930 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s2 clamp
931 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s3 clamp
932 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
933 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
934 ; GFX10-NEXT: ; return to shader part epilog
935 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
936 ret <2 x i32> %result
; Unsigned saturating subtract on <3 x i32> with the default calling convention
; (lhs in v0-v2, rhs in v3-v5, result in v0-v2).
; The expected code handles each of the three elements separately. On GFX6
; every element is a v_min_u32 / v_sub_i32 pair. On GFX8, GFX9 and GFX10 every
; element is one clamped VALU subtract.
939 define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
940 ; GFX6-LABEL: v_usubsat_v3i32:
942 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943 ; GFX6-NEXT: v_min_u32_e32 v3, v0, v3
944 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
945 ; GFX6-NEXT: v_min_u32_e32 v3, v1, v4
946 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
947 ; GFX6-NEXT: v_min_u32_e32 v3, v2, v5
948 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
949 ; GFX6-NEXT: s_setpc_b64 s[30:31]
951 ; GFX8-LABEL: v_usubsat_v3i32:
953 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
954 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
955 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
956 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
957 ; GFX8-NEXT: s_setpc_b64 s[30:31]
959 ; GFX9-LABEL: v_usubsat_v3i32:
961 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
962 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp
963 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp
964 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp
965 ; GFX9-NEXT: s_setpc_b64 s[30:31]
967 ; GFX10-LABEL: v_usubsat_v3i32:
969 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
970 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
971 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp
972 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp
973 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp
974 ; GFX10-NEXT: s_setpc_b64 s[30:31]
975 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
976 ret <3 x i32> %result
; Unsigned saturating subtract on <3 x i32> with both operands passed inreg
; under amdgpu_ps (lhs in s0-s2, rhs in s3-s5, result in s0-s2).
; On GFX6 each element is an s_min_u32 / s_sub_i32 pair on the scalar unit.
; On GFX8, GFX9 and GFX10 each element is a clamped VALU subtract followed by a
; v_readfirstlane_b32 back into the result SGPR. GFX8 and GFX9 first copy the
; rhs SGPRs into VGPRs, while GFX10 uses two SGPR source operands directly.
979 define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
980 ; GFX6-LABEL: s_usubsat_v3i32:
982 ; GFX6-NEXT: s_min_u32 s3, s0, s3
983 ; GFX6-NEXT: s_sub_i32 s0, s0, s3
984 ; GFX6-NEXT: s_min_u32 s3, s1, s4
985 ; GFX6-NEXT: s_sub_i32 s1, s1, s3
986 ; GFX6-NEXT: s_min_u32 s3, s2, s5
987 ; GFX6-NEXT: s_sub_i32 s2, s2, s3
988 ; GFX6-NEXT: ; return to shader part epilog
990 ; GFX8-LABEL: s_usubsat_v3i32:
992 ; GFX8-NEXT: v_mov_b32_e32 v0, s3
993 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
994 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
995 ; GFX8-NEXT: v_sub_u32_e64 v0, s[6:7], s0, v0 clamp
996 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
997 ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
998 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
999 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1000 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1001 ; GFX8-NEXT: ; return to shader part epilog
1003 ; GFX9-LABEL: s_usubsat_v3i32:
1005 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
1006 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1007 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
1008 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1009 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1010 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1011 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1012 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1013 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1014 ; GFX9-NEXT: ; return to shader part epilog
1016 ; GFX10-LABEL: s_usubsat_v3i32:
1018 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s3 clamp
1019 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s4 clamp
1020 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s5 clamp
1021 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1022 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
1023 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
1024 ; GFX10-NEXT: ; return to shader part epilog
1025 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1026 ret <3 x i32> %result
; Unsigned saturating subtract on <4 x i32> with the default calling convention
; (lhs in v0-v3, rhs in v4-v7, result in v0-v3).
; The expected code handles each of the four elements separately. On GFX6 every
; element is a v_min_u32 / v_sub_i32 pair. On GFX8, GFX9 and GFX10 every
; element is one clamped VALU subtract.
1029 define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1030 ; GFX6-LABEL: v_usubsat_v4i32:
1032 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033 ; GFX6-NEXT: v_min_u32_e32 v4, v0, v4
1034 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
1035 ; GFX6-NEXT: v_min_u32_e32 v4, v1, v5
1036 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
1037 ; GFX6-NEXT: v_min_u32_e32 v4, v2, v6
1038 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
1039 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v7
1040 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
1041 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1043 ; GFX8-LABEL: v_usubsat_v4i32:
1045 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1046 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
1047 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
1048 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
1049 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
1050 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1052 ; GFX9-LABEL: v_usubsat_v4i32:
1054 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1055 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp
1056 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp
1057 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp
1058 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp
1059 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1061 ; GFX10-LABEL: v_usubsat_v4i32:
1063 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1064 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1065 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp
1066 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp
1067 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp
1068 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp
1069 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1070 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1071 ret <4 x i32> %result
; Unsigned saturating subtract on <4 x i32> with both operands passed inreg
; under amdgpu_ps (lhs in s0-s3, rhs in s4-s7, result in s0-s3).
; On GFX6 each element is an s_min_u32 / s_sub_i32 pair on the scalar unit.
; On GFX8, GFX9 and GFX10 each element is a clamped VALU subtract followed by a
; v_readfirstlane_b32 back into the result SGPR. GFX8 and GFX9 first copy the
; rhs SGPRs into VGPRs, while GFX10 uses two SGPR source operands directly.
1074 define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1075 ; GFX6-LABEL: s_usubsat_v4i32:
1077 ; GFX6-NEXT: s_min_u32 s4, s0, s4
1078 ; GFX6-NEXT: s_sub_i32 s0, s0, s4
1079 ; GFX6-NEXT: s_min_u32 s4, s1, s5
1080 ; GFX6-NEXT: s_sub_i32 s1, s1, s4
1081 ; GFX6-NEXT: s_min_u32 s4, s2, s6
1082 ; GFX6-NEXT: s_sub_i32 s2, s2, s4
1083 ; GFX6-NEXT: s_min_u32 s4, s3, s7
1084 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
1085 ; GFX6-NEXT: ; return to shader part epilog
1087 ; GFX8-LABEL: s_usubsat_v4i32:
1089 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1090 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1091 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
1092 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
1093 ; GFX8-NEXT: v_sub_u32_e64 v0, s[8:9], s0, v0 clamp
1094 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1095 ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1096 ; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp
1097 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1098 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1099 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1100 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1101 ; GFX8-NEXT: ; return to shader part epilog
1103 ; GFX9-LABEL: s_usubsat_v4i32:
1105 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1106 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1107 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1108 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1109 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1110 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1111 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1112 ; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp
1113 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1114 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1115 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1116 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1117 ; GFX9-NEXT: ; return to shader part epilog
1119 ; GFX10-LABEL: s_usubsat_v4i32:
1121 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s4 clamp
1122 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s5 clamp
1123 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s6 clamp
1124 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s7 clamp
1125 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1126 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
1127 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
1128 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
1129 ; GFX10-NEXT: ; return to shader part epilog
1130 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1131 ret <4 x i32> %result
; Unsigned saturating subtract on <5 x i32> with the default calling convention
; (lhs in v0-v4, rhs in v5-v9, result in v0-v4).
; The expected code handles each of the five elements separately. On GFX6 every
; element is a v_min_u32 / v_sub_i32 pair. On GFX8, GFX9 and GFX10 every
; element is one clamped VALU subtract.
1134 define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1135 ; GFX6-LABEL: v_usubsat_v5i32:
1137 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1138 ; GFX6-NEXT: v_min_u32_e32 v5, v0, v5
1139 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
1140 ; GFX6-NEXT: v_min_u32_e32 v5, v1, v6
1141 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
1142 ; GFX6-NEXT: v_min_u32_e32 v5, v2, v7
1143 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
1144 ; GFX6-NEXT: v_min_u32_e32 v5, v3, v8
1145 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
1146 ; GFX6-NEXT: v_min_u32_e32 v5, v4, v9
1147 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
1148 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1150 ; GFX8-LABEL: v_usubsat_v5i32:
1152 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v5 clamp
1154 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v6 clamp
1155 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v7 clamp
1156 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v8 clamp
1157 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v9 clamp
1158 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1160 ; GFX9-LABEL: v_usubsat_v5i32:
1162 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v5 clamp
1164 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v6 clamp
1165 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v7 clamp
1166 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v8 clamp
1167 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v9 clamp
1168 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1170 ; GFX10-LABEL: v_usubsat_v5i32:
1172 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1173 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1174 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v5 clamp
1175 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v6 clamp
1176 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v7 clamp
1177 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v8 clamp
1178 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v9 clamp
1179 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1180 %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1181 ret <5 x i32> %result
; Unsigned saturating subtract on <5 x i32> with both operands passed inreg
; under amdgpu_ps (lhs in s0-s4, rhs in s5-s9, result in s0-s4).
; On GFX6 each element is an s_min_u32 / s_sub_i32 pair on the scalar unit.
; On GFX8, GFX9 and GFX10 each element is a clamped VALU subtract followed by a
; v_readfirstlane_b32 back into the result SGPR. GFX8 and GFX9 first copy the
; rhs SGPRs into VGPRs, while GFX10 uses two SGPR source operands directly.
1184 define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1185 ; GFX6-LABEL: s_usubsat_v5i32:
1187 ; GFX6-NEXT: s_min_u32 s5, s0, s5
1188 ; GFX6-NEXT: s_sub_i32 s0, s0, s5
1189 ; GFX6-NEXT: s_min_u32 s5, s1, s6
1190 ; GFX6-NEXT: s_sub_i32 s1, s1, s5
1191 ; GFX6-NEXT: s_min_u32 s5, s2, s7
1192 ; GFX6-NEXT: s_sub_i32 s2, s2, s5
1193 ; GFX6-NEXT: s_min_u32 s5, s3, s8
1194 ; GFX6-NEXT: s_sub_i32 s3, s3, s5
1195 ; GFX6-NEXT: s_min_u32 s5, s4, s9
1196 ; GFX6-NEXT: s_sub_i32 s4, s4, s5
1197 ; GFX6-NEXT: ; return to shader part epilog
1199 ; GFX8-LABEL: s_usubsat_v5i32:
1201 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1202 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1203 ; GFX8-NEXT: v_mov_b32_e32 v2, s7
1204 ; GFX8-NEXT: v_mov_b32_e32 v3, s8
1205 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
1206 ; GFX8-NEXT: v_sub_u32_e64 v0, s[10:11], s0, v0 clamp
1207 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp
1208 ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp
1209 ; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp
1210 ; GFX8-NEXT: v_sub_u32_e64 v4, s[0:1], s4, v4 clamp
1211 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1212 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1213 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1214 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1215 ; GFX8-NEXT: v_readfirstlane_b32 s4, v4
1216 ; GFX8-NEXT: ; return to shader part epilog
1218 ; GFX9-LABEL: s_usubsat_v5i32:
1220 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
1221 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1222 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
1223 ; GFX9-NEXT: v_mov_b32_e32 v3, s8
1224 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
1225 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1226 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1227 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1228 ; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp
1229 ; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp
1230 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1231 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1232 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1233 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1234 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1235 ; GFX9-NEXT: ; return to shader part epilog
1237 ; GFX10-LABEL: s_usubsat_v5i32:
1239 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp
1240 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp
1241 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp
1242 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp
1243 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, s4, s9 clamp
1244 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1245 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
1246 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
1247 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
1248 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4
1249 ; GFX10-NEXT: ; return to shader part epilog
1250 %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1251 ret <5 x i32> %result
; Unsigned saturating subtract on <16 x i32> with the default calling
; convention (lhs in v0-v15, rhs elements 0-14 in v16-v30).
; The expected code for every target contains one buffer_load_dword addressed
; through s32 whose result is the second operand of the final subtract on v15,
; guarded by s_waitcnt vmcnt(0). Presumably this is the last rhs element being
; read from the stack because it is not passed in a register - confirm against
; the calling convention.
; On GFX6 every element is a v_min_u32 / v_sub_i32 pair. On GFX8, GFX9 and
; GFX10 every element is one clamped VALU subtract.
1254 define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1255 ; GFX6-LABEL: v_usubsat_v16i32:
1257 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1258 ; GFX6-NEXT: v_min_u32_e32 v16, v0, v16
1259 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
1260 ; GFX6-NEXT: v_min_u32_e32 v16, v1, v17
1261 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16
1262 ; GFX6-NEXT: v_min_u32_e32 v16, v2, v18
1263 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16
1264 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19
1265 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16
1266 ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
1267 ; GFX6-NEXT: v_min_u32_e32 v17, v4, v20
1268 ; GFX6-NEXT: v_min_u32_e32 v18, v5, v21
1269 ; GFX6-NEXT: v_min_u32_e32 v19, v6, v22
1270 ; GFX6-NEXT: v_min_u32_e32 v20, v7, v23
1271 ; GFX6-NEXT: v_min_u32_e32 v21, v8, v24
1272 ; GFX6-NEXT: v_min_u32_e32 v22, v9, v25
1273 ; GFX6-NEXT: v_min_u32_e32 v23, v10, v26
1274 ; GFX6-NEXT: v_min_u32_e32 v24, v11, v27
1275 ; GFX6-NEXT: v_min_u32_e32 v25, v12, v28
1276 ; GFX6-NEXT: v_min_u32_e32 v26, v13, v29
1277 ; GFX6-NEXT: v_min_u32_e32 v27, v14, v30
1278 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17
1279 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18
1280 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19
1281 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20
1282 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21
1283 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22
1284 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23
1285 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24
1286 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25
1287 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26
1288 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27
1289 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1290 ; GFX6-NEXT: v_min_u32_e32 v16, v15, v16
1291 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
1292 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1294 ; GFX8-LABEL: v_usubsat_v16i32:
1296 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1297 ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
1298 ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
1299 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
1300 ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
1301 ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
1302 ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
1303 ; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
1304 ; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
1305 ; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
1306 ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
1307 ; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
1308 ; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
1309 ; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
1310 ; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
1311 ; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
1312 ; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
1313 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1314 ; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
1315 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1317 ; GFX9-LABEL: v_usubsat_v16i32:
1319 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1320 ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp
1321 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
1322 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp
1323 ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp
1324 ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp
1325 ; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp
1326 ; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp
1327 ; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp
1328 ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp
1329 ; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp
1330 ; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp
1331 ; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp
1332 ; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp
1333 ; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp
1334 ; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp
1335 ; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp
1336 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1337 ; GFX9-NEXT: v_sub_u32_e64 v15, v15, v16 clamp
1338 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1340 ; GFX10-LABEL: v_usubsat_v16i32:
1342 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1343 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1344 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
1345 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp
1346 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp
1347 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp
1348 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp
1349 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp
1350 ; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp
1351 ; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp
1352 ; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp
1353 ; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp
1354 ; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp
1355 ; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp
1356 ; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp
1357 ; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp
1358 ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp
1359 ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp
1360 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1361 ; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp
1362 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1363 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1364 ret <16 x i32> %result
; Unsigned saturating subtract on <16 x i32> with both operands passed inreg
; under amdgpu_ps (lhs in s0-s15, rhs in s16-s31, result in s0-s15).
; On GFX6 each element is an s_min_u32 / s_sub_i32 pair on the scalar unit,
; reusing s16 as the temporary for the minimum.
; On GFX8, GFX9 and GFX10 each element is a clamped VALU subtract followed by a
; v_readfirstlane_b32 back into the result SGPR. GFX8 and GFX9 first copy all
; sixteen rhs SGPRs into v0-v15, while GFX10 uses two SGPR source operands
; directly.
1367 define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
1368 ; GFX6-LABEL: s_usubsat_v16i32:
1370 ; GFX6-NEXT: s_min_u32 s16, s0, s16
1371 ; GFX6-NEXT: s_sub_i32 s0, s0, s16
1372 ; GFX6-NEXT: s_min_u32 s16, s1, s17
1373 ; GFX6-NEXT: s_sub_i32 s1, s1, s16
1374 ; GFX6-NEXT: s_min_u32 s16, s2, s18
1375 ; GFX6-NEXT: s_sub_i32 s2, s2, s16
1376 ; GFX6-NEXT: s_min_u32 s16, s3, s19
1377 ; GFX6-NEXT: s_sub_i32 s3, s3, s16
1378 ; GFX6-NEXT: s_min_u32 s16, s4, s20
1379 ; GFX6-NEXT: s_sub_i32 s4, s4, s16
1380 ; GFX6-NEXT: s_min_u32 s16, s5, s21
1381 ; GFX6-NEXT: s_sub_i32 s5, s5, s16
1382 ; GFX6-NEXT: s_min_u32 s16, s6, s22
1383 ; GFX6-NEXT: s_sub_i32 s6, s6, s16
1384 ; GFX6-NEXT: s_min_u32 s16, s7, s23
1385 ; GFX6-NEXT: s_sub_i32 s7, s7, s16
1386 ; GFX6-NEXT: s_min_u32 s16, s8, s24
1387 ; GFX6-NEXT: s_sub_i32 s8, s8, s16
1388 ; GFX6-NEXT: s_min_u32 s16, s9, s25
1389 ; GFX6-NEXT: s_sub_i32 s9, s9, s16
1390 ; GFX6-NEXT: s_min_u32 s16, s10, s26
1391 ; GFX6-NEXT: s_sub_i32 s10, s10, s16
1392 ; GFX6-NEXT: s_min_u32 s16, s11, s27
1393 ; GFX6-NEXT: s_sub_i32 s11, s11, s16
1394 ; GFX6-NEXT: s_min_u32 s16, s12, s28
1395 ; GFX6-NEXT: s_sub_i32 s12, s12, s16
1396 ; GFX6-NEXT: s_min_u32 s16, s13, s29
1397 ; GFX6-NEXT: s_sub_i32 s13, s13, s16
1398 ; GFX6-NEXT: s_min_u32 s16, s14, s30
1399 ; GFX6-NEXT: s_sub_i32 s14, s14, s16
1400 ; GFX6-NEXT: s_min_u32 s16, s15, s31
1401 ; GFX6-NEXT: s_sub_i32 s15, s15, s16
1402 ; GFX6-NEXT: ; return to shader part epilog
1404 ; GFX8-LABEL: s_usubsat_v16i32:
1406 ; GFX8-NEXT: v_mov_b32_e32 v0, s16
1407 ; GFX8-NEXT: v_mov_b32_e32 v1, s17
1408 ; GFX8-NEXT: v_mov_b32_e32 v2, s18
1409 ; GFX8-NEXT: v_mov_b32_e32 v3, s19
1410 ; GFX8-NEXT: v_mov_b32_e32 v4, s20
1411 ; GFX8-NEXT: v_mov_b32_e32 v5, s21
1412 ; GFX8-NEXT: v_mov_b32_e32 v6, s22
1413 ; GFX8-NEXT: v_mov_b32_e32 v7, s23
1414 ; GFX8-NEXT: v_mov_b32_e32 v8, s24
1415 ; GFX8-NEXT: v_mov_b32_e32 v9, s25
1416 ; GFX8-NEXT: v_mov_b32_e32 v10, s26
1417 ; GFX8-NEXT: v_mov_b32_e32 v11, s27
1418 ; GFX8-NEXT: v_mov_b32_e32 v12, s28
1419 ; GFX8-NEXT: v_mov_b32_e32 v13, s29
1420 ; GFX8-NEXT: v_mov_b32_e32 v14, s30
1421 ; GFX8-NEXT: v_mov_b32_e32 v15, s31
1422 ; GFX8-NEXT: v_sub_u32_e64 v0, s[32:33], s0, v0 clamp
1423 ; GFX8-NEXT: v_sub_u32_e64 v1, s[16:17], s1, v1 clamp
1424 ; GFX8-NEXT: v_sub_u32_e64 v2, s[16:17], s2, v2 clamp
1425 ; GFX8-NEXT: v_sub_u32_e64 v3, s[2:3], s3, v3 clamp
1426 ; GFX8-NEXT: v_sub_u32_e64 v4, s[2:3], s4, v4 clamp
1427 ; GFX8-NEXT: v_sub_u32_e64 v5, s[2:3], s5, v5 clamp
1428 ; GFX8-NEXT: v_sub_u32_e64 v6, s[2:3], s6, v6 clamp
1429 ; GFX8-NEXT: v_sub_u32_e64 v7, s[2:3], s7, v7 clamp
1430 ; GFX8-NEXT: v_sub_u32_e64 v8, s[2:3], s8, v8 clamp
1431 ; GFX8-NEXT: v_sub_u32_e64 v9, s[2:3], s9, v9 clamp
1432 ; GFX8-NEXT: v_sub_u32_e64 v10, s[2:3], s10, v10 clamp
1433 ; GFX8-NEXT: v_sub_u32_e64 v11, s[2:3], s11, v11 clamp
1434 ; GFX8-NEXT: v_sub_u32_e64 v12, s[2:3], s12, v12 clamp
1435 ; GFX8-NEXT: v_sub_u32_e64 v13, s[2:3], s13, v13 clamp
1436 ; GFX8-NEXT: v_sub_u32_e64 v14, s[2:3], s14, v14 clamp
1437 ; GFX8-NEXT: v_sub_u32_e64 v15, s[2:3], s15, v15 clamp
1438 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1439 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1440 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1441 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1442 ; GFX8-NEXT: v_readfirstlane_b32 s4, v4
1443 ; GFX8-NEXT: v_readfirstlane_b32 s5, v5
1444 ; GFX8-NEXT: v_readfirstlane_b32 s6, v6
1445 ; GFX8-NEXT: v_readfirstlane_b32 s7, v7
1446 ; GFX8-NEXT: v_readfirstlane_b32 s8, v8
1447 ; GFX8-NEXT: v_readfirstlane_b32 s9, v9
1448 ; GFX8-NEXT: v_readfirstlane_b32 s10, v10
1449 ; GFX8-NEXT: v_readfirstlane_b32 s11, v11
1450 ; GFX8-NEXT: v_readfirstlane_b32 s12, v12
1451 ; GFX8-NEXT: v_readfirstlane_b32 s13, v13
1452 ; GFX8-NEXT: v_readfirstlane_b32 s14, v14
1453 ; GFX8-NEXT: v_readfirstlane_b32 s15, v15
1454 ; GFX8-NEXT: ; return to shader part epilog
1456 ; GFX9-LABEL: s_usubsat_v16i32:
1458 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
1459 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
1460 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
1461 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
1462 ; GFX9-NEXT: v_mov_b32_e32 v4, s20
1463 ; GFX9-NEXT: v_mov_b32_e32 v5, s21
1464 ; GFX9-NEXT: v_mov_b32_e32 v6, s22
1465 ; GFX9-NEXT: v_mov_b32_e32 v7, s23
1466 ; GFX9-NEXT: v_mov_b32_e32 v8, s24
1467 ; GFX9-NEXT: v_mov_b32_e32 v9, s25
1468 ; GFX9-NEXT: v_mov_b32_e32 v10, s26
1469 ; GFX9-NEXT: v_mov_b32_e32 v11, s27
1470 ; GFX9-NEXT: v_mov_b32_e32 v12, s28
1471 ; GFX9-NEXT: v_mov_b32_e32 v13, s29
1472 ; GFX9-NEXT: v_mov_b32_e32 v14, s30
1473 ; GFX9-NEXT: v_mov_b32_e32 v15, s31
1474 ; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
1475 ; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp
1476 ; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp
1477 ; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp
1478 ; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp
1479 ; GFX9-NEXT: v_sub_u32_e64 v5, s5, v5 clamp
1480 ; GFX9-NEXT: v_sub_u32_e64 v6, s6, v6 clamp
1481 ; GFX9-NEXT: v_sub_u32_e64 v7, s7, v7 clamp
1482 ; GFX9-NEXT: v_sub_u32_e64 v8, s8, v8 clamp
1483 ; GFX9-NEXT: v_sub_u32_e64 v9, s9, v9 clamp
1484 ; GFX9-NEXT: v_sub_u32_e64 v10, s10, v10 clamp
1485 ; GFX9-NEXT: v_sub_u32_e64 v11, s11, v11 clamp
1486 ; GFX9-NEXT: v_sub_u32_e64 v12, s12, v12 clamp
1487 ; GFX9-NEXT: v_sub_u32_e64 v13, s13, v13 clamp
1488 ; GFX9-NEXT: v_sub_u32_e64 v14, s14, v14 clamp
1489 ; GFX9-NEXT: v_sub_u32_e64 v15, s15, v15 clamp
1490 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1491 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1492 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1493 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1494 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1495 ; GFX9-NEXT: v_readfirstlane_b32 s5, v5
1496 ; GFX9-NEXT: v_readfirstlane_b32 s6, v6
1497 ; GFX9-NEXT: v_readfirstlane_b32 s7, v7
1498 ; GFX9-NEXT: v_readfirstlane_b32 s8, v8
1499 ; GFX9-NEXT: v_readfirstlane_b32 s9, v9
1500 ; GFX9-NEXT: v_readfirstlane_b32 s10, v10
1501 ; GFX9-NEXT: v_readfirstlane_b32 s11, v11
1502 ; GFX9-NEXT: v_readfirstlane_b32 s12, v12
1503 ; GFX9-NEXT: v_readfirstlane_b32 s13, v13
1504 ; GFX9-NEXT: v_readfirstlane_b32 s14, v14
1505 ; GFX9-NEXT: v_readfirstlane_b32 s15, v15
1506 ; GFX9-NEXT: ; return to shader part epilog
1508 ; GFX10-LABEL: s_usubsat_v16i32:
1510 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp
1511 ; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp
1512 ; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp
1513 ; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp
1514 ; GFX10-NEXT: v_sub_nc_u32_e64 v4, s4, s20 clamp
1515 ; GFX10-NEXT: v_sub_nc_u32_e64 v5, s5, s21 clamp
1516 ; GFX10-NEXT: v_sub_nc_u32_e64 v6, s6, s22 clamp
1517 ; GFX10-NEXT: v_sub_nc_u32_e64 v7, s7, s23 clamp
1518 ; GFX10-NEXT: v_sub_nc_u32_e64 v8, s8, s24 clamp
1519 ; GFX10-NEXT: v_sub_nc_u32_e64 v9, s9, s25 clamp
1520 ; GFX10-NEXT: v_sub_nc_u32_e64 v10, s10, s26 clamp
1521 ; GFX10-NEXT: v_sub_nc_u32_e64 v11, s11, s27 clamp
1522 ; GFX10-NEXT: v_sub_nc_u32_e64 v12, s12, s28 clamp
1523 ; GFX10-NEXT: v_sub_nc_u32_e64 v13, s13, s29 clamp
1524 ; GFX10-NEXT: v_sub_nc_u32_e64 v14, s14, s30 clamp
1525 ; GFX10-NEXT: v_sub_nc_u32_e64 v15, s15, s31 clamp
1526 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1527 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
1528 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
1529 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
1530 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4
1531 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5
1532 ; GFX10-NEXT: v_readfirstlane_b32 s6, v6
1533 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7
1534 ; GFX10-NEXT: v_readfirstlane_b32 s8, v8
1535 ; GFX10-NEXT: v_readfirstlane_b32 s9, v9
1536 ; GFX10-NEXT: v_readfirstlane_b32 s10, v10
1537 ; GFX10-NEXT: v_readfirstlane_b32 s11, v11
1538 ; GFX10-NEXT: v_readfirstlane_b32 s12, v12
1539 ; GFX10-NEXT: v_readfirstlane_b32 s13, v13
1540 ; GFX10-NEXT: v_readfirstlane_b32 s14, v14
1541 ; GFX10-NEXT: v_readfirstlane_b32 s15, v15
1542 ; GFX10-NEXT: ; return to shader part epilog
1543 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1544 ret <16 x i32> %result
; Unsigned saturating subtract on i16 with the default calling convention
; (operands arrive in v0 and v1, result is returned in v0).
; On GFX6 the expected code shifts both operands left by 16, performs the
; 32-bit v_min_u32 / v_sub_i32 sequence, and shifts the result right by 16.
; On GFX8, GFX9 and GFX10 a single 16-bit VALU subtract with the clamp modifier
; is expected, with no shifts.
1547 define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
1548 ; GFX6-LABEL: v_usubsat_i16:
1550 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1551 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1552 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1553 ; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
1554 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1555 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1556 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1558 ; GFX8-LABEL: v_usubsat_i16:
1560 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1561 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
1562 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1564 ; GFX9-LABEL: v_usubsat_i16:
1566 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1567 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
1568 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1570 ; GFX10-LABEL: v_usubsat_i16:
1572 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1573 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1574 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
1575 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1576 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
; llvm.usub.sat.i16 in an amdgpu_ps function with both operands marked inreg;
; the checked code reads them from s0 and s1 and returns the result in s0.
; The GFX6 checks stay entirely on scalar instructions (s_lshl / s_min_u32 /
; s_sub_i32 / s_lshr). The GFX8 and GFX9 checks copy s1 into v0 before the
; clamped VALU subtract, while the GFX10 checks feed both SGPRs to the
; subtract directly; all three then move the result back with
; v_readfirstlane_b32.
1580 define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
1581 ; GFX6-LABEL: s_usubsat_i16:
1583 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1584 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
1585 ; GFX6-NEXT: s_min_u32 s1, s0, s1
1586 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
1587 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
1588 ; GFX6-NEXT: ; return to shader part epilog
1590 ; GFX8-LABEL: s_usubsat_i16:
1592 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
1593 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1594 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1595 ; GFX8-NEXT: ; return to shader part epilog
1597 ; GFX9-LABEL: s_usubsat_i16:
1599 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1600 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1601 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1602 ; GFX9-NEXT: ; return to shader part epilog
1604 ; GFX10-LABEL: s_usubsat_i16:
1606 ; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
1607 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1608 ; GFX10-NEXT: ; return to shader part epilog
1609 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
; llvm.usub.sat.i16 with a mixed operand placement: %lhs is inreg (read from
; s0 in the checks) and %rhs is a plain argument (read from v0). The i16
; result is bitcast to half; the checks leave it in v0 with no
; v_readfirstlane_b32.
; The GFX6 checks expect the shift / v_min_u32 / v_sub_i32 / shift expansion
; with s0 used as a source operand; the GFX8, GFX9 and GFX10 checks expect
; one clamped 16-bit subtract taking s0 as its first source.
1613 define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
1614 ; GFX6-LABEL: usubsat_i16_sv:
1616 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1617 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1618 ; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
1619 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
1620 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1621 ; GFX6-NEXT: ; return to shader part epilog
1623 ; GFX8-LABEL: usubsat_i16_sv:
1625 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1626 ; GFX8-NEXT: ; return to shader part epilog
1628 ; GFX9-LABEL: usubsat_i16_sv:
1630 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1631 ; GFX9-NEXT: ; return to shader part epilog
1633 ; GFX10-LABEL: usubsat_i16_sv:
1635 ; GFX10-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
1636 ; GFX10-NEXT: ; return to shader part epilog
1637 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1638 %cast = bitcast i16 %result to half
; llvm.usub.sat.i16 with the operand placement reversed relative to
; usubsat_i16_sv: %lhs is a plain argument (read from v0 in the checks) and
; %rhs is inreg (read from s0). The i16 result is bitcast to half and left
; in v0.
; The GFX6 checks expect the min operand order to be (s0, v0) while the
; subtract is v0 - v1; the GFX8, GFX9 and GFX10 checks expect one clamped
; 16-bit subtract taking s0 as its second source.
1642 define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
1643 ; GFX6-LABEL: usubsat_i16_vs:
1645 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1646 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1647 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v0
1648 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1649 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1650 ; GFX6-NEXT: ; return to shader part epilog
1652 ; GFX8-LABEL: usubsat_i16_vs:
1654 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, s0 clamp
1655 ; GFX8-NEXT: ; return to shader part epilog
1657 ; GFX9-LABEL: usubsat_i16_vs:
1659 ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp
1660 ; GFX9-NEXT: ; return to shader part epilog
1662 ; GFX10-LABEL: usubsat_i16_vs:
1664 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
1665 ; GFX10-NEXT: ; return to shader part epilog
1666 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
1667 %cast = bitcast i16 %result to half
; llvm.usub.sat.v2i16 in a function with the default calling convention.
; The GFX6 checks read one element per register (v0..v3) and apply the
; shift / v_min_u32 / v_sub_i32 / shift sequence to each element, leaving the
; results in v0 and v1.
; The GFX8 checks read packed operands from v0 and v1: the low half uses
; v_sub_u16_e64, the high half uses the SDWA form with WORD_1 selects, and
; the halves are recombined with an SDWA shift and or.
; The GFX9 and GFX10 checks expect a single v_pk_sub_u16 with clamp.
1671 define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
1672 ; GFX6-LABEL: v_usubsat_v2i16:
1674 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1675 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1676 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1677 ; GFX6-NEXT: v_min_u32_e32 v2, v0, v2
1678 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
1679 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1680 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1681 ; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
1682 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
1683 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1684 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1685 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1687 ; GFX8-LABEL: v_usubsat_v2i16:
1689 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1690 ; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp
1691 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1692 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
1693 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1694 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1695 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1697 ; GFX9-LABEL: v_usubsat_v2i16:
1699 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1700 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
1701 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1703 ; GFX10-LABEL: v_usubsat_v2i16:
1705 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1706 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1707 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
1708 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1709 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1710 ret <2 x i16> %result
; llvm.usub.sat.v2i16 in an amdgpu_ps function with both operands inreg; the
; <2 x i16> result is bitcast to i32 and every check sequence ends by moving
; it into s0 with v_readfirstlane_b32.
; The GFX6 checks read one element per SGPR (s0..s3), do the per-element
; s_lshl / s_min_u32 / s_sub_i32 work on the scalar unit, and pack the two
; results with v_alignbit_b32.
; The GFX8 checks split the packed inputs with s_lshr_b32, run two clamped
; v_sub_u16_e64 instructions, and recombine with an SDWA shift and or.
; The GFX9 checks copy s1 to v0 before v_pk_sub_u16; the GFX10 checks pass
; both SGPRs to v_pk_sub_u16 directly.
1713 define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
1714 ; GFX6-LABEL: s_usubsat_v2i16:
1716 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1717 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
1718 ; GFX6-NEXT: s_min_u32 s2, s0, s2
1719 ; GFX6-NEXT: s_sub_i32 s0, s0, s2
1720 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
1721 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16
1722 ; GFX6-NEXT: s_min_u32 s2, s1, s2
1723 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
1724 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
1725 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
1726 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
1727 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1728 ; GFX6-NEXT: ; return to shader part epilog
1730 ; GFX8-LABEL: s_usubsat_v2i16:
1732 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
1733 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
1734 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1735 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
1736 ; GFX8-NEXT: v_sub_u16_e64 v1, s2, v1 clamp
1737 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
1738 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1739 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1740 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1741 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1742 ; GFX8-NEXT: ; return to shader part epilog
1744 ; GFX9-LABEL: s_usubsat_v2i16:
1746 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1747 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
1748 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1749 ; GFX9-NEXT: ; return to shader part epilog
1751 ; GFX10-LABEL: s_usubsat_v2i16:
1753 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
1754 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1755 ; GFX10-NEXT: ; return to shader part epilog
1756 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1757 %cast = bitcast <2 x i16> %result to i32
; llvm.usub.sat.v2i16 with %lhs inreg (SGPR sources in the checks) and %rhs a
; plain argument (VGPR sources); the result is bitcast to float and left in
; v0.
; The GFX6 checks handle the two elements separately (s0/v0 then s1/v1) and
; pack the results with v_alignbit_b32.
; The GFX8 checks extract the high half of s0 with s_lshr_b32, copy it to a
; VGPR for the SDWA subtract, and recombine with an SDWA shift and or.
; The GFX9 and GFX10 checks expect a single v_pk_sub_u16 with clamp taking s0
; as its first source.
1761 define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
1762 ; GFX6-LABEL: usubsat_v2i16_sv:
1764 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1765 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1766 ; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
1767 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
1768 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
1769 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1770 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v1
1771 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
1772 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1773 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
1774 ; GFX6-NEXT: ; return to shader part epilog
1776 ; GFX8-LABEL: usubsat_v2i16_sv:
1778 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
1779 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1780 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp
1781 ; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1782 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
1783 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1784 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1785 ; GFX8-NEXT: ; return to shader part epilog
1787 ; GFX9-LABEL: usubsat_v2i16_sv:
1789 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
1790 ; GFX9-NEXT: ; return to shader part epilog
1792 ; GFX10-LABEL: usubsat_v2i16_sv:
1794 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
1795 ; GFX10-NEXT: ; return to shader part epilog
1796 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1797 %cast = bitcast <2 x i16> %result to float
; llvm.usub.sat.v2i16 with the operand placement reversed relative to
; usubsat_v2i16_sv: %lhs is a plain argument (VGPR sources in the checks) and
; %rhs is inreg (SGPR sources); the result is bitcast to float and left in
; v0.
; The GFX6 checks handle the two elements separately and pack the results
; with v_alignbit_b32.
; The GFX8 checks extract the high half of s0 with s_lshr_b32, copy it to a
; VGPR for the SDWA subtract, and recombine with an SDWA shift and or.
; The GFX9 and GFX10 checks expect a single v_pk_sub_u16 with clamp taking s0
; as its second source.
1801 define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
1802 ; GFX6-LABEL: usubsat_v2i16_vs:
1804 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1805 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1806 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v0
1807 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1808 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
1809 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
1810 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v1
1811 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
1812 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1813 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
1814 ; GFX6-NEXT: ; return to shader part epilog
1816 ; GFX8-LABEL: usubsat_v2i16_vs:
1818 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
1819 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1820 ; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp
1821 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1822 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
1823 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1824 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1825 ; GFX8-NEXT: ; return to shader part epilog
1827 ; GFX9-LABEL: usubsat_v2i16_vs:
1829 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 clamp
1830 ; GFX9-NEXT: ; return to shader part epilog
1832 ; GFX10-LABEL: usubsat_v2i16_vs:
1834 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, s0 clamp
1835 ; GFX10-NEXT: ; return to shader part epilog
1836 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1837 %cast = bitcast <2 x i16> %result to float
1841 ; FIXME: v3i16 insert/extract. The v3i16 tests below are left commented out because of this.
1842 ; define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
1843 ; %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1844 ; ret <3 x i16> %result
1847 ; define amdgpu_ps <3 x i16> @s_usubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
1848 ; %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1849 ; ret <3 x i16> %result
; llvm.usub.sat.v4i16 in a function with the default calling convention; the
; <4 x i16> result is bitcast to <2 x float>.
; The GFX6 checks read one element per register (v0..v7), run four
; shift / v_min_u32 / v_sub_i32 groups that reuse v4 as the temporary, and
; pack the results into v0 and v1 with v_alignbit_b32.
; The GFX8 checks expect, for each packed dword, a v_sub_u16_e64 for the low
; half and an SDWA subtract for the high half, followed by SDWA shift and or
; to recombine.
; The GFX9 and GFX10 checks expect two v_pk_sub_u16 instructions with clamp.
1852 define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
1853 ; GFX6-LABEL: v_usubsat_v4i16:
1855 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1856 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1857 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1858 ; GFX6-NEXT: v_min_u32_e32 v4, v0, v4
1859 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
1860 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1861 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
1862 ; GFX6-NEXT: v_min_u32_e32 v4, v1, v4
1863 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
1864 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1865 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
1866 ; GFX6-NEXT: v_min_u32_e32 v4, v2, v4
1867 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
1868 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1869 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
1870 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4
1871 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
1872 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1873 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1874 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
1875 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
1876 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1878 ; GFX8-LABEL: v_usubsat_v4i16:
1880 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1881 ; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp
1882 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1883 ; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp
1884 ; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1885 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
1886 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1887 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1888 ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1889 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1890 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1892 ; GFX9-LABEL: v_usubsat_v4i16:
1894 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1895 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
1896 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
1897 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1899 ; GFX10-LABEL: v_usubsat_v4i16:
1901 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1902 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1903 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp
1904 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp
1905 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1906 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
1907 %cast = bitcast <4 x i16> %result to <2 x float>
1908 ret <2 x float> %cast
; llvm.usub.sat.v4i16 in an amdgpu_ps function with both operands inreg; the
; <4 x i16> result is bitcast to <2 x i32> and every check sequence ends by
; moving the two dwords into s0 and s1 with v_readfirstlane_b32.
; The GFX6 checks read one element per SGPR (s0..s7), do four
; s_lshl / s_min_u32 / s_sub_i32 groups on the scalar unit, and pack pairs
; with v_alignbit_b32.
; The GFX8 checks split each packed input with s_lshr_b32, run four clamped
; v_sub_u16_e64 instructions, and recombine with SDWA shift and or.
; The GFX9 checks copy the %rhs dwords (s2, s3) to VGPRs before v_pk_sub_u16;
; the GFX10 checks pass SGPR pairs to v_pk_sub_u16 directly.
1911 define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
1912 ; GFX6-LABEL: s_usubsat_v4i16:
1914 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1915 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
1916 ; GFX6-NEXT: s_min_u32 s4, s0, s4
1917 ; GFX6-NEXT: s_sub_i32 s0, s0, s4
1918 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
1919 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16
1920 ; GFX6-NEXT: s_min_u32 s4, s1, s4
1921 ; GFX6-NEXT: s_sub_i32 s1, s1, s4
1922 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
1923 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16
1924 ; GFX6-NEXT: s_min_u32 s4, s2, s4
1925 ; GFX6-NEXT: s_sub_i32 s2, s2, s4
1926 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
1927 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16
1928 ; GFX6-NEXT: s_min_u32 s4, s3, s4
1929 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
1930 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
1931 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
1932 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
1933 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
1934 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
1935 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
1936 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1937 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
1938 ; GFX6-NEXT: ; return to shader part epilog
1940 ; GFX8-LABEL: s_usubsat_v4i16:
1942 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16
1943 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16
1944 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16
1945 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1946 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
1947 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1948 ; GFX8-NEXT: v_sub_u16_e64 v1, s4, v1 clamp
1949 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
1950 ; GFX8-NEXT: v_mov_b32_e32 v4, 16
1951 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
1952 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
1953 ; GFX8-NEXT: v_sub_u16_e64 v3, s5, v3 clamp
1954 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1955 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
1956 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1957 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1958 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1959 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1960 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1961 ; GFX8-NEXT: ; return to shader part epilog
1963 ; GFX9-LABEL: s_usubsat_v4i16:
1965 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1966 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1967 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
1968 ; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp
1969 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1970 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1971 ; GFX9-NEXT: ; return to shader part epilog
1973 ; GFX10-LABEL: s_usubsat_v4i16:
1975 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s2 clamp
1976 ; GFX10-NEXT: v_pk_sub_u16 v1, s1, s3 clamp
1977 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1978 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
1979 ; GFX10-NEXT: ; return to shader part epilog
1980 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
1981 %cast = bitcast <4 x i16> %result to <2 x i32>
1986 ; define <5 x i16> @v_usubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
1987 ; %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
1988 ; ret <5 x i16> %result
1991 ; define amdgpu_ps <5 x i16> @s_usubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
1992 ; %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
1993 ; ret <5 x i16> %result
; llvm.usub.sat.v6i16 in a function with the default calling convention; the
; <6 x i16> result is bitcast to <3 x float>.
; The GFX6 checks read one element per register (v0..v11), run six
; shift / v_min_u32 / v_sub_i32 groups that reuse v6 as the temporary, and
; pack the results into v0..v2 with v_alignbit_b32.
; The GFX8 checks expect, for each packed dword, a v_sub_u16_e64 for the low
; half and an SDWA subtract for the high half, followed by SDWA shift and or
; to recombine.
; NOTE(review): the GFX8 checks materialize the constant 16 twice (into v5
; and again into v3); this mirrors the generated output and is not a
; hand-written expectation.
; The GFX9 and GFX10 checks expect three v_pk_sub_u16 instructions with
; clamp.
1996 define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
1997 ; GFX6-LABEL: v_usubsat_v6i16:
1999 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2001 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2002 ; GFX6-NEXT: v_min_u32_e32 v6, v0, v6
2003 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
2004 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2005 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7
2006 ; GFX6-NEXT: v_min_u32_e32 v6, v1, v6
2007 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
2008 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2009 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8
2010 ; GFX6-NEXT: v_min_u32_e32 v6, v2, v6
2011 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
2012 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2013 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9
2014 ; GFX6-NEXT: v_min_u32_e32 v6, v3, v6
2015 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
2016 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2017 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10
2018 ; GFX6-NEXT: v_min_u32_e32 v6, v4, v6
2019 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
2020 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2021 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11
2022 ; GFX6-NEXT: v_min_u32_e32 v6, v5, v6
2023 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6
2024 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2025 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2026 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2027 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2028 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
2029 ; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16
2030 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2032 ; GFX8-LABEL: v_usubsat_v6i16:
2034 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2035 ; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp
2036 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2037 ; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp
2038 ; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2039 ; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp
2040 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2041 ; GFX8-NEXT: v_mov_b32_e32 v5, 16
2042 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2043 ; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2044 ; GFX8-NEXT: v_mov_b32_e32 v3, 16
2045 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2046 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2047 ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2048 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2049 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2051 ; GFX9-LABEL: v_usubsat_v6i16:
2053 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2054 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v3 clamp
2055 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v4 clamp
2056 ; GFX9-NEXT: v_pk_sub_u16 v2, v2, v5 clamp
2057 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2059 ; GFX10-LABEL: v_usubsat_v6i16:
2061 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2062 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2063 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v3 clamp
2064 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, v4 clamp
2065 ; GFX10-NEXT: v_pk_sub_u16 v2, v2, v5 clamp
2066 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2067 %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2068 %cast = bitcast <6 x i16> %result to <3 x float>
2069 ret <3 x float> %cast
; llvm.usub.sat.v6i16 in an amdgpu_ps function with both operands inreg; the
; <6 x i16> result is bitcast to <3 x i32> and every check sequence ends by
; moving the three dwords into s0..s2 with v_readfirstlane_b32.
; The GFX6 checks read one element per SGPR (s0..s11), do six
; s_lshl / s_min_u32 / s_sub_i32 groups on the scalar unit, and pack pairs
; with v_alignbit_b32.
; The GFX8 checks split each packed input with s_lshr_b32, run six clamped
; v_sub_u16_e64 instructions, and recombine with SDWA shift and or.
; The GFX9 checks copy the %rhs dwords (s3..s5) to VGPRs before
; v_pk_sub_u16; the GFX10 checks pass SGPR pairs to v_pk_sub_u16 directly.
2072 define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
2073 ; GFX6-LABEL: s_usubsat_v6i16:
2075 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2076 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
2077 ; GFX6-NEXT: s_min_u32 s6, s0, s6
2078 ; GFX6-NEXT: s_sub_i32 s0, s0, s6
2079 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2080 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16
2081 ; GFX6-NEXT: s_min_u32 s6, s1, s6
2082 ; GFX6-NEXT: s_sub_i32 s1, s1, s6
2083 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2084 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16
2085 ; GFX6-NEXT: s_min_u32 s6, s2, s6
2086 ; GFX6-NEXT: s_sub_i32 s2, s2, s6
2087 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2088 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16
2089 ; GFX6-NEXT: s_min_u32 s6, s3, s6
2090 ; GFX6-NEXT: s_sub_i32 s3, s3, s6
2091 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2092 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16
2093 ; GFX6-NEXT: s_min_u32 s6, s4, s6
2094 ; GFX6-NEXT: s_sub_i32 s4, s4, s6
2095 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
2096 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16
2097 ; GFX6-NEXT: s_min_u32 s6, s5, s6
2098 ; GFX6-NEXT: s_sub_i32 s5, s5, s6
2099 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2100 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2101 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16
2102 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2103 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2104 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2105 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2106 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2107 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
2108 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2109 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2110 ; GFX6-NEXT: v_readfirstlane_b32 s2, v2
2111 ; GFX6-NEXT: ; return to shader part epilog
2113 ; GFX8-LABEL: s_usubsat_v6i16:
2115 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16
2116 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16
2117 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16
2118 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
2119 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16
2120 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16
2121 ; GFX8-NEXT: v_mov_b32_e32 v0, s3
2122 ; GFX8-NEXT: v_sub_u16_e64 v1, s6, v1 clamp
2123 ; GFX8-NEXT: v_mov_b32_e32 v3, s10
2124 ; GFX8-NEXT: v_mov_b32_e32 v6, 16
2125 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16
2126 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
2127 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2128 ; GFX8-NEXT: v_sub_u16_e64 v3, s7, v3 clamp
2129 ; GFX8-NEXT: v_mov_b32_e32 v5, s11
2130 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2131 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
2132 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
2133 ; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp
2134 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2135 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2136 ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
2137 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2138 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2139 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2140 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2141 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2142 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
2143 ; GFX8-NEXT: ; return to shader part epilog
2145 ; GFX9-LABEL: s_usubsat_v6i16:
2147 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
2148 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2149 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
2150 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
2151 ; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp
2152 ; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp
2153 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2154 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2155 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2156 ; GFX9-NEXT: ; return to shader part epilog
2158 ; GFX10-LABEL: s_usubsat_v6i16:
2160 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s3 clamp
2161 ; GFX10-NEXT: v_pk_sub_u16 v1, s1, s4 clamp
2162 ; GFX10-NEXT: v_pk_sub_u16 v2, s2, s5 clamp
2163 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2164 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
2165 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
2166 ; GFX10-NEXT: ; return to shader part epilog
2167 %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2168 %cast = bitcast <6 x i16> %result to <3 x i32>
; llvm.usub.sat.v8i16 in a function with the default calling convention; the
; <8 x i16> result is bitcast to <4 x float>.
; The GFX6 checks read one element per register (v0..v15), run eight
; shift / v_min_u32 / v_sub_i32 groups that reuse v8 as the temporary, and
; pack the results into v0..v3 with v_alignbit_b32.
; The GFX8 checks expect, for each packed dword, a v_sub_u16_e64 for the low
; half and an SDWA subtract for the high half, followed by SDWA shift and or
; to recombine.
; NOTE(review): the GFX8 checks load the constant 16 into v7 twice in a row
; (around the first SDWA shift); this mirrors the generated output and is
; not a hand-written expectation.
; The GFX9 and GFX10 checks expect four v_pk_sub_u16 instructions with clamp.
2172 define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
2173 ; GFX6-LABEL: v_usubsat_v8i16:
2175 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2176 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2177 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
2178 ; GFX6-NEXT: v_min_u32_e32 v8, v0, v8
2179 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
2180 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2181 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9
2182 ; GFX6-NEXT: v_min_u32_e32 v8, v1, v8
2183 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
2184 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2185 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10
2186 ; GFX6-NEXT: v_min_u32_e32 v8, v2, v8
2187 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
2188 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2189 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11
2190 ; GFX6-NEXT: v_min_u32_e32 v8, v3, v8
2191 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8
2192 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2193 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12
2194 ; GFX6-NEXT: v_min_u32_e32 v8, v4, v8
2195 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
2196 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2197 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13
2198 ; GFX6-NEXT: v_min_u32_e32 v8, v5, v8
2199 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
2200 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2201 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14
2202 ; GFX6-NEXT: v_min_u32_e32 v8, v6, v8
2203 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
2204 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
2205 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15
2206 ; GFX6-NEXT: v_min_u32_e32 v8, v7, v8
2207 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8
2208 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2209 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2210 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2211 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2212 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2213 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
2214 ; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16
2215 ; GFX6-NEXT: v_alignbit_b32 v3, v7, v6, 16
2216 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2218 ; GFX8-LABEL: v_usubsat_v8i16:
2220 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2221 ; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp
2222 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2223 ; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp
2224 ; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2225 ; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp
2226 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2227 ; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp
2228 ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2229 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
2230 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2231 ; GFX8-NEXT: v_mov_b32_e32 v7, 16
2232 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2233 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2234 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2235 ; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2236 ; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2237 ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2238 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2239 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2241 ; GFX9-LABEL: v_usubsat_v8i16:
2243 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2244 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v4 clamp
2245 ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v5 clamp
2246 ; GFX9-NEXT: v_pk_sub_u16 v2, v2, v6 clamp
2247 ; GFX9-NEXT: v_pk_sub_u16 v3, v3, v7 clamp
2248 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2250 ; GFX10-LABEL: v_usubsat_v8i16:
2252 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2253 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2254 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v4 clamp
2255 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, v5 clamp
2256 ; GFX10-NEXT: v_pk_sub_u16 v2, v2, v6 clamp
2257 ; GFX10-NEXT: v_pk_sub_u16 v3, v3, v7 clamp
2258 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2259 %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2260 %cast = bitcast <8 x i16> %result to <4 x float>
2261 ret <4 x float> %cast
2264 define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
2265 ; GFX6-LABEL: s_usubsat_v8i16:
2267 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2268 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16
2269 ; GFX6-NEXT: s_min_u32 s8, s0, s8
2270 ; GFX6-NEXT: s_sub_i32 s0, s0, s8
2271 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2272 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16
2273 ; GFX6-NEXT: s_min_u32 s8, s1, s8
2274 ; GFX6-NEXT: s_sub_i32 s1, s1, s8
2275 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2276 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16
2277 ; GFX6-NEXT: s_min_u32 s8, s2, s8
2278 ; GFX6-NEXT: s_sub_i32 s2, s2, s8
2279 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2280 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16
2281 ; GFX6-NEXT: s_min_u32 s8, s3, s8
2282 ; GFX6-NEXT: s_sub_i32 s3, s3, s8
2283 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2284 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16
2285 ; GFX6-NEXT: s_min_u32 s8, s4, s8
2286 ; GFX6-NEXT: s_sub_i32 s4, s4, s8
2287 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
2288 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16
2289 ; GFX6-NEXT: s_min_u32 s8, s5, s8
2290 ; GFX6-NEXT: s_sub_i32 s5, s5, s8
2291 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
2292 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16
2293 ; GFX6-NEXT: s_min_u32 s8, s6, s8
2294 ; GFX6-NEXT: s_sub_i32 s6, s6, s8
2295 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16
2296 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16
2297 ; GFX6-NEXT: s_min_u32 s8, s7, s8
2298 ; GFX6-NEXT: s_sub_i32 s7, s7, s8
2299 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2300 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2301 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16
2302 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16
2303 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2304 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2305 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2306 ; GFX6-NEXT: v_mov_b32_e32 v3, s6
2307 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2308 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2309 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
2310 ; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16
2311 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2312 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2313 ; GFX6-NEXT: v_readfirstlane_b32 s2, v2
2314 ; GFX6-NEXT: v_readfirstlane_b32 s3, v3
2315 ; GFX6-NEXT: ; return to shader part epilog
2317 ; GFX8-LABEL: s_usubsat_v8i16:
2319 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16
2320 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16
2321 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16
2322 ; GFX8-NEXT: v_mov_b32_e32 v1, s12
2323 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16
2324 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16
2325 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16
2326 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2327 ; GFX8-NEXT: v_sub_u16_e64 v1, s8, v1 clamp
2328 ; GFX8-NEXT: v_mov_b32_e32 v3, s13
2329 ; GFX8-NEXT: v_mov_b32_e32 v8, 16
2330 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16
2331 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16
2332 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
2333 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
2334 ; GFX8-NEXT: v_sub_u16_e64 v3, s9, v3 clamp
2335 ; GFX8-NEXT: v_mov_b32_e32 v5, s14
2336 ; GFX8-NEXT: v_mov_b32_e32 v7, s15
2337 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2338 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
2339 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
2340 ; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp
2341 ; GFX8-NEXT: v_mov_b32_e32 v6, s7
2342 ; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp
2343 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2344 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2345 ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
2346 ; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp
2347 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2348 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2349 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2350 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2351 ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2352 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2353 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2354 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
2355 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
2356 ; GFX8-NEXT: ; return to shader part epilog
2358 ; GFX9-LABEL: s_usubsat_v8i16:
2360 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2361 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2362 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
2363 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
2364 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
2365 ; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp
2366 ; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp
2367 ; GFX9-NEXT: v_pk_sub_u16 v3, s3, v3 clamp
2368 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2369 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2370 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2371 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
2372 ; GFX9-NEXT: ; return to shader part epilog
2374 ; GFX10-LABEL: s_usubsat_v8i16:
2376 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s4 clamp
2377 ; GFX10-NEXT: v_pk_sub_u16 v1, s1, s5 clamp
2378 ; GFX10-NEXT: v_pk_sub_u16 v2, s2, s6 clamp
2379 ; GFX10-NEXT: v_pk_sub_u16 v3, s3, s7 clamp
2380 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2381 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
2382 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
2383 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
2384 ; GFX10-NEXT: ; return to shader part epilog
2385 %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2386 %cast = bitcast <8 x i16> %result to <4 x i32>
2390 ; FIXME: The i48 tests below are disabled because i48 add is broken.
2391 ; define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
2392 ; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2396 ; define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
2397 ; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2401 ; define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
2402 ; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2403 ; %ext.result = zext i48 %result to i64
2404 ; %cast = bitcast i64 %ext.result to <2 x float>
2405 ; ret <2 x float> %cast
2408 ; define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
2409 ; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
2410 ; %ext.result = zext i48 %result to i64
2411 ; %cast = bitcast i64 %ext.result to <2 x float>
2412 ; ret <2 x float> %cast
; Unsigned saturating subtract (llvm.usub.sat.i64) of two i64 values passed as
; regular (non-inreg) arguments to a default-calling-convention function.
; On every target the checks expect a 64-bit subtract built from a
; sub / sub-with-borrow pair, an unsigned 64-bit "lhs < rhs" compare, and one
; v_cndmask per 32-bit half choosing between the difference and the constant 0.
2415 define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
2416 ; GFX6-LABEL: v_usubsat_i64:
2418 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2419 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
2420 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
2421 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
2422 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
2423 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
2424 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2426 ; GFX8-LABEL: v_usubsat_i64:
2428 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2429 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
2430 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
2431 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
2432 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
2433 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
2434 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2436 ; GFX9-LABEL: v_usubsat_i64:
2438 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2439 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
2440 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
2441 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
2442 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
2443 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
2444 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2446 ; GFX10-LABEL: v_usubsat_i64:
2448 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2449 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2450 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
2451 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
2452 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
2453 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo
2454 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo
2455 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2456 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
; Unsigned saturating subtract (llvm.usub.sat.i64) with both i64 operands
; marked inreg in an amdgpu_ps function returning i64.
; The checks expect the subtract itself as scalar instructions
; (s_sub_u32 / s_subb_u32), the "lhs < rhs" compare and the zero-select as
; vector instructions (v_cmp_lt_u64 / v_cndmask), and v_readfirstlane copies of
; the selected halves into s0 and s1.
; GFX6/GFX8/GFX9 first copy the scalar values into VGPRs with v_mov; GFX10
; passes the SGPRs to the compare and selects directly.
2460 define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
2461 ; GFX6-LABEL: s_usubsat_i64:
2463 ; GFX6-NEXT: s_sub_u32 s4, s0, s2
2464 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0
2465 ; GFX6-NEXT: s_and_b32 s5, s5, 1
2466 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
2467 ; GFX6-NEXT: s_cmp_lg_u32 s5, 0
2468 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
2469 ; GFX6-NEXT: s_subb_u32 s5, s1, s3
2470 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2471 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2472 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
2473 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
2474 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
2475 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2476 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2477 ; GFX6-NEXT: ; return to shader part epilog
2479 ; GFX8-LABEL: s_usubsat_i64:
2481 ; GFX8-NEXT: s_sub_u32 s4, s0, s2
2482 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0
2483 ; GFX8-NEXT: s_and_b32 s5, s5, 1
2484 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
2485 ; GFX8-NEXT: s_cmp_lg_u32 s5, 0
2486 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2487 ; GFX8-NEXT: s_subb_u32 s5, s1, s3
2488 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2489 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2490 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
2491 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
2492 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
2493 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2494 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2495 ; GFX8-NEXT: ; return to shader part epilog
2497 ; GFX9-LABEL: s_usubsat_i64:
2499 ; GFX9-NEXT: s_sub_u32 s4, s0, s2
2500 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0
2501 ; GFX9-NEXT: s_and_b32 s5, s5, 1
2502 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2503 ; GFX9-NEXT: s_cmp_lg_u32 s5, 0
2504 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2505 ; GFX9-NEXT: s_subb_u32 s5, s1, s3
2506 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2507 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
2508 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
2509 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
2510 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
2511 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2512 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2513 ; GFX9-NEXT: ; return to shader part epilog
2515 ; GFX10-LABEL: s_usubsat_i64:
2517 ; GFX10-NEXT: s_sub_u32 s4, s0, s2
2518 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0
2519 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3]
2520 ; GFX10-NEXT: s_and_b32 s5, s5, 1
2521 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0
2522 ; GFX10-NEXT: s_subb_u32 s1, s1, s3
2523 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0
2524 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0
2525 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2526 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
2527 ; GFX10-NEXT: ; return to shader part epilog
2528 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
; Mixed-operand case for llvm.usub.sat.i64 in an amdgpu_ps function: %lhs is
; inreg, %rhs is a regular argument. The i64 result is bitcast to <2 x float>
; for the return; the checks leave it in v0/v1 with no v_readfirstlane.
; The expected subtract and compare read lhs from s0/s1; GFX6/GFX8/GFX9 first
; copy s1 into a VGPR for the high-half subtract, GFX10 uses s1 directly.
2532 define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
2533 ; GFX6-LABEL: usubsat_i64_sv:
2535 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2536 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v0
2537 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
2538 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2539 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
2540 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2541 ; GFX6-NEXT: ; return to shader part epilog
2543 ; GFX8-LABEL: usubsat_i64_sv:
2545 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2546 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v0
2547 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
2548 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2549 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
2550 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2551 ; GFX8-NEXT: ; return to shader part epilog
2553 ; GFX9-LABEL: usubsat_i64_sv:
2555 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2556 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s0, v0
2557 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v1, vcc
2558 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2559 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
2560 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2561 ; GFX9-NEXT: ; return to shader part epilog
2563 ; GFX10-LABEL: usubsat_i64_sv:
2565 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
2566 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
2567 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
2568 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
2569 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
2570 ; GFX10-NEXT: ; return to shader part epilog
2571 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2572 %cast = bitcast i64 %result to <2 x float>
2573 ret <2 x float> %cast
; Mixed-operand case for llvm.usub.sat.i64 in an amdgpu_ps function: %lhs is a
; regular argument, %rhs is inreg. The i64 result is bitcast to <2 x float>
; for the return; the checks leave it in v0/v1 with no v_readfirstlane.
; With the scalar value as the subtrahend, the checks expect operand-reversed
; forms: v_subrev for the low half on GFX6/GFX8/GFX9, v_subrev_co_ci for the
; high half on GFX10, and a v_cmp_gt_u64 whose first operand is the scalar rhs.
2576 define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
2577 ; GFX6-LABEL: usubsat_i64_vs:
2579 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2580 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v0
2581 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
2582 ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
2583 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
2584 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2585 ; GFX6-NEXT: ; return to shader part epilog
2587 ; GFX8-LABEL: usubsat_i64_vs:
2589 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2590 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v0
2591 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
2592 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
2593 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
2594 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2595 ; GFX8-NEXT: ; return to shader part epilog
2597 ; GFX9-LABEL: usubsat_i64_vs:
2599 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2600 ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v0
2601 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
2602 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
2603 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
2604 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2605 ; GFX9-NEXT: ; return to shader part epilog
2607 ; GFX10-LABEL: usubsat_i64_vs:
2609 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
2610 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
2611 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
2612 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
2613 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
2614 ; GFX10-NEXT: ; return to shader part epilog
2615 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
2616 %cast = bitcast i64 %result to <2 x float>
2617 ret <2 x float> %cast
; Vector form: llvm.usub.sat.v2i64 on regular (non-inreg) arguments in a
; default-calling-convention function.
; The checks expect the 64-bit sequence (sub, sub-with-borrow, v_cmp_lt_u64,
; two v_cndmask against 0) once per element.
; GFX6/GFX8/GFX9 finish element 0 before starting element 1 and reuse vcc;
; GFX10 keeps element 0's condition in vcc_lo and element 1's in s4, so both
; compares are issued before any of the four selects.
2620 define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
2621 ; GFX6-LABEL: v_usubsat_v2i64:
2623 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2624 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
2625 ; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc
2626 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
2627 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc
2628 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
2629 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6
2630 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
2631 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
2632 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc
2633 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc
2634 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2636 ; GFX8-LABEL: v_usubsat_v2i64:
2638 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2639 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v4
2640 ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc
2641 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
2642 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc
2643 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
2644 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6
2645 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
2646 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
2647 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc
2648 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc
2649 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2651 ; GFX9-LABEL: v_usubsat_v2i64:
2653 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2654 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4
2655 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
2656 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
2657 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc
2658 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
2659 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6
2660 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
2661 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
2662 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc
2663 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc
2664 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2666 ; GFX10-LABEL: v_usubsat_v2i64:
2668 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2669 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2670 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
2671 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
2672 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
2673 ; GFX10-NEXT: v_sub_co_u32 v4, s4, v2, v6
2674 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v3, v7, s4
2675 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
2676 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo
2677 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo
2678 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4
2679 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0, s4
2680 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2681 %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2682 ret <2 x i64> %result
; Vector form with both <2 x i64> operands marked inreg in an amdgpu_ps
; function (llvm.usub.sat.v2i64).
; For each element the checks expect a scalar s_sub_u32 / s_subb_u32 subtract,
; a vector v_cmp_lt_u64 "lhs < rhs" compare, and two v_cndmask selects against
; 0; the four result dwords are then copied to s0-s3 with v_readfirstlane.
; GFX6/GFX8/GFX9 stage the scalar values in VGPRs with v_mov; GFX10 passes the
; SGPRs to the compares and selects directly.
2685 define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
2686 ; GFX6-LABEL: s_usubsat_v2i64:
2688 ; GFX6-NEXT: s_sub_u32 s8, s0, s4
2689 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0
2690 ; GFX6-NEXT: s_and_b32 s9, s9, 1
2691 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0
2692 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
2693 ; GFX6-NEXT: s_subb_u32 s9, s1, s5
2694 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
2695 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2696 ; GFX6-NEXT: s_sub_u32 s0, s2, s6
2697 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0
2698 ; GFX6-NEXT: s_and_b32 s1, s1, 1
2699 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
2700 ; GFX6-NEXT: v_mov_b32_e32 v2, s8
2701 ; GFX6-NEXT: v_mov_b32_e32 v3, s9
2702 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0
2703 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
2704 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2705 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2706 ; GFX6-NEXT: s_subb_u32 s1, s3, s7
2707 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
2708 ; GFX6-NEXT: v_mov_b32_e32 v4, s0
2709 ; GFX6-NEXT: v_mov_b32_e32 v5, s1
2710 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
2711 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
2712 ; GFX6-NEXT: v_readfirstlane_b32 s0, v2
2713 ; GFX6-NEXT: v_readfirstlane_b32 s1, v3
2714 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
2715 ; GFX6-NEXT: v_readfirstlane_b32 s3, v1
2716 ; GFX6-NEXT: ; return to shader part epilog
2718 ; GFX8-LABEL: s_usubsat_v2i64:
2720 ; GFX8-NEXT: s_sub_u32 s8, s0, s4
2721 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0
2722 ; GFX8-NEXT: s_and_b32 s9, s9, 1
2723 ; GFX8-NEXT: s_cmp_lg_u32 s9, 0
2724 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2725 ; GFX8-NEXT: s_subb_u32 s9, s1, s5
2726 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2727 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2728 ; GFX8-NEXT: s_sub_u32 s0, s2, s6
2729 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0
2730 ; GFX8-NEXT: s_and_b32 s1, s1, 1
2731 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
2732 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
2733 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
2734 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0
2735 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2736 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2737 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2738 ; GFX8-NEXT: s_subb_u32 s1, s3, s7
2739 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
2740 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
2741 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
2742 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
2743 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
2744 ; GFX8-NEXT: v_readfirstlane_b32 s0, v2
2745 ; GFX8-NEXT: v_readfirstlane_b32 s1, v3
2746 ; GFX8-NEXT: v_readfirstlane_b32 s2, v0
2747 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1
2748 ; GFX8-NEXT: ; return to shader part epilog
2750 ; GFX9-LABEL: s_usubsat_v2i64:
2752 ; GFX9-NEXT: s_sub_u32 s8, s0, s4
2753 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0
2754 ; GFX9-NEXT: s_and_b32 s9, s9, 1
2755 ; GFX9-NEXT: s_cmp_lg_u32 s9, 0
2756 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2757 ; GFX9-NEXT: s_subb_u32 s9, s1, s5
2758 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2759 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2760 ; GFX9-NEXT: s_sub_u32 s0, s2, s6
2761 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0
2762 ; GFX9-NEXT: s_and_b32 s1, s1, 1
2763 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2764 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
2765 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
2766 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0
2767 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2768 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2769 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2770 ; GFX9-NEXT: s_subb_u32 s1, s3, s7
2771 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
2772 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
2773 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
2774 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
2775 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
2776 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
2777 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
2778 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0
2779 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1
2780 ; GFX9-NEXT: ; return to shader part epilog
2782 ; GFX10-LABEL: s_usubsat_v2i64:
2784 ; GFX10-NEXT: s_sub_u32 s8, s0, s4
2785 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0
2786 ; GFX10-NEXT: s_and_b32 s9, s9, 1
2787 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0
2788 ; GFX10-NEXT: s_subb_u32 s9, s1, s5
2789 ; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[0:1], s[4:5]
2790 ; GFX10-NEXT: s_sub_u32 s0, s2, s6
2791 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0
2792 ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
2793 ; GFX10-NEXT: s_and_b32 s4, s4, 1
2794 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0
2795 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, s1
2796 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, s1
2797 ; GFX10-NEXT: s_subb_u32 s1, s3, s7
2798 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s0, 0, s2
2799 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, s2
2800 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2801 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
2802 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
2803 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
2804 ; GFX10-NEXT: ; return to shader part epilog
2805 %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2806 ret <2 x i64> %result
; 128-bit case: llvm.usub.sat.i128 with both operands marked inreg in an
; amdgpu_ps function returning i128.
; The checks expect:
;   - a four-dword scalar subtract chain (s_sub_u32 followed by three
;     s_subb_u32, with the borrow re-materialised between steps via
;     s_cselect / s_and / s_cmp_lg);
;   - the 128-bit "lhs < rhs" condition assembled from 64-bit pieces: a
;     v_cmp_lt_u64 on the low halves, a v_cmp_lt_u64 on the high halves, and an
;     equality test on the high halves that picks between the two
;     (v_cmp_eq_u64 on GFX6, s_cmp_eq_u64 on GFX8/GFX9/GFX10);
;   - four v_cndmask selects against 0, then v_readfirstlane into s0-s3.
2809 define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
2810 ; GFX6-LABEL: s_usubsat_i128:
2812 ; GFX6-NEXT: s_sub_u32 s8, s0, s4
2813 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0
2814 ; GFX6-NEXT: s_and_b32 s9, s9, 1
2815 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2816 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0
2817 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
2818 ; GFX6-NEXT: s_subb_u32 s9, s1, s5
2819 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
2820 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
2821 ; GFX6-NEXT: s_cselect_b32 s10, 1, 0
2822 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
2823 ; GFX6-NEXT: s_and_b32 s10, s10, 1
2824 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
2825 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
2826 ; GFX6-NEXT: s_cmp_lg_u32 s10, 0
2827 ; GFX6-NEXT: s_subb_u32 s10, s2, s6
2828 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
2829 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
2830 ; GFX6-NEXT: s_cselect_b32 s11, 1, 0
2831 ; GFX6-NEXT: s_and_b32 s11, s11, 1
2832 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
2833 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0
2834 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
2835 ; GFX6-NEXT: s_subb_u32 s11, s3, s7
2836 ; GFX6-NEXT: v_mov_b32_e32 v1, s8
2837 ; GFX6-NEXT: v_mov_b32_e32 v2, s9
2838 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
2839 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
2840 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2841 ; GFX6-NEXT: v_mov_b32_e32 v2, s10
2842 ; GFX6-NEXT: v_mov_b32_e32 v3, s11
2843 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2844 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2845 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2846 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2847 ; GFX6-NEXT: v_readfirstlane_b32 s2, v2
2848 ; GFX6-NEXT: v_readfirstlane_b32 s3, v3
2849 ; GFX6-NEXT: ; return to shader part epilog
2851 ; GFX8-LABEL: s_usubsat_i128:
2853 ; GFX8-NEXT: s_sub_u32 s8, s0, s4
2854 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0
2855 ; GFX8-NEXT: s_and_b32 s9, s9, 1
2856 ; GFX8-NEXT: s_cmp_lg_u32 s9, 0
2857 ; GFX8-NEXT: s_subb_u32 s9, s1, s5
2858 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0
2859 ; GFX8-NEXT: s_and_b32 s10, s10, 1
2860 ; GFX8-NEXT: s_cmp_lg_u32 s10, 0
2861 ; GFX8-NEXT: s_subb_u32 s10, s2, s6
2862 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0
2863 ; GFX8-NEXT: s_and_b32 s11, s11, 1
2864 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2865 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0
2866 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
2867 ; GFX8-NEXT: s_subb_u32 s11, s3, s7
2868 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
2869 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
2870 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2871 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
2872 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0
2873 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
2874 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
2875 ; GFX8-NEXT: s_and_b32 s0, 1, s6
2876 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2877 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
2878 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
2879 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
2880 ; GFX8-NEXT: v_mov_b32_e32 v1, s8
2881 ; GFX8-NEXT: v_mov_b32_e32 v2, s9
2882 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
2883 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
2884 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2885 ; GFX8-NEXT: v_mov_b32_e32 v2, s10
2886 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
2887 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2888 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2889 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2890 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2891 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
2892 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
2893 ; GFX8-NEXT: ; return to shader part epilog
2895 ; GFX9-LABEL: s_usubsat_i128:
2897 ; GFX9-NEXT: s_sub_u32 s8, s0, s4
2898 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0
2899 ; GFX9-NEXT: s_and_b32 s9, s9, 1
2900 ; GFX9-NEXT: s_cmp_lg_u32 s9, 0
2901 ; GFX9-NEXT: s_subb_u32 s9, s1, s5
2902 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0
2903 ; GFX9-NEXT: s_and_b32 s10, s10, 1
2904 ; GFX9-NEXT: s_cmp_lg_u32 s10, 0
2905 ; GFX9-NEXT: s_subb_u32 s10, s2, s6
2906 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0
2907 ; GFX9-NEXT: s_and_b32 s11, s11, 1
2908 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
2909 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0
2910 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
2911 ; GFX9-NEXT: s_subb_u32 s11, s3, s7
2912 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2913 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
2914 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2915 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
2916 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0
2917 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
2918 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
2919 ; GFX9-NEXT: s_and_b32 s0, 1, s6
2920 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2921 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
2922 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
2923 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
2924 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2925 ; GFX9-NEXT: v_mov_b32_e32 v2, s9
2926 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
2927 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
2928 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
2929 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
2930 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
2931 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
2932 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
2933 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2934 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2935 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2936 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
2937 ; GFX9-NEXT: ; return to shader part epilog
2939 ; GFX10-LABEL: s_usubsat_i128:
2941 ; GFX10-NEXT: s_sub_u32 s8, s0, s4
2942 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0
2943 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5]
2944 ; GFX10-NEXT: s_and_b32 s9, s9, 1
2945 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0
2946 ; GFX10-NEXT: s_subb_u32 s9, s1, s5
2947 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0
2948 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
2949 ; GFX10-NEXT: s_and_b32 s10, s10, 1
2950 ; GFX10-NEXT: s_cmp_lg_u32 s10, 0
2951 ; GFX10-NEXT: s_subb_u32 s10, s2, s6
2952 ; GFX10-NEXT: s_cselect_b32 s11, 1, 0
2953 ; GFX10-NEXT: s_and_b32 s11, s11, 1
2954 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0
2955 ; GFX10-NEXT: s_subb_u32 s1, s3, s7
2956 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
2957 ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
2958 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0
2959 ; GFX10-NEXT: s_and_b32 s0, 1, s0
2960 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
2961 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
2962 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
2963 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
2964 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
2965 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
2966 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo
2967 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
2968 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo
2969 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2970 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
2971 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
2972 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
2973 ; GFX10-NEXT: ; return to shader part epilog
2974 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
; Mixed-operand 128-bit case for llvm.usub.sat.i128 in an amdgpu_ps function:
; %lhs is inreg, %rhs is a regular argument. The i128 result is bitcast to
; <4 x float> for the return; the checks leave it in v0-v3 with no
; v_readfirstlane.
; The checks expect a four-dword vector subtract chain, v_cmp_lt_u64 compares
; on the low and on the high 64-bit halves combined through a v_cmp_eq_u64 on
; the high halves, and four v_cndmask selects against 0.
; GFX10 issues the compares before the subtract chain and keeps the final
; condition in s0 rather than vcc.
2978 define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
2979 ; GFX6-LABEL: usubsat_i128_sv:
2981 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
2982 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s0, v0
2983 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
2984 ; GFX6-NEXT: v_mov_b32_e32 v6, s2
2985 ; GFX6-NEXT: v_mov_b32_e32 v7, s3
2986 ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
2987 ; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
2988 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2989 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2990 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
2991 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
2992 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
2993 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2994 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
2995 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
2996 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
2997 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
2998 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
2999 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
3000 ; GFX6-NEXT: ; return to shader part epilog
3002 ; GFX8-LABEL: usubsat_i128_sv:
3004 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
3005 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s0, v0
3006 ; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
3007 ; GFX8-NEXT: v_mov_b32_e32 v6, s2
3008 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
3009 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
3010 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
3011 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
3012 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3013 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
3014 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3015 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3016 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3017 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3018 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3019 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
3020 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
3021 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
3022 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
3023 ; GFX8-NEXT: ; return to shader part epilog
3025 ; GFX9-LABEL: usubsat_i128_sv:
3027 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
3028 ; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s0, v0
3029 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v1, vcc
3030 ; GFX9-NEXT: v_mov_b32_e32 v6, s2
3031 ; GFX9-NEXT: v_mov_b32_e32 v7, s3
3032 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v2, vcc
3033 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
3034 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
3035 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3036 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
3037 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3038 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3039 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3040 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3041 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3042 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
3043 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
3044 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
3045 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
3046 ; GFX9-NEXT: ; return to shader part epilog
3048 ; GFX10-LABEL: usubsat_i128_sv:
3050 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
3051 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
3052 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[2:3]
3053 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
3054 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
3055 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
3056 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
3057 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3058 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
3059 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3060 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3061 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4
3062 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
3063 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0
3064 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0
3065 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0
3066 ; GFX10-NEXT: ; return to shader part epilog
3067 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
3068 %cast = bitcast i128 %result to <4 x float>
3069 ret <4 x float> %cast
; usubsat_i128_vs - i128 llvm.usub.sat in an amdgpu_ps function where %lhs is a
; plain argument and %rhs is marked inreg. In the expected output below, %lhs
; is read from v0-v3 and %rhs from s0-s3. Each target performs a four-dword
; subtract-with-borrow chain and a 128-bit unsigned compare built from two
; 64-bit compares (the high-half result is used unless the high halves are
; equal, in which case the low-half result is used); the difference is then
; replaced by 0 through v_cndmask when the compare indicates %rhs is greater
; than %lhs.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3072 define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
3073 ; GFX6-LABEL: usubsat_i128_vs:
3075 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
3076 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0
3077 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
3078 ; GFX6-NEXT: v_mov_b32_e32 v6, s2
3079 ; GFX6-NEXT: v_mov_b32_e32 v7, s3
3080 ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
3081 ; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
3082 ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
3083 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3084 ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
3085 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3086 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3087 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3088 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
3089 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3090 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
3091 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
3092 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
3093 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
3094 ; GFX6-NEXT: ; return to shader part epilog
3096 ; GFX8-LABEL: usubsat_i128_vs:
3098 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
3099 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0
3100 ; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
3101 ; GFX8-NEXT: v_mov_b32_e32 v6, s2
3102 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
3103 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
3104 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
3105 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
3106 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3107 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
3108 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3109 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3110 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3111 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3112 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3113 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
3114 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
3115 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
3116 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
3117 ; GFX8-NEXT: ; return to shader part epilog
3119 ; GFX9-LABEL: usubsat_i128_vs:
3121 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
3122 ; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s0, v0
3123 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v4, vcc
3124 ; GFX9-NEXT: v_mov_b32_e32 v6, s2
3125 ; GFX9-NEXT: v_mov_b32_e32 v7, s3
3126 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v6, vcc
3127 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
3128 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
3129 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3130 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
3131 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3132 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3133 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3134 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3135 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3136 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
3137 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
3138 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
3139 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
3140 ; GFX9-NEXT: ; return to shader part epilog
3142 ; GFX10-LABEL: usubsat_i128_vs:
3144 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
3145 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
3146 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3]
3147 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
3148 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
3149 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
3150 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
3151 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3152 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
3153 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3154 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3155 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4
3156 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
3157 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0
3158 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0
3159 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0
3160 ; GFX10-NEXT: ; return to shader part epilog
; IR under test: saturating subtract, then reinterpret the 128-bit result as
; <4 x float> for the return value (the expected output above leaves the
; result in v0-v3).
3161 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
3162 %cast = bitcast i128 %result to <4 x float>
3163 ret <4 x float> %cast
; v_usubsat_v2i128 - <2 x i128> llvm.usub.sat with ordinary (non-inreg)
; arguments and the default calling convention. In the expected output below,
; %lhs occupies v0-v7 and %rhs v8-v15, and the function returns with
; s_setpc_b64 s[30:31]. Each i128 element is handled independently: a
; four-dword subtract-with-borrow chain, a 128-bit unsigned less-than built
; from two 64-bit compares plus an equality test on the high halves, and a
; v_cndmask per dword that substitutes 0 when the compare is true. The GFX6,
; GFX8 and GFX9 expectations process the elements one after the other; the
; GFX10 expectations interleave the two elements and keep the select masks
; in s4 and s5.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3166 define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
3167 ; GFX6-LABEL: v_usubsat_v2i128:
3169 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3170 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v0, v8
3171 ; GFX6-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc
3172 ; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc
3173 ; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc
3174 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
3175 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3176 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
3177 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3178 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
3179 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3180 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
3181 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3182 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
3183 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc
3184 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc
3185 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc
3186 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12
3187 ; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
3188 ; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
3189 ; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
3190 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
3191 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3192 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
3193 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
3194 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
3195 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
3196 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
3197 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
3198 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc
3199 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc
3200 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
3201 ; GFX6-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
3202 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3204 ; GFX8-LABEL: v_usubsat_v2i128:
3206 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3207 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v0, v8
3208 ; GFX8-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc
3209 ; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc
3210 ; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc
3211 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
3212 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3213 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
3214 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3215 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
3216 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3217 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3218 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3219 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
3220 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc
3221 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc
3222 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc
3223 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12
3224 ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
3225 ; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
3226 ; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
3227 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
3228 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3229 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
3230 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
3231 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
3232 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
3233 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
3234 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
3235 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc
3236 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc
3237 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
3238 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
3239 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3241 ; GFX9-LABEL: v_usubsat_v2i128:
3243 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3244 ; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v0, v8
3245 ; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v1, v9, vcc
3246 ; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc
3247 ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc
3248 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
3249 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3250 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
3251 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
3252 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
3253 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3254 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3255 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3256 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
3257 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc
3258 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc
3259 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc
3260 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12
3261 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc
3262 ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc
3263 ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc
3264 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
3265 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3266 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
3267 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
3268 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
3269 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
3270 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
3271 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
3272 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc
3273 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc
3274 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
3275 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
3276 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3278 ; GFX10-LABEL: v_usubsat_v2i128:
3280 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3281 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3282 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
3283 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[6:7], v[14:15]
3284 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
3285 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
3286 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
3287 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
3288 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo
3289 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
3290 ; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
3291 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
3292 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
3293 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v16
3294 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
3295 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8
3296 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3297 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5
3298 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3299 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3300 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12
3301 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
3302 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
3303 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
3304 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v8
3305 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
3306 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
3307 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4
3308 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4
3309 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4
3310 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s5
3311 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s5
3312 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, s5
3313 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 0, s5
3314 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single vector saturating-subtract call whose result is
; returned directly.
3315 %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3316 ret <2 x i128> %result
; s_usubsat_v2i128 - <2 x i128> llvm.usub.sat in an amdgpu_ps function with
; both operands marked inreg. In the expected output below, %lhs is read from
; s0-s7 and %rhs from s8-s15. The per-dword subtraction is done with
; s_sub_u32 / s_subb_u32, re-materialising the carry between steps through
; s_cselect_b32 / s_and_b32 / s_cmp_lg_u32. The unsigned less-than compares
; are done with v_cmp_lt_u64; the high-half equality test uses v_cmp_eq_u64
; in the GFX6 expectations and s_cmp_eq_u64 in the GFX8, GFX9 and GFX10
; expectations. The final select-to-zero is performed with v_cndmask and the
; eight result dwords are copied back to s0-s7 with v_readfirstlane_b32.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3319 define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
3320 ; GFX6-LABEL: s_usubsat_v2i128:
3322 ; GFX6-NEXT: s_sub_u32 s16, s0, s8
3323 ; GFX6-NEXT: s_cselect_b32 s17, 1, 0
3324 ; GFX6-NEXT: s_and_b32 s17, s17, 1
3325 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0
3326 ; GFX6-NEXT: s_subb_u32 s17, s1, s9
3327 ; GFX6-NEXT: v_mov_b32_e32 v2, s8
3328 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0
3329 ; GFX6-NEXT: v_mov_b32_e32 v3, s9
3330 ; GFX6-NEXT: s_and_b32 s18, s18, 1
3331 ; GFX6-NEXT: v_mov_b32_e32 v0, s10
3332 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3333 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0
3334 ; GFX6-NEXT: v_mov_b32_e32 v1, s11
3335 ; GFX6-NEXT: s_subb_u32 s18, s2, s10
3336 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3337 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3338 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0
3339 ; GFX6-NEXT: s_and_b32 s19, s19, 1
3340 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
3341 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
3342 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0
3343 ; GFX6-NEXT: s_subb_u32 s19, s3, s11
3344 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
3345 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
3346 ; GFX6-NEXT: s_sub_u32 s0, s4, s12
3347 ; GFX6-NEXT: v_mov_b32_e32 v2, s17
3348 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3349 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0
3350 ; GFX6-NEXT: v_mov_b32_e32 v1, s16
3351 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
3352 ; GFX6-NEXT: s_and_b32 s1, s1, 1
3353 ; GFX6-NEXT: v_mov_b32_e32 v2, s12
3354 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
3355 ; GFX6-NEXT: v_mov_b32_e32 v0, s18
3356 ; GFX6-NEXT: v_mov_b32_e32 v1, s19
3357 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0
3358 ; GFX6-NEXT: v_mov_b32_e32 v3, s13
3359 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
3360 ; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
3361 ; GFX6-NEXT: s_subb_u32 s1, s5, s13
3362 ; GFX6-NEXT: v_mov_b32_e32 v0, s14
3363 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
3364 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0
3365 ; GFX6-NEXT: v_mov_b32_e32 v1, s15
3366 ; GFX6-NEXT: s_and_b32 s2, s2, 1
3367 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3368 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
3369 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0
3370 ; GFX6-NEXT: s_subb_u32 s2, s6, s14
3371 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
3372 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
3373 ; GFX6-NEXT: s_cselect_b32 s3, 1, 0
3374 ; GFX6-NEXT: s_and_b32 s3, s3, 1
3375 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
3376 ; GFX6-NEXT: s_cmp_lg_u32 s3, 0
3377 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
3378 ; GFX6-NEXT: s_subb_u32 s3, s7, s15
3379 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
3380 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
3381 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3382 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
3383 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
3384 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
3385 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
3386 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3387 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3388 ; GFX6-NEXT: v_readfirstlane_b32 s0, v4
3389 ; GFX6-NEXT: v_readfirstlane_b32 s1, v5
3390 ; GFX6-NEXT: v_readfirstlane_b32 s2, v6
3391 ; GFX6-NEXT: v_readfirstlane_b32 s3, v7
3392 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
3393 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1
3394 ; GFX6-NEXT: v_readfirstlane_b32 s6, v2
3395 ; GFX6-NEXT: v_readfirstlane_b32 s7, v3
3396 ; GFX6-NEXT: ; return to shader part epilog
3398 ; GFX8-LABEL: s_usubsat_v2i128:
3400 ; GFX8-NEXT: s_sub_u32 s16, s0, s8
3401 ; GFX8-NEXT: s_cselect_b32 s17, 1, 0
3402 ; GFX8-NEXT: s_and_b32 s17, s17, 1
3403 ; GFX8-NEXT: s_cmp_lg_u32 s17, 0
3404 ; GFX8-NEXT: s_subb_u32 s17, s1, s9
3405 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0
3406 ; GFX8-NEXT: s_and_b32 s18, s18, 1
3407 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0
3408 ; GFX8-NEXT: s_subb_u32 s18, s2, s10
3409 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0
3410 ; GFX8-NEXT: s_and_b32 s19, s19, 1
3411 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
3412 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0
3413 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
3414 ; GFX8-NEXT: s_subb_u32 s19, s3, s11
3415 ; GFX8-NEXT: v_mov_b32_e32 v0, s10
3416 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3417 ; GFX8-NEXT: v_mov_b32_e32 v1, s11
3418 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
3419 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0
3420 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3421 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3422 ; GFX8-NEXT: s_and_b32 s0, 1, s10
3423 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3424 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
3425 ; GFX8-NEXT: s_sub_u32 s0, s4, s12
3426 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0
3427 ; GFX8-NEXT: s_and_b32 s1, s1, 1
3428 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0
3429 ; GFX8-NEXT: s_subb_u32 s1, s5, s13
3430 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
3431 ; GFX8-NEXT: s_and_b32 s2, s2, 1
3432 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
3433 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0
3434 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3435 ; GFX8-NEXT: s_subb_u32 s2, s6, s14
3436 ; GFX8-NEXT: v_mov_b32_e32 v2, s17
3437 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3438 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0
3439 ; GFX8-NEXT: v_mov_b32_e32 v1, s16
3440 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
3441 ; GFX8-NEXT: s_and_b32 s3, s3, 1
3442 ; GFX8-NEXT: v_mov_b32_e32 v2, s12
3443 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
3444 ; GFX8-NEXT: v_mov_b32_e32 v0, s18
3445 ; GFX8-NEXT: v_mov_b32_e32 v1, s19
3446 ; GFX8-NEXT: s_cmp_lg_u32 s3, 0
3447 ; GFX8-NEXT: v_mov_b32_e32 v3, s13
3448 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
3449 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
3450 ; GFX8-NEXT: s_subb_u32 s3, s7, s15
3451 ; GFX8-NEXT: v_mov_b32_e32 v0, s14
3452 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
3453 ; GFX8-NEXT: v_mov_b32_e32 v1, s15
3454 ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
3455 ; GFX8-NEXT: s_cselect_b32 s8, 1, 0
3456 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3457 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
3458 ; GFX8-NEXT: s_and_b32 s4, 1, s8
3459 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3460 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
3461 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
3462 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3463 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
3464 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
3465 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3466 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
3467 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
3468 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
3469 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
3470 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3471 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3472 ; GFX8-NEXT: v_readfirstlane_b32 s0, v4
3473 ; GFX8-NEXT: v_readfirstlane_b32 s1, v5
3474 ; GFX8-NEXT: v_readfirstlane_b32 s2, v6
3475 ; GFX8-NEXT: v_readfirstlane_b32 s3, v7
3476 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0
3477 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1
3478 ; GFX8-NEXT: v_readfirstlane_b32 s6, v2
3479 ; GFX8-NEXT: v_readfirstlane_b32 s7, v3
3480 ; GFX8-NEXT: ; return to shader part epilog
3482 ; GFX9-LABEL: s_usubsat_v2i128:
3484 ; GFX9-NEXT: s_sub_u32 s16, s0, s8
3485 ; GFX9-NEXT: s_cselect_b32 s17, 1, 0
3486 ; GFX9-NEXT: s_and_b32 s17, s17, 1
3487 ; GFX9-NEXT: s_cmp_lg_u32 s17, 0
3488 ; GFX9-NEXT: s_subb_u32 s17, s1, s9
3489 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0
3490 ; GFX9-NEXT: s_and_b32 s18, s18, 1
3491 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0
3492 ; GFX9-NEXT: s_subb_u32 s18, s2, s10
3493 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0
3494 ; GFX9-NEXT: s_and_b32 s19, s19, 1
3495 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
3496 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0
3497 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
3498 ; GFX9-NEXT: s_subb_u32 s19, s3, s11
3499 ; GFX9-NEXT: v_mov_b32_e32 v0, s10
3500 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3501 ; GFX9-NEXT: v_mov_b32_e32 v1, s11
3502 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
3503 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0
3504 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3505 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3506 ; GFX9-NEXT: s_and_b32 s0, 1, s10
3507 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3508 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
3509 ; GFX9-NEXT: s_sub_u32 s0, s4, s12
3510 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0
3511 ; GFX9-NEXT: s_and_b32 s1, s1, 1
3512 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0
3513 ; GFX9-NEXT: s_subb_u32 s1, s5, s13
3514 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
3515 ; GFX9-NEXT: s_and_b32 s2, s2, 1
3516 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
3517 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0
3518 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3519 ; GFX9-NEXT: s_subb_u32 s2, s6, s14
3520 ; GFX9-NEXT: v_mov_b32_e32 v2, s17
3521 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3522 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0
3523 ; GFX9-NEXT: v_mov_b32_e32 v1, s16
3524 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
3525 ; GFX9-NEXT: s_and_b32 s3, s3, 1
3526 ; GFX9-NEXT: v_mov_b32_e32 v2, s12
3527 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
3528 ; GFX9-NEXT: v_mov_b32_e32 v0, s18
3529 ; GFX9-NEXT: v_mov_b32_e32 v1, s19
3530 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0
3531 ; GFX9-NEXT: v_mov_b32_e32 v3, s13
3532 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
3533 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
3534 ; GFX9-NEXT: s_subb_u32 s3, s7, s15
3535 ; GFX9-NEXT: v_mov_b32_e32 v0, s14
3536 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
3537 ; GFX9-NEXT: v_mov_b32_e32 v1, s15
3538 ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
3539 ; GFX9-NEXT: s_cselect_b32 s8, 1, 0
3540 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3541 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
3542 ; GFX9-NEXT: s_and_b32 s4, 1, s8
3543 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3544 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
3545 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
3546 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3547 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
3548 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
3549 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3550 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
3551 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
3552 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
3553 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3554 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
3555 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
3556 ; GFX9-NEXT: v_readfirstlane_b32 s0, v4
3557 ; GFX9-NEXT: v_readfirstlane_b32 s1, v5
3558 ; GFX9-NEXT: v_readfirstlane_b32 s2, v6
3559 ; GFX9-NEXT: v_readfirstlane_b32 s3, v7
3560 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0
3561 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1
3562 ; GFX9-NEXT: v_readfirstlane_b32 s6, v2
3563 ; GFX9-NEXT: v_readfirstlane_b32 s7, v3
3564 ; GFX9-NEXT: ; return to shader part epilog
3566 ; GFX10-LABEL: s_usubsat_v2i128:
3568 ; GFX10-NEXT: s_sub_u32 s16, s0, s8
3569 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0
3570 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
3571 ; GFX10-NEXT: s_and_b32 s17, s17, 1
3572 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0
3573 ; GFX10-NEXT: s_subb_u32 s17, s1, s9
3574 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0
3575 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
3576 ; GFX10-NEXT: s_and_b32 s18, s18, 1
3577 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0
3578 ; GFX10-NEXT: s_subb_u32 s18, s2, s10
3579 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0
3580 ; GFX10-NEXT: s_and_b32 s19, s19, 1
3581 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0
3582 ; GFX10-NEXT: s_subb_u32 s19, s3, s11
3583 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
3584 ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[10:11]
3585 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0
3586 ; GFX10-NEXT: s_and_b32 s0, 1, s20
3587 ; GFX10-NEXT: s_sub_u32 s8, s4, s12
3588 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
3589 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
3590 ; GFX10-NEXT: s_and_b32 s1, s1, 1
3591 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
3592 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0
3593 ; GFX10-NEXT: s_subb_u32 s3, s5, s13
3594 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
3595 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
3596 ; GFX10-NEXT: s_and_b32 s1, s1, 1
3597 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0
3598 ; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13]
3599 ; GFX10-NEXT: s_subb_u32 s10, s6, s14
3600 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0
3601 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
3602 ; GFX10-NEXT: s_and_b32 s0, s0, 1
3603 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
3604 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0
3605 ; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[6:7], s[14:15]
3606 ; GFX10-NEXT: s_subb_u32 s9, s7, s15
3607 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
3608 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0
3609 ; GFX10-NEXT: s_and_b32 s0, 1, s0
3610 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
3611 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
3612 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
3613 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3614 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
3615 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s16, 0, vcc_lo
3616 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s17, 0, vcc_lo
3617 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s18, 0, vcc_lo
3618 ; GFX10-NEXT: v_cndmask_b32_e64 v4, s19, 0, vcc_lo
3619 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3620 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
3621 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
3622 ; GFX10-NEXT: v_readfirstlane_b32 s2, v3
3623 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
3624 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo
3625 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
3626 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo
3627 ; GFX10-NEXT: v_readfirstlane_b32 s3, v4
3628 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
3629 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1
3630 ; GFX10-NEXT: v_readfirstlane_b32 s6, v2
3631 ; GFX10-NEXT: v_readfirstlane_b32 s7, v3
3632 ; GFX10-NEXT: ; return to shader part epilog
; IR under test: a single vector saturating-subtract call whose result is
; returned directly.
3633 %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3634 ret <2 x i128> %result
; Declarations of the llvm.usub.sat intrinsic overloads, grouped by element
; width (i7/i8, i16, i24, i32, i48, i64, i128) with scalar forms listed before
; vector forms. Every declaration refers to attribute group #0, defined on the
; last line.
3637 declare i7 @llvm.usub.sat.i7(i7, i7) #0
3638 declare i8 @llvm.usub.sat.i8(i8, i8) #0
3639 declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) #0
3640 declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) #0
3642 declare i16 @llvm.usub.sat.i16(i16, i16) #0
3643 declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
3644 declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
3645 declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
3646 declare <5 x i16> @llvm.usub.sat.v5i16(<5 x i16>, <5 x i16>) #0
3647 declare <6 x i16> @llvm.usub.sat.v6i16(<6 x i16>, <6 x i16>) #0
3648 declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) #0
3650 declare i24 @llvm.usub.sat.i24(i24, i24) #0
3652 declare i32 @llvm.usub.sat.i32(i32, i32) #0
3653 declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
3654 declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
3655 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
3656 declare <5 x i32> @llvm.usub.sat.v5i32(<5 x i32>, <5 x i32>) #0
3657 declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
3659 declare i48 @llvm.usub.sat.i48(i48, i48) #0
3661 declare i64 @llvm.usub.sat.i64(i64, i64) #0
3662 declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) #0
3664 declare i128 @llvm.usub.sat.i128(i128, i128) #0
3665 declare <2 x i128> @llvm.usub.sat.v2i128(<2 x i128>, <2 x i128>) #0
; Attribute group shared by all of the intrinsic declarations above.
3667 attributes #0 = { nounwind readnone speculatable willreturn }