1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; Signed saturating subtract of two i7 values through @llvm.ssub.sat.i7, using
; the default calling convention. The checks below read the operands from v0
; and v1 and leave the result in v0.
; The GFX6 checks shift both operands left by 25, clamp the second operand with
; max/min against bounds computed from the first operand, subtract, and then
; shift the difference back with an arithmetic right shift by 25. The GFX8
; checks have the same shape using 16-bit instructions and a shift of 9. The
; GFX9 and GFX10PLUS checks replace the min/max sequence with one 16-bit
; subtract that carries the clamp modifier.
8 define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
9 ; GFX6-LABEL: v_ssubsat_i7:
11 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0
13 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
14 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1
15 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
16 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0
17 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3
18 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
19 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3
20 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
21 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 25, v0
22 ; GFX6-NEXT: s_setpc_b64 s[30:31]
24 ; GFX8-LABEL: v_ssubsat_i7:
26 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0
28 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0
29 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1
30 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
31 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0
32 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
33 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1
34 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3
35 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
36 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 9, v0
37 ; GFX8-NEXT: s_setpc_b64 s[30:31]
39 ; GFX9-LABEL: v_ssubsat_i7:
41 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
43 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
44 ; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
45 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
46 ; GFX9-NEXT: s_setpc_b64 s[30:31]
48 ; GFX10PLUS-LABEL: v_ssubsat_i7:
50 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
52 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
53 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
54 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
55 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
56 %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
; Signed saturating subtract of two i7 values through @llvm.ssub.sat.i7, as an
; amdgpu_ps function whose arguments are both marked inreg. The checks below
; read the operands from s0 and s1 and leave the result in s0.
; The GFX6 checks stay entirely on scalar instructions, shifting by 25 around a
; 32-bit max/min clamp. The GFX8 checks shift by 9 and do the clamp with 32-bit
; scalar max/min on values sign-extended from 16 bits (s_sext_i32_i16). The
; GFX9 and GFX10PLUS checks do the clamped subtract with a vector instruction
; and move the result back to s0 with v_readfirstlane_b32.
60 define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
61 ; GFX6-LABEL: s_ssubsat_i7:
63 ; GFX6-NEXT: s_lshl_b32 s0, s0, 25
64 ; GFX6-NEXT: s_max_i32 s2, s0, -1
65 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25
66 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff
67 ; GFX6-NEXT: s_min_i32 s3, s0, -1
68 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000
69 ; GFX6-NEXT: s_max_i32 s1, s2, s1
70 ; GFX6-NEXT: s_min_i32 s1, s1, s3
71 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
72 ; GFX6-NEXT: s_ashr_i32 s0, s0, 25
73 ; GFX6-NEXT: ; return to shader part epilog
75 ; GFX8-LABEL: s_ssubsat_i7:
77 ; GFX8-NEXT: s_lshl_b32 s0, s0, 9
78 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
79 ; GFX8-NEXT: s_sext_i32_i16 s3, -1
80 ; GFX8-NEXT: s_max_i32 s4, s2, s3
81 ; GFX8-NEXT: s_lshl_b32 s1, s1, 9
82 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
83 ; GFX8-NEXT: s_min_i32 s2, s2, s3
84 ; GFX8-NEXT: s_sext_i32_i16 s3, s4
85 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
86 ; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
87 ; GFX8-NEXT: s_max_i32 s1, s3, s1
88 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
89 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
90 ; GFX8-NEXT: s_min_i32 s1, s1, s2
91 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
92 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
93 ; GFX8-NEXT: s_ashr_i32 s0, s0, 9
94 ; GFX8-NEXT: ; return to shader part epilog
96 ; GFX9-LABEL: s_ssubsat_i7:
98 ; GFX9-NEXT: s_lshl_b32 s1, s1, 9
99 ; GFX9-NEXT: s_lshl_b32 s0, s0, 9
100 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
101 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
102 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
103 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
104 ; GFX9-NEXT: ; return to shader part epilog
106 ; GFX10PLUS-LABEL: s_ssubsat_i7:
107 ; GFX10PLUS: ; %bb.0:
108 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
109 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
110 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
111 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
112 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
113 ; GFX10PLUS-NEXT: ; return to shader part epilog
114 %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
; Signed saturating subtract of two i8 values through @llvm.ssub.sat.i8, using
; the default calling convention. The checks below read the operands from v0
; and v1 and leave the result in v0.
; The expected sequences match the i7 vector test except for the shift amounts.
; The GFX6 checks shift by 24 around a 32-bit max/min clamp, and the GFX8 checks
; shift by 8 around a 16-bit max/min clamp. The GFX9 and GFX10PLUS checks shift
; by 8 around a single 16-bit subtract with the clamp modifier.
118 define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
119 ; GFX6-LABEL: v_ssubsat_i8:
121 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
123 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
124 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
125 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
126 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0
127 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3
128 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
129 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3
130 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
131 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
132 ; GFX6-NEXT: s_setpc_b64 s[30:31]
134 ; GFX8-LABEL: v_ssubsat_i8:
136 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
138 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0
139 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
140 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
141 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0
142 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
143 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1
144 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3
145 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
146 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0
147 ; GFX8-NEXT: s_setpc_b64 s[30:31]
149 ; GFX9-LABEL: v_ssubsat_i8:
151 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
153 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
154 ; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
155 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
156 ; GFX9-NEXT: s_setpc_b64 s[30:31]
158 ; GFX10PLUS-LABEL: v_ssubsat_i8:
159 ; GFX10PLUS: ; %bb.0:
160 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
162 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
163 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
164 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
165 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
166 %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
; Signed saturating subtract of two i8 values through @llvm.ssub.sat.i8, as an
; amdgpu_ps function whose arguments are both marked inreg. The checks below
; read the operands from s0 and s1 and leave the result in s0.
; The expected sequences match the i7 scalar test except for the shift amounts.
; The GFX6 checks shift by 24 around a 32-bit scalar max/min clamp. The GFX8
; checks shift by 8 and clamp with 32-bit scalar max/min on values sign-extended
; from 16 bits. The GFX9 and GFX10PLUS checks use a clamped vector subtract
; followed by v_readfirstlane_b32 back into s0.
170 define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
171 ; GFX6-LABEL: s_ssubsat_i8:
173 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
174 ; GFX6-NEXT: s_max_i32 s2, s0, -1
175 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
176 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff
177 ; GFX6-NEXT: s_min_i32 s3, s0, -1
178 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000
179 ; GFX6-NEXT: s_max_i32 s1, s2, s1
180 ; GFX6-NEXT: s_min_i32 s1, s1, s3
181 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
182 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24
183 ; GFX6-NEXT: ; return to shader part epilog
185 ; GFX8-LABEL: s_ssubsat_i8:
187 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
188 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
189 ; GFX8-NEXT: s_sext_i32_i16 s3, -1
190 ; GFX8-NEXT: s_max_i32 s4, s2, s3
191 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
192 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
193 ; GFX8-NEXT: s_min_i32 s2, s2, s3
194 ; GFX8-NEXT: s_sext_i32_i16 s3, s4
195 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
196 ; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
197 ; GFX8-NEXT: s_max_i32 s1, s3, s1
198 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
199 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
200 ; GFX8-NEXT: s_min_i32 s1, s1, s2
201 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
202 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
203 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8
204 ; GFX8-NEXT: ; return to shader part epilog
206 ; GFX9-LABEL: s_ssubsat_i8:
208 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
209 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
210 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
211 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
212 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
213 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
214 ; GFX9-NEXT: ; return to shader part epilog
216 ; GFX10PLUS-LABEL: s_ssubsat_i8:
217 ; GFX10PLUS: ; %bb.0:
218 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
219 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
220 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
221 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
222 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
223 ; GFX10PLUS-NEXT: ; return to shader part epilog
224 %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
; Signed saturating subtract of two <2 x i8> vectors through
; @llvm.ssub.sat.v2i8, using the default calling convention. Each vector is
; passed as an i16 and bitcast on entry, and the result is bitcast back to i16.
; The checks below read the operands from v0 and v1 and leave the result in v0.
; The GFX6 and GFX8 checks process the two bytes one after the other with the
; same shift / max-min clamp / subtract sequence as the scalar-typed i8 test,
; then mask each result with 0xff and OR the bytes back together. The GFX9,
; GFX10 and GFX11 checks place the two bytes in separate 16-bit halves and use
; one v_pk_sub_i16 with the clamp modifier. GFX10 and GFX11 have separate check
; prefixes in this function because their repacking sequences differ.
228 define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
229 ; GFX6-LABEL: v_ssubsat_v2i8:
231 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
233 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
234 ; GFX6-NEXT: s_brev_b32 s4, -2
235 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
236 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
237 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
238 ; GFX6-NEXT: s_brev_b32 s5, 1
239 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
240 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
241 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
242 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
243 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v5
244 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
245 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
246 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
247 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
248 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
249 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
250 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
251 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
252 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
253 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
254 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
255 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
256 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
257 ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
258 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
259 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
260 ; GFX6-NEXT: s_setpc_b64 s[30:31]
262 ; GFX8-LABEL: v_ssubsat_v2i8:
264 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
266 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
267 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
268 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0
269 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
270 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
271 ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4
272 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0
273 ; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5
274 ; GFX8-NEXT: v_max_i16_e32 v1, v4, v1
275 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5
276 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
277 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3
278 ; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1
279 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v3
280 ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4
281 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2
282 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
283 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
284 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
285 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
286 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
287 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
288 ; GFX8-NEXT: s_setpc_b64 s[30:31]
290 ; GFX9-LABEL: v_ssubsat_v2i8:
292 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
294 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
295 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
296 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
297 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
298 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
299 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
300 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
301 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
302 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
303 ; GFX9-NEXT: s_movk_i32 s4, 0xff
304 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
305 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
306 ; GFX9-NEXT: s_setpc_b64 s[30:31]
308 ; GFX10-LABEL: v_ssubsat_v2i8:
310 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
312 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
313 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
314 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
315 ; GFX10-NEXT: s_movk_i32 s4, 0xff
316 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
317 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
318 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
319 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
320 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
321 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
322 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
323 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
324 ; GFX10-NEXT: s_setpc_b64 s[30:31]
326 ; GFX11-LABEL: v_ssubsat_v2i8:
328 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
329 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
330 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
331 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
332 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
333 ; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
334 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
335 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
336 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
337 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
338 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
339 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
340 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
341 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
342 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
343 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
344 ; GFX11-NEXT: s_setpc_b64 s[30:31]
345 %lhs = bitcast i16 %lhs.arg to <2 x i8>
346 %rhs = bitcast i16 %rhs.arg to <2 x i8>
347 %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
348 %cast.result = bitcast <2 x i8> %result to i16
; Signed saturating subtract of two <2 x i8> vectors through
; @llvm.ssub.sat.v2i8, as an amdgpu_ps function whose arguments are both marked
; inreg. Each vector is passed as an i16 and bitcast on entry, and the result is
; bitcast back to i16. The checks below read the operands from s0 and s1 and
; leave the result in s0.
; The GFX6 and GFX8 checks stay on scalar instructions and handle the two bytes
; one after the other, with GFX8 inserting s_sext_i32_i16 around its 32-bit
; max/min. The GFX9, GFX10 and GFX11 checks pack the bytes into 16-bit halves
; with s_pack_ll_b32_b16, subtract with v_pk_sub_i16 and the clamp modifier, and
; return the repacked value to s0 with v_readfirstlane_b32.
352 define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
353 ; GFX6-LABEL: s_ssubsat_v2i8:
355 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
356 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
357 ; GFX6-NEXT: s_max_i32 s4, s0, -1
358 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8
359 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
360 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff
361 ; GFX6-NEXT: s_min_i32 s5, s0, -1
362 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000
363 ; GFX6-NEXT: s_max_i32 s1, s4, s1
364 ; GFX6-NEXT: s_min_i32 s1, s1, s5
365 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
366 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
367 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
368 ; GFX6-NEXT: s_max_i32 s3, s1, -1
369 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff
370 ; GFX6-NEXT: s_min_i32 s4, s1, -1
371 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000
372 ; GFX6-NEXT: s_max_i32 s2, s3, s2
373 ; GFX6-NEXT: s_min_i32 s2, s2, s4
374 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
375 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24
376 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24
377 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff
378 ; GFX6-NEXT: s_and_b32 s0, s0, 0xff
379 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
380 ; GFX6-NEXT: s_or_b32 s0, s0, s1
381 ; GFX6-NEXT: ; return to shader part epilog
383 ; GFX8-LABEL: s_ssubsat_v2i8:
385 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
386 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
387 ; GFX8-NEXT: s_sext_i32_i16 s4, s0
388 ; GFX8-NEXT: s_sext_i32_i16 s5, -1
389 ; GFX8-NEXT: s_max_i32 s6, s4, s5
390 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8
391 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
392 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
393 ; GFX8-NEXT: s_min_i32 s4, s4, s5
394 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
395 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
396 ; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000
397 ; GFX8-NEXT: s_max_i32 s1, s6, s1
398 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
399 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
400 ; GFX8-NEXT: s_min_i32 s1, s1, s4
401 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
402 ; GFX8-NEXT: s_lshl_b32 s1, s2, 8
403 ; GFX8-NEXT: s_lshl_b32 s2, s3, 8
404 ; GFX8-NEXT: s_sext_i32_i16 s3, s1
405 ; GFX8-NEXT: s_max_i32 s4, s3, s5
406 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
407 ; GFX8-NEXT: s_min_i32 s3, s3, s5
408 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
409 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
410 ; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
411 ; GFX8-NEXT: s_max_i32 s2, s4, s2
412 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
413 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
414 ; GFX8-NEXT: s_min_i32 s2, s2, s3
415 ; GFX8-NEXT: s_sub_i32 s1, s1, s2
416 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
417 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
418 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8
419 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8
420 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff
421 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff
422 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
423 ; GFX8-NEXT: s_or_b32 s0, s0, s1
424 ; GFX8-NEXT: ; return to shader part epilog
426 ; GFX9-LABEL: s_ssubsat_v2i8:
428 ; GFX9-NEXT: s_lshr_b32 s2, s0, 8
429 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
430 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
431 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
432 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
433 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
434 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
435 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
436 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16
437 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
438 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
439 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
440 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
441 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
442 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
443 ; GFX9-NEXT: s_movk_i32 s0, 0xff
444 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
446 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
447 ; GFX9-NEXT: ; return to shader part epilog
449 ; GFX10-LABEL: s_ssubsat_v2i8:
451 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
452 ; GFX10-NEXT: s_lshr_b32 s3, s1, 8
453 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
454 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
455 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
456 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
457 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
458 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8
459 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
460 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
461 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
462 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
463 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
464 ; GFX10-NEXT: s_movk_i32 s0, 0xff
465 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
466 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
467 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
468 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
469 ; GFX10-NEXT: ; return to shader part epilog
471 ; GFX11-LABEL: s_ssubsat_v2i8:
473 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
474 ; GFX11-NEXT: s_lshr_b32 s3, s1, 8
475 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
476 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
477 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
478 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
479 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
480 ; GFX11-NEXT: s_lshl_b32 s2, s2, 8
481 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
482 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8
483 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
484 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
485 ; GFX11-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
486 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
487 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
488 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
489 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
490 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
491 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
492 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
493 ; GFX11-NEXT: ; return to shader part epilog
494 %lhs = bitcast i16 %lhs.arg to <2 x i8>
495 %rhs = bitcast i16 %rhs.arg to <2 x i8>
496 %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
497 %cast.result = bitcast <2 x i8> %result to i16
; Signed saturating subtract of two <4 x i8> vectors through
; @llvm.ssub.sat.v4i8, using the default calling convention. Each vector is
; passed as an i32 and bitcast on entry, and the result is bitcast back to i32.
; The checks below read the operands from v0 and v1 and leave the result in v0.
; The GFX6 and GFX8 checks extract the four bytes with right shifts by 8, 16
; and 24, run the shift / max-min clamp / subtract sequence once per byte, then
; mask each result with 0xff and OR the bytes back into one register. The GFX9,
; GFX10 and GFX11 checks build two registers holding two bytes each in 16-bit
; halves, issue two v_pk_sub_i16 instructions with the clamp modifier, and
; reassemble the four result bytes into v0.
501 define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
502 ; GFX6-LABEL: v_ssubsat_v4i8:
504 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
505 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
506 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
507 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
508 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
509 ; GFX6-NEXT: s_brev_b32 s4, -2
510 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0
511 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
512 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
513 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1
514 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
515 ; GFX6-NEXT: s_brev_b32 s5, 1
516 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8
517 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0
518 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10
519 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1
520 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v10
521 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
522 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
523 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5
524 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1
525 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
526 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1
527 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8
528 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
529 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v8
530 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
531 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
532 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2
533 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2
534 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6
535 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
536 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2
537 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
538 ; GFX6-NEXT: v_max_i32_e32 v3, v5, v3
539 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6
540 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
541 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
542 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3
543 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
544 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
545 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7
546 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
547 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3
548 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
549 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11
550 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
551 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
552 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2
553 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6
554 ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
555 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
556 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
557 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
558 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2
559 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3
560 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
561 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
562 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3
563 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
564 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
565 ; GFX6-NEXT: s_setpc_b64 s[30:31]
567 ; GFX8-LABEL: v_ssubsat_v4i8:
569 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
571 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
572 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
573 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
574 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
575 ; GFX8-NEXT: v_max_i16_e32 v8, -1, v0
576 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
577 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
578 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
579 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
580 ; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8
581 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v0
582 ; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9
583 ; GFX8-NEXT: v_max_i16_e32 v1, v8, v1
584 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v9
585 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
586 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3
587 ; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1
588 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v3
589 ; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8
590 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2
591 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4
592 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8
593 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2
594 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
595 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6
596 ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4
597 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v2
598 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6
599 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3
600 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6
601 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3
602 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5
603 ; GFX8-NEXT: v_max_i16_e32 v5, -1, v3
604 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7
605 ; GFX8-NEXT: v_subrev_u16_e32 v5, 0x7fff, v5
606 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v3
607 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6
608 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4
609 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v6
610 ; GFX8-NEXT: v_sub_u16_e32 v3, v3, v4
611 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
612 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
614 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
615 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
618 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
620 ; GFX8-NEXT: s_setpc_b64 s[30:31]
622 ; GFX9-LABEL: v_ssubsat_v4i8:
624 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
626 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
627 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
628 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
629 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
630 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
631 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
632 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
633 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
634 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
635 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
636 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
637 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
638 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
639 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 clamp
640 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
641 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
642 ; GFX9-NEXT: v_mov_b32_e32 v2, 8
643 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
644 ; GFX9-NEXT: s_movk_i32 s4, 0xff
645 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
646 ; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
647 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0
648 ; GFX9-NEXT: v_mov_b32_e32 v3, 24
649 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
650 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
651 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
652 ; GFX9-NEXT: s_setpc_b64 s[30:31]
654 ; GFX10-LABEL: v_ssubsat_v4i8:
656 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
658 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
659 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
660 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
661 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
662 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
663 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
664 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
665 ; GFX10-NEXT: v_mov_b32_e32 v4, 24
666 ; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
667 ; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
668 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
669 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
670 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
671 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
672 ; GFX10-NEXT: v_pk_sub_i16 v2, v2, v3 clamp
673 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
674 ; GFX10-NEXT: v_mov_b32_e32 v1, 8
675 ; GFX10-NEXT: v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1]
676 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
677 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
678 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0
679 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
680 ; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1
681 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
682 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
683 ; GFX10-NEXT: s_setpc_b64 s[30:31]
685 ; GFX11-LABEL: v_ssubsat_v4i8:
687 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
689 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
690 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
691 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
692 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
693 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
694 ; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
695 ; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
696 ; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
697 ; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
698 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
699 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
700 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
701 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
702 ; GFX11-NEXT: v_pk_sub_i16 v2, v2, v3 clamp
703 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
704 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
705 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
706 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
707 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
708 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
709 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
710 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
711 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
712 ; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2
713 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
714 ; GFX11-NEXT: s_setpc_b64 s[30:31]
715 %lhs = bitcast i32 %lhs.arg to <4 x i8>
716 %rhs = bitcast i32 %rhs.arg to <4 x i8>
717 %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
718 %cast.result = bitcast <4 x i8> %result to i32
; Scalar <4 x i8> signed saturating subtract: both i32 arguments are inreg, are
; bitcast to <4 x i8>, passed to @llvm.ssub.sat.v4i8, and the result is bitcast
; back to i32.
; The GFX6 and GFX8 checks show one s_max/s_min/s_sub group per byte element,
; while the GFX9, GFX10 and GFX11 checks show two "v_pk_sub_i16 ... clamp"
; instructions whose repacked result is moved back with v_readfirstlane_b32.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
722 define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
723 ; GFX6-LABEL: s_ssubsat_v4i8:
725 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
726 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16
727 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24
728 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
729 ; GFX6-NEXT: s_max_i32 s8, s0, -1
730 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8
731 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16
732 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24
733 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
734 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff
735 ; GFX6-NEXT: s_min_i32 s9, s0, -1
736 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000
737 ; GFX6-NEXT: s_max_i32 s1, s8, s1
738 ; GFX6-NEXT: s_min_i32 s1, s1, s9
739 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
740 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
741 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24
742 ; GFX6-NEXT: s_max_i32 s5, s1, -1
743 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
744 ; GFX6-NEXT: s_min_i32 s8, s1, -1
745 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000
746 ; GFX6-NEXT: s_max_i32 s2, s5, s2
747 ; GFX6-NEXT: s_min_i32 s2, s2, s8
748 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
749 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
750 ; GFX6-NEXT: s_max_i32 s5, s2, -1
751 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24
752 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
753 ; GFX6-NEXT: s_min_i32 s6, s2, -1
754 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
755 ; GFX6-NEXT: s_max_i32 s3, s5, s3
756 ; GFX6-NEXT: s_min_i32 s3, s3, s6
757 ; GFX6-NEXT: s_sub_i32 s2, s2, s3
758 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24
759 ; GFX6-NEXT: s_max_i32 s5, s3, -1
760 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24
761 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24
762 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
763 ; GFX6-NEXT: s_min_i32 s6, s3, -1
764 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24
765 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
766 ; GFX6-NEXT: s_max_i32 s4, s5, s4
767 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff
768 ; GFX6-NEXT: s_ashr_i32 s2, s2, 24
769 ; GFX6-NEXT: s_min_i32 s4, s4, s6
770 ; GFX6-NEXT: s_and_b32 s0, s0, 0xff
771 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
772 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
773 ; GFX6-NEXT: s_or_b32 s0, s0, s1
774 ; GFX6-NEXT: s_and_b32 s1, s2, 0xff
775 ; GFX6-NEXT: s_ashr_i32 s3, s3, 24
776 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
777 ; GFX6-NEXT: s_or_b32 s0, s0, s1
778 ; GFX6-NEXT: s_and_b32 s1, s3, 0xff
779 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
780 ; GFX6-NEXT: s_or_b32 s0, s0, s1
781 ; GFX6-NEXT: ; return to shader part epilog
783 ; GFX8-LABEL: s_ssubsat_v4i8:
785 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
786 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16
787 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24
788 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
789 ; GFX8-NEXT: s_sext_i32_i16 s8, s0
790 ; GFX8-NEXT: s_sext_i32_i16 s9, -1
791 ; GFX8-NEXT: s_max_i32 s10, s8, s9
792 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8
793 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16
794 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24
795 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
796 ; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff
797 ; GFX8-NEXT: s_min_i32 s8, s8, s9
798 ; GFX8-NEXT: s_sext_i32_i16 s10, s10
799 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
800 ; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000
801 ; GFX8-NEXT: s_max_i32 s1, s10, s1
802 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
803 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
804 ; GFX8-NEXT: s_min_i32 s1, s1, s8
805 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
806 ; GFX8-NEXT: s_lshl_b32 s1, s2, 8
807 ; GFX8-NEXT: s_lshl_b32 s2, s5, 8
808 ; GFX8-NEXT: s_sext_i32_i16 s5, s1
809 ; GFX8-NEXT: s_max_i32 s8, s5, s9
810 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff
811 ; GFX8-NEXT: s_min_i32 s5, s5, s9
812 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
813 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
814 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
815 ; GFX8-NEXT: s_max_i32 s2, s8, s2
816 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
817 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
818 ; GFX8-NEXT: s_min_i32 s2, s2, s5
819 ; GFX8-NEXT: s_sub_i32 s1, s1, s2
820 ; GFX8-NEXT: s_lshl_b32 s2, s3, 8
821 ; GFX8-NEXT: s_sext_i32_i16 s5, s2
822 ; GFX8-NEXT: s_lshl_b32 s3, s6, 8
823 ; GFX8-NEXT: s_max_i32 s6, s5, s9
824 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
825 ; GFX8-NEXT: s_min_i32 s5, s5, s9
826 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
827 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
828 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
829 ; GFX8-NEXT: s_max_i32 s3, s6, s3
830 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
831 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
832 ; GFX8-NEXT: s_min_i32 s3, s3, s5
833 ; GFX8-NEXT: s_sub_i32 s2, s2, s3
834 ; GFX8-NEXT: s_lshl_b32 s3, s4, 8
835 ; GFX8-NEXT: s_sext_i32_i16 s5, s3
836 ; GFX8-NEXT: s_max_i32 s6, s5, s9
837 ; GFX8-NEXT: s_lshl_b32 s4, s7, 8
838 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
839 ; GFX8-NEXT: s_min_i32 s5, s5, s9
840 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
841 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
842 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
843 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
844 ; GFX8-NEXT: s_max_i32 s4, s6, s4
845 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
846 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8
847 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
848 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
849 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8
850 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
851 ; GFX8-NEXT: s_min_i32 s4, s4, s5
852 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff
853 ; GFX8-NEXT: s_ashr_i32 s2, s2, 8
854 ; GFX8-NEXT: s_sub_i32 s3, s3, s4
855 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff
856 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
857 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
858 ; GFX8-NEXT: s_or_b32 s0, s0, s1
859 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff
860 ; GFX8-NEXT: s_ashr_i32 s3, s3, 8
861 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
862 ; GFX8-NEXT: s_or_b32 s0, s0, s1
863 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff
864 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24
865 ; GFX8-NEXT: s_or_b32 s0, s0, s1
866 ; GFX8-NEXT: ; return to shader part epilog
868 ; GFX9-LABEL: s_ssubsat_v4i8:
870 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8
871 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
872 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24
873 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
874 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
875 ; GFX9-NEXT: s_lshr_b32 s6, s0, 16
876 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
877 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
878 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8
879 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
880 ; GFX9-NEXT: s_lshr_b32 s6, s3, 16
881 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16
882 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24
883 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
884 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
885 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
886 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
887 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16
888 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
889 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
890 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
891 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
892 ; GFX9-NEXT: s_lshr_b32 s6, s4, 16
893 ; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
894 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
895 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
896 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
897 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
898 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
899 ; GFX9-NEXT: s_mov_b32 s2, 8
900 ; GFX9-NEXT: v_pk_sub_i16 v1, s3, v1 clamp
901 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
902 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
903 ; GFX9-NEXT: s_movk_i32 s0, 0xff
904 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
905 ; GFX9-NEXT: s_mov_b32 s5, 24
906 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
907 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
908 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
909 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
910 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
911 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
912 ; GFX9-NEXT: ; return to shader part epilog
914 ; GFX10-LABEL: s_ssubsat_v4i8:
916 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
917 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
918 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24
919 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8
920 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
921 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
922 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16
923 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24
924 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
925 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16
926 ; GFX10-NEXT: s_lshr_b32 s5, s2, 16
927 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
928 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
929 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
930 ; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
931 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
932 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
933 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
934 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16
935 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16
936 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
937 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
938 ; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
939 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
940 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
941 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
942 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
943 ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp
944 ; GFX10-NEXT: s_mov_b32 s0, 8
945 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
946 ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
947 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
948 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
949 ; GFX10-NEXT: s_mov_b32 s0, 24
950 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
951 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
952 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
953 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
954 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
955 ; GFX10-NEXT: ; return to shader part epilog
957 ; GFX11-LABEL: s_ssubsat_v4i8:
959 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
960 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24
961 ; GFX11-NEXT: s_lshr_b32 s4, s1, 8
962 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24
963 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
964 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
965 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
966 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
967 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
968 ; GFX11-NEXT: s_lshr_b32 s5, s3, 16
969 ; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
970 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
971 ; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
972 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8
973 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
974 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
975 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16
976 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16
977 ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 clamp
978 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
979 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
980 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
981 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8
982 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
983 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
984 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
985 ; GFX11-NEXT: v_pk_sub_i16 v1, s0, s1 clamp
986 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
987 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
988 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
989 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
990 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
991 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2
992 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
993 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
994 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
995 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
996 ; GFX11-NEXT: ; return to shader part epilog
997 %lhs = bitcast i32 %lhs.arg to <4 x i8>
998 %rhs = bitcast i32 %rhs.arg to <4 x i8>
999 %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
1000 %cast.result = bitcast <4 x i8> %result to i32
1001 ret i32 %cast.result
; i24 signed saturating subtract (@llvm.ssub.sat.i24) with both operands in
; VGPRs (v0, v1 in the checks).
; In the GFX6, GFX9 and GFX10PLUS checks both operands are shifted left by 8,
; the subtract saturates at 32 bits (min/max clamping on GFX6, the clamp
; modifier on GFX9 and GFX10PLUS) and the result is arithmetic-shifted right
; by 8. The GFX8 checks instead show a plain subtract followed by a
; compare/xor/v_cndmask_b32 select of the saturated value.
; Check lines are autogenerated; regenerate instead of hand-editing.
1004 define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
1005 ; GFX6-LABEL: v_ssubsat_i24:
1007 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1008 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1009 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
1010 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1011 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
1012 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0
1013 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3
1014 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
1015 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3
1016 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1017 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1018 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1020 ; GFX8-LABEL: v_ssubsat_i24:
1022 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v1
1024 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 24
1025 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 24
1026 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0
1027 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24
1028 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v0
1029 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3
1030 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0
1031 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
1032 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1033 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1035 ; GFX9-LABEL: v_ssubsat_i24:
1037 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1038 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1039 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1040 ; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp
1041 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1042 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1044 ; GFX10PLUS-LABEL: v_ssubsat_i24:
1045 ; GFX10PLUS: ; %bb.0:
1046 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1048 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1049 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v1 clamp
1050 ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1051 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1052 %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
; i24 signed saturating subtract (@llvm.ssub.sat.i24) in an amdgpu_ps function
; with both operands inreg (s0, s1 in the checks).
; The GFX6 checks stay on scalar instructions (shift left by 8, min/max clamp,
; subtract, arithmetic shift right by 8); the GFX8 checks use a scalar
; subtract plus s_cmp/s_cselect selection; the GFX9 and GFX10PLUS checks use a
; vector subtract with the clamp modifier and copy the result back with
; v_readfirstlane_b32.
; Check lines are autogenerated; regenerate instead of hand-editing.
1056 define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
1057 ; GFX6-LABEL: s_ssubsat_i24:
1059 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8
1060 ; GFX6-NEXT: s_max_i32 s2, s0, -1
1061 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
1062 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff
1063 ; GFX6-NEXT: s_min_i32 s3, s0, -1
1064 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000
1065 ; GFX6-NEXT: s_max_i32 s1, s2, s1
1066 ; GFX6-NEXT: s_min_i32 s1, s1, s3
1067 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
1068 ; GFX6-NEXT: s_ashr_i32 s0, s0, 8
1069 ; GFX6-NEXT: ; return to shader part epilog
1071 ; GFX8-LABEL: s_ssubsat_i24:
1073 ; GFX8-NEXT: s_sub_i32 s2, s0, s1
1074 ; GFX8-NEXT: s_bfe_i32 s3, s2, 0x180000
1075 ; GFX8-NEXT: s_bfe_i32 s0, s0, 0x180000
1076 ; GFX8-NEXT: s_cmp_lt_i32 s3, s0
1077 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0
1078 ; GFX8-NEXT: s_bfe_i32 s1, s1, 0x180000
1079 ; GFX8-NEXT: s_cmp_gt_i32 s1, 0
1080 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0
1081 ; GFX8-NEXT: s_xor_b32 s0, s1, s0
1082 ; GFX8-NEXT: s_ashr_i32 s1, s3, 23
1083 ; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000
1084 ; GFX8-NEXT: s_and_b32 s0, s0, 1
1085 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0
1086 ; GFX8-NEXT: s_cselect_b32 s0, s1, s2
1087 ; GFX8-NEXT: ; return to shader part epilog
1089 ; GFX9-LABEL: s_ssubsat_i24:
1091 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
1092 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
1093 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1094 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
1095 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1096 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1097 ; GFX9-NEXT: ; return to shader part epilog
1099 ; GFX10PLUS-LABEL: s_ssubsat_i24:
1100 ; GFX10PLUS: ; %bb.0:
1101 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
1102 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
1103 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s1 clamp
1104 ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1105 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1106 ; GFX10PLUS-NEXT: ; return to shader part epilog
1107 %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
; i32 signed saturating subtract (@llvm.ssub.sat.i32) with both operands in
; VGPRs (v0, v1 in the checks).
; The GFX6 and GFX8 checks clamp the right-hand operand with v_max_i32 and
; v_min_i32 against bounds derived from the left-hand operand and then do a
; plain subtract; the GFX9 and GFX10PLUS checks are a single subtract with the
; clamp modifier.
; Check lines are autogenerated; regenerate instead of hand-editing.
1111 define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
1112 ; GFX6-LABEL: v_ssubsat_i32:
1114 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1115 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
1116 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
1117 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0
1118 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3
1119 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
1120 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3
1121 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1122 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1124 ; GFX8-LABEL: v_ssubsat_i32:
1126 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1127 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v0
1128 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2
1129 ; GFX8-NEXT: v_min_i32_e32 v3, -1, v0
1130 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x80000000, v3
1131 ; GFX8-NEXT: v_max_i32_e32 v1, v2, v1
1132 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v3
1133 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
1134 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1136 ; GFX9-LABEL: v_ssubsat_i32:
1138 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1139 ; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp
1140 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1142 ; GFX10PLUS-LABEL: v_ssubsat_i32:
1143 ; GFX10PLUS: ; %bb.0:
1144 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1145 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v1 clamp
1146 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1147 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
; i32 signed saturating subtract (@llvm.ssub.sat.i32) in an amdgpu_ps function
; with both operands inreg (s0, s1 in the checks).
; The GFX6 and GFX8 checks are an all-scalar s_max/s_min/s_sub sequence; the
; GFX9 and GFX10PLUS checks use a vector subtract with the clamp modifier and
; copy the result back to s0 with v_readfirstlane_b32.
; Check lines are autogenerated; regenerate instead of hand-editing.
1151 define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
1152 ; GFX6-LABEL: s_ssubsat_i32:
1154 ; GFX6-NEXT: s_max_i32 s2, s0, -1
1155 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff
1156 ; GFX6-NEXT: s_min_i32 s3, s0, -1
1157 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000
1158 ; GFX6-NEXT: s_max_i32 s1, s2, s1
1159 ; GFX6-NEXT: s_min_i32 s1, s1, s3
1160 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
1161 ; GFX6-NEXT: ; return to shader part epilog
1163 ; GFX8-LABEL: s_ssubsat_i32:
1165 ; GFX8-NEXT: s_max_i32 s2, s0, -1
1166 ; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff
1167 ; GFX8-NEXT: s_min_i32 s3, s0, -1
1168 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x80000000
1169 ; GFX8-NEXT: s_max_i32 s1, s2, s1
1170 ; GFX8-NEXT: s_min_i32 s1, s1, s3
1171 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
1172 ; GFX8-NEXT: ; return to shader part epilog
1174 ; GFX9-LABEL: s_ssubsat_i32:
1176 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1177 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
1178 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1179 ; GFX9-NEXT: ; return to shader part epilog
1181 ; GFX10PLUS-LABEL: s_ssubsat_i32:
1182 ; GFX10PLUS: ; %bb.0:
1183 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s1 clamp
1184 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1185 ; GFX10PLUS-NEXT: ; return to shader part epilog
1186 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
; Mixed-operand i32 signed saturating subtract: %lhs is inreg (s0 in the
; checks) and %rhs is a plain argument (v0 in the checks). The i32 result of
; @llvm.ssub.sat.i32 is bitcast to float (%cast).
; The GFX6 and GFX8 checks compute the clamp bounds with scalar instructions
; and apply them with v_max_i32/v_min_i32 before the subtract; the GFX9 and
; GFX10PLUS checks are a single subtract with the clamp modifier.
; Check lines are autogenerated; regenerate instead of hand-editing.
1190 define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
1191 ; GFX6-LABEL: ssubsat_i32_sv:
1193 ; GFX6-NEXT: s_max_i32 s1, s0, -1
1194 ; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff
1195 ; GFX6-NEXT: s_min_i32 s2, s0, -1
1196 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000
1197 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0
1198 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0
1199 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
1200 ; GFX6-NEXT: ; return to shader part epilog
1202 ; GFX8-LABEL: ssubsat_i32_sv:
1204 ; GFX8-NEXT: s_max_i32 s1, s0, -1
1205 ; GFX8-NEXT: s_sub_i32 s1, s1, 0x7fffffff
1206 ; GFX8-NEXT: s_min_i32 s2, s0, -1
1207 ; GFX8-NEXT: s_sub_i32 s2, s2, 0x80000000
1208 ; GFX8-NEXT: v_max_i32_e32 v0, s1, v0
1209 ; GFX8-NEXT: v_min_i32_e32 v0, s2, v0
1210 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
1211 ; GFX8-NEXT: ; return to shader part epilog
1213 ; GFX9-LABEL: ssubsat_i32_sv:
1215 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
1216 ; GFX9-NEXT: ; return to shader part epilog
1218 ; GFX10PLUS-LABEL: ssubsat_i32_sv:
1219 ; GFX10PLUS: ; %bb.0:
1220 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, v0 clamp
1221 ; GFX10PLUS-NEXT: ; return to shader part epilog
1222 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1223 %cast = bitcast i32 %result to float
; Mixed-operand i32 signed saturating subtract, mirror of the scalar-lhs case:
; %lhs is a plain argument (v0 in the checks) and %rhs is inreg (s0 in the
; checks). The i32 result of @llvm.ssub.sat.i32 is bitcast to float (%cast).
; The GFX6 and GFX8 checks derive the clamp bounds from v0 with vector
; instructions and clamp s0 against them before the subtract; the GFX9 and
; GFX10PLUS checks are a single subtract with the clamp modifier.
; Check lines are autogenerated; regenerate instead of hand-editing.
1227 define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
1228 ; GFX6-LABEL: ssubsat_i32_vs:
1230 ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0
1231 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1
1232 ; GFX6-NEXT: v_min_i32_e32 v2, -1, v0
1233 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2
1234 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1
1235 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2
1236 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
1237 ; GFX6-NEXT: ; return to shader part epilog
1239 ; GFX8-LABEL: ssubsat_i32_vs:
1241 ; GFX8-NEXT: v_max_i32_e32 v1, -1, v0
1242 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 0x7fffffff, v1
1243 ; GFX8-NEXT: v_min_i32_e32 v2, -1, v0
1244 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x80000000, v2
1245 ; GFX8-NEXT: v_max_i32_e32 v1, s0, v1
1246 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v2
1247 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
1248 ; GFX8-NEXT: ; return to shader part epilog
1250 ; GFX9-LABEL: ssubsat_i32_vs:
1252 ; GFX9-NEXT: v_sub_i32 v0, v0, s0 clamp
1253 ; GFX9-NEXT: ; return to shader part epilog
1255 ; GFX10PLUS-LABEL: ssubsat_i32_vs:
1256 ; GFX10PLUS: ; %bb.0:
1257 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, s0 clamp
1258 ; GFX10PLUS-NEXT: ; return to shader part epilog
1259 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1260 %cast = bitcast i32 %result to float
; <2 x i32> signed saturating subtract (@llvm.ssub.sat.v2i32) with both vector
; operands in VGPRs (%lhs in v0-v1, %rhs in v2-v3 in the checks).
; The GFX6 and GFX8 checks repeat the max/min/sub clamp sequence once per
; element, reusing the two constants set up in s4 and s5 by s_brev_b32; the
; GFX9 and GFX10PLUS checks are one clamped subtract per element.
; Check lines are autogenerated; regenerate instead of hand-editing.
1264 define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
1265 ; GFX6-LABEL: v_ssubsat_v2i32:
1267 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1268 ; GFX6-NEXT: s_brev_b32 s4, -2
1269 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
1270 ; GFX6-NEXT: s_brev_b32 s5, 1
1271 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
1272 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
1273 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
1274 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
1275 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
1276 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
1277 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1
1278 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2
1279 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
1280 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
1281 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
1282 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
1283 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
1284 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1286 ; GFX8-LABEL: v_ssubsat_v2i32:
1288 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1289 ; GFX8-NEXT: s_brev_b32 s4, -2
1290 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0
1291 ; GFX8-NEXT: s_brev_b32 s5, 1
1292 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
1293 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0
1294 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5
1295 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2
1296 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5
1297 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
1298 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v1
1299 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2
1300 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v1
1301 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4
1302 ; GFX8-NEXT: v_max_i32_e32 v2, v2, v3
1303 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4
1304 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2
1305 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1307 ; GFX9-LABEL: v_ssubsat_v2i32:
1309 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1310 ; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp
1311 ; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp
1312 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1314 ; GFX10PLUS-LABEL: v_ssubsat_v2i32:
1315 ; GFX10PLUS: ; %bb.0:
1316 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1317 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
1318 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
1319 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1320 %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1321 ret <2 x i32> %result
; <2 x i32> signed saturating subtract (@llvm.ssub.sat.v2i32) in an amdgpu_ps
; function with both vector operands inreg (%lhs in s0-s1, %rhs in s2-s3 in
; the checks).
; The GFX6 and GFX8 checks repeat the scalar s_max/s_min/s_sub sequence once
; per element; the GFX9 and GFX10PLUS checks use one clamped vector subtract
; per element and copy each result back with v_readfirstlane_b32.
; Check lines are autogenerated; regenerate instead of hand-editing.
1324 define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
1325 ; GFX6-LABEL: s_ssubsat_v2i32:
1327 ; GFX6-NEXT: s_max_i32 s4, s0, -1
1328 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1329 ; GFX6-NEXT: s_min_i32 s5, s0, -1
1330 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000
1331 ; GFX6-NEXT: s_max_i32 s2, s4, s2
1332 ; GFX6-NEXT: s_min_i32 s2, s2, s5
1333 ; GFX6-NEXT: s_sub_i32 s0, s0, s2
1334 ; GFX6-NEXT: s_max_i32 s2, s1, -1
1335 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff
1336 ; GFX6-NEXT: s_min_i32 s4, s1, -1
1337 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000
1338 ; GFX6-NEXT: s_max_i32 s2, s2, s3
1339 ; GFX6-NEXT: s_min_i32 s2, s2, s4
1340 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
1341 ; GFX6-NEXT: ; return to shader part epilog
1343 ; GFX8-LABEL: s_ssubsat_v2i32:
1345 ; GFX8-NEXT: s_max_i32 s4, s0, -1
1346 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1347 ; GFX8-NEXT: s_min_i32 s5, s0, -1
1348 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000
1349 ; GFX8-NEXT: s_max_i32 s2, s4, s2
1350 ; GFX8-NEXT: s_min_i32 s2, s2, s5
1351 ; GFX8-NEXT: s_sub_i32 s0, s0, s2
1352 ; GFX8-NEXT: s_max_i32 s2, s1, -1
1353 ; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff
1354 ; GFX8-NEXT: s_min_i32 s4, s1, -1
1355 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000
1356 ; GFX8-NEXT: s_max_i32 s2, s2, s3
1357 ; GFX8-NEXT: s_min_i32 s2, s2, s4
1358 ; GFX8-NEXT: s_sub_i32 s1, s1, s2
1359 ; GFX8-NEXT: ; return to shader part epilog
1361 ; GFX9-LABEL: s_ssubsat_v2i32:
1363 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1364 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1365 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
1366 ; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
1367 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1368 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1369 ; GFX9-NEXT: ; return to shader part epilog
1371 ; GFX10PLUS-LABEL: s_ssubsat_v2i32:
1372 ; GFX10PLUS: ; %bb.0:
1373 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s2 clamp
1374 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s3 clamp
1375 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1376 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1377 ; GFX10PLUS-NEXT: ; return to shader part epilog
1378 %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1379 ret <2 x i32> %result
; <3 x i32> signed saturating subtract (@llvm.ssub.sat.v3i32) with both vector
; operands in VGPRs (%lhs in v0-v2, %rhs in v3-v5 in the checks).
; The GFX6 and GFX8 checks repeat the max/min/sub clamp sequence once per
; element, reusing the two constants set up in s4 and s5 by s_brev_b32; the
; GFX9 and GFX10PLUS checks are one clamped subtract per element.
; Check lines are autogenerated; regenerate instead of hand-editing.
1382 define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1383 ; GFX6-LABEL: v_ssubsat_v3i32:
1385 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1386 ; GFX6-NEXT: s_brev_b32 s4, -2
1387 ; GFX6-NEXT: v_max_i32_e32 v6, -1, v0
1388 ; GFX6-NEXT: s_brev_b32 s5, 1
1389 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6
1390 ; GFX6-NEXT: v_min_i32_e32 v7, -1, v0
1391 ; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s5, v7
1392 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3
1393 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v7
1394 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
1395 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
1396 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
1397 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v1
1398 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
1399 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
1400 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6
1401 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
1402 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v2
1403 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
1404 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v2
1405 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
1406 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
1407 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v4
1408 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
1409 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1411 ; GFX8-LABEL: v_ssubsat_v3i32:
1413 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1414 ; GFX8-NEXT: s_brev_b32 s4, -2
1415 ; GFX8-NEXT: v_max_i32_e32 v6, -1, v0
1416 ; GFX8-NEXT: s_brev_b32 s5, 1
1417 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6
1418 ; GFX8-NEXT: v_min_i32_e32 v7, -1, v0
1419 ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s5, v7
1420 ; GFX8-NEXT: v_max_i32_e32 v3, v6, v3
1421 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v7
1422 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
1423 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v1
1424 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3
1425 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v1
1426 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6
1427 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
1428 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6
1429 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
1430 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v2
1431 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3
1432 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v2
1433 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s5, v4
1434 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v5
1435 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v4
1436 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
1437 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1439 ; GFX9-LABEL: v_ssubsat_v3i32:
1441 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1442 ; GFX9-NEXT: v_sub_i32 v0, v0, v3 clamp
1443 ; GFX9-NEXT: v_sub_i32 v1, v1, v4 clamp
1444 ; GFX9-NEXT: v_sub_i32 v2, v2, v5 clamp
1445 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1447 ; GFX10PLUS-LABEL: v_ssubsat_v3i32:
1448 ; GFX10PLUS: ; %bb.0:
1449 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1450 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v3 clamp
1451 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v4 clamp
1452 ; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v5 clamp
1453 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1454 %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1455 ret <3 x i32> %result
; <3 x i32> signed saturating subtract (@llvm.ssub.sat.v3i32) in an amdgpu_ps
; function with both vector operands inreg (%lhs in s0-s2, %rhs in s3-s5 in
; the checks).
; The GFX6 and GFX8 checks repeat the scalar s_max/s_min/s_sub sequence once
; per element; the GFX9 and GFX10PLUS checks use one clamped vector subtract
; per element and copy each result back with v_readfirstlane_b32.
; Check lines are autogenerated; regenerate instead of hand-editing.
1458 define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1459 ; GFX6-LABEL: s_ssubsat_v3i32:
1461 ; GFX6-NEXT: s_max_i32 s6, s0, -1
1462 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x7fffffff
1463 ; GFX6-NEXT: s_min_i32 s7, s0, -1
1464 ; GFX6-NEXT: s_sub_i32 s7, s7, 0x80000000
1465 ; GFX6-NEXT: s_max_i32 s3, s6, s3
1466 ; GFX6-NEXT: s_min_i32 s3, s3, s7
1467 ; GFX6-NEXT: s_sub_i32 s0, s0, s3
1468 ; GFX6-NEXT: s_max_i32 s3, s1, -1
1469 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff
1470 ; GFX6-NEXT: s_min_i32 s6, s1, -1
1471 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
1472 ; GFX6-NEXT: s_max_i32 s3, s3, s4
1473 ; GFX6-NEXT: s_min_i32 s3, s3, s6
1474 ; GFX6-NEXT: s_sub_i32 s1, s1, s3
1475 ; GFX6-NEXT: s_max_i32 s3, s2, -1
1476 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff
1477 ; GFX6-NEXT: s_min_i32 s4, s2, -1
1478 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000
1479 ; GFX6-NEXT: s_max_i32 s3, s3, s5
1480 ; GFX6-NEXT: s_min_i32 s3, s3, s4
1481 ; GFX6-NEXT: s_sub_i32 s2, s2, s3
1482 ; GFX6-NEXT: ; return to shader part epilog
1484 ; GFX8-LABEL: s_ssubsat_v3i32:
1486 ; GFX8-NEXT: s_max_i32 s6, s0, -1
1487 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fffffff
1488 ; GFX8-NEXT: s_min_i32 s7, s0, -1
1489 ; GFX8-NEXT: s_sub_i32 s7, s7, 0x80000000
1490 ; GFX8-NEXT: s_max_i32 s3, s6, s3
1491 ; GFX8-NEXT: s_min_i32 s3, s3, s7
1492 ; GFX8-NEXT: s_sub_i32 s0, s0, s3
1493 ; GFX8-NEXT: s_max_i32 s3, s1, -1
1494 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff
1495 ; GFX8-NEXT: s_min_i32 s6, s1, -1
1496 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000
1497 ; GFX8-NEXT: s_max_i32 s3, s3, s4
1498 ; GFX8-NEXT: s_min_i32 s3, s3, s6
1499 ; GFX8-NEXT: s_sub_i32 s1, s1, s3
1500 ; GFX8-NEXT: s_max_i32 s3, s2, -1
1501 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff
1502 ; GFX8-NEXT: s_min_i32 s4, s2, -1
1503 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000
1504 ; GFX8-NEXT: s_max_i32 s3, s3, s5
1505 ; GFX8-NEXT: s_min_i32 s3, s3, s4
1506 ; GFX8-NEXT: s_sub_i32 s2, s2, s3
1507 ; GFX8-NEXT: ; return to shader part epilog
1509 ; GFX9-LABEL: s_ssubsat_v3i32:
1511 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
1512 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1513 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
1514 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
1515 ; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
1516 ; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp
1517 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1518 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1519 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1520 ; GFX9-NEXT: ; return to shader part epilog
1522 ; GFX10PLUS-LABEL: s_ssubsat_v3i32:
1523 ; GFX10PLUS: ; %bb.0:
1524 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s3 clamp
1525 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s4 clamp
1526 ; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s5 clamp
1527 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1528 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1529 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1530 ; GFX10PLUS-NEXT: ; return to shader part epilog
1531 %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1532 ret <3 x i32> %result
; v_ssubsat_v4i32 -- signed saturating subtract (llvm.ssub.sat) on <4 x i32>
; with both operands passed as ordinary (non-inreg) arguments. In the checks
; below lhs occupies v0..v3 and rhs occupies v4..v7.
;
; Expected lowering, per check prefix:
;  * GFX6 and GFX8 checks -- no clamped subtract instruction is used. For each
;    element the rhs is clamped with v_max_i32 / v_min_i32 between
;    (max(lhs, -1) - 0x7fffffff) and (min(lhs, -1) - 0x80000000), then a
;    plain subtract is issued. The two bounds live in s4 / s5 (set up with
;    s_brev_b32) for elements 0..2 and appear as literals for element 3.
;    The two prefixes differ only in the subtract mnemonic (_i32 vs _u32).
;  * GFX9 checks -- one "v_sub_i32 ... clamp" per element.
;  * GFX10PLUS checks -- one "v_sub_nc_i32 ... clamp" per element.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
1535 define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1536 ; GFX6-LABEL: v_ssubsat_v4i32:
1538 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1539 ; GFX6-NEXT: s_brev_b32 s4, -2
1540 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0
1541 ; GFX6-NEXT: s_brev_b32 s5, 1
1542 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8
1543 ; GFX6-NEXT: v_min_i32_e32 v9, -1, v0
1544 ; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s5, v9
1545 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
1546 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v9
1547 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
1548 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v1
1549 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
1550 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1
1551 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8
1552 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5
1553 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8
1554 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
1555 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v2
1556 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
1557 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v2
1558 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
1559 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v6
1560 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
1561 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
1562 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v3
1563 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
1564 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v3
1565 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5
1566 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
1567 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
1568 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
1569 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1571 ; GFX8-LABEL: v_ssubsat_v4i32:
1573 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574 ; GFX8-NEXT: s_brev_b32 s4, -2
1575 ; GFX8-NEXT: v_max_i32_e32 v8, -1, v0
1576 ; GFX8-NEXT: s_brev_b32 s5, 1
1577 ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8
1578 ; GFX8-NEXT: v_min_i32_e32 v9, -1, v0
1579 ; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s5, v9
1580 ; GFX8-NEXT: v_max_i32_e32 v4, v8, v4
1581 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v9
1582 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
1583 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v1
1584 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
1585 ; GFX8-NEXT: v_min_i32_e32 v8, -1, v1
1586 ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s5, v8
1587 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5
1588 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8
1589 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4
1590 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v2
1591 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
1592 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v2
1593 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5
1594 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v6
1595 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5
1596 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4
1597 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v3
1598 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4
1599 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v3
1600 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5
1601 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v7
1602 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5
1603 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4
1604 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1606 ; GFX9-LABEL: v_ssubsat_v4i32:
1608 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1609 ; GFX9-NEXT: v_sub_i32 v0, v0, v4 clamp
1610 ; GFX9-NEXT: v_sub_i32 v1, v1, v5 clamp
1611 ; GFX9-NEXT: v_sub_i32 v2, v2, v6 clamp
1612 ; GFX9-NEXT: v_sub_i32 v3, v3, v7 clamp
1613 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1615 ; GFX10PLUS-LABEL: v_ssubsat_v4i32:
1616 ; GFX10PLUS: ; %bb.0:
1617 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1618 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v4 clamp
1619 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v5 clamp
1620 ; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v6 clamp
1621 ; GFX10PLUS-NEXT: v_sub_nc_i32 v3, v3, v7 clamp
1622 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1623 %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1624 ret <4 x i32> %result
; s_ssubsat_v4i32 -- signed saturating subtract (llvm.ssub.sat) on <4 x i32>
; in an amdgpu_ps function with both operands marked inreg. In the checks
; below lhs occupies s0..s3 and rhs occupies s4..s7; results are left in
; s0..s3.
;
; Expected lowering, per check prefix:
;  * GFX6 and GFX8 checks (identical sequences) -- fully scalar. For each
;    element the rhs is clamped with s_max_i32 / s_min_i32 between
;    (max(lhs, -1) - 0x7fffffff) and (min(lhs, -1) - 0x80000000), then
;    subtracted from lhs with s_sub_i32.
;  * GFX9 checks -- rhs is first copied into v0..v3 with v_mov_b32, then one
;    "v_sub_i32 ... clamp" per element, then v_readfirstlane_b32 moves each
;    result back into an SGPR.
;  * GFX10PLUS checks -- "v_sub_nc_i32 ... clamp" takes both SGPR operands
;    directly (no v_mov_b32 copies), followed by v_readfirstlane_b32.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
1627 define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1628 ; GFX6-LABEL: s_ssubsat_v4i32:
1630 ; GFX6-NEXT: s_max_i32 s8, s0, -1
1631 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff
1632 ; GFX6-NEXT: s_min_i32 s9, s0, -1
1633 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000
1634 ; GFX6-NEXT: s_max_i32 s4, s8, s4
1635 ; GFX6-NEXT: s_min_i32 s4, s4, s9
1636 ; GFX6-NEXT: s_sub_i32 s0, s0, s4
1637 ; GFX6-NEXT: s_max_i32 s4, s1, -1
1638 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1639 ; GFX6-NEXT: s_min_i32 s8, s1, -1
1640 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000
1641 ; GFX6-NEXT: s_max_i32 s4, s4, s5
1642 ; GFX6-NEXT: s_min_i32 s4, s4, s8
1643 ; GFX6-NEXT: s_sub_i32 s1, s1, s4
1644 ; GFX6-NEXT: s_max_i32 s4, s2, -1
1645 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1646 ; GFX6-NEXT: s_min_i32 s5, s2, -1
1647 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000
1648 ; GFX6-NEXT: s_max_i32 s4, s4, s6
1649 ; GFX6-NEXT: s_min_i32 s4, s4, s5
1650 ; GFX6-NEXT: s_sub_i32 s2, s2, s4
1651 ; GFX6-NEXT: s_max_i32 s4, s3, -1
1652 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1653 ; GFX6-NEXT: s_min_i32 s5, s3, -1
1654 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000
1655 ; GFX6-NEXT: s_max_i32 s4, s4, s7
1656 ; GFX6-NEXT: s_min_i32 s4, s4, s5
1657 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
1658 ; GFX6-NEXT: ; return to shader part epilog
1660 ; GFX8-LABEL: s_ssubsat_v4i32:
1662 ; GFX8-NEXT: s_max_i32 s8, s0, -1
1663 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fffffff
1664 ; GFX8-NEXT: s_min_i32 s9, s0, -1
1665 ; GFX8-NEXT: s_sub_i32 s9, s9, 0x80000000
1666 ; GFX8-NEXT: s_max_i32 s4, s8, s4
1667 ; GFX8-NEXT: s_min_i32 s4, s4, s9
1668 ; GFX8-NEXT: s_sub_i32 s0, s0, s4
1669 ; GFX8-NEXT: s_max_i32 s4, s1, -1
1670 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1671 ; GFX8-NEXT: s_min_i32 s8, s1, -1
1672 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x80000000
1673 ; GFX8-NEXT: s_max_i32 s4, s4, s5
1674 ; GFX8-NEXT: s_min_i32 s4, s4, s8
1675 ; GFX8-NEXT: s_sub_i32 s1, s1, s4
1676 ; GFX8-NEXT: s_max_i32 s4, s2, -1
1677 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1678 ; GFX8-NEXT: s_min_i32 s5, s2, -1
1679 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000
1680 ; GFX8-NEXT: s_max_i32 s4, s4, s6
1681 ; GFX8-NEXT: s_min_i32 s4, s4, s5
1682 ; GFX8-NEXT: s_sub_i32 s2, s2, s4
1683 ; GFX8-NEXT: s_max_i32 s4, s3, -1
1684 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff
1685 ; GFX8-NEXT: s_min_i32 s5, s3, -1
1686 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000
1687 ; GFX8-NEXT: s_max_i32 s4, s4, s7
1688 ; GFX8-NEXT: s_min_i32 s4, s4, s5
1689 ; GFX8-NEXT: s_sub_i32 s3, s3, s4
1690 ; GFX8-NEXT: ; return to shader part epilog
1692 ; GFX9-LABEL: s_ssubsat_v4i32:
1694 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1695 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1696 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1697 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1698 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
1699 ; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
1700 ; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp
1701 ; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp
1702 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1703 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1704 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1705 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1706 ; GFX9-NEXT: ; return to shader part epilog
1708 ; GFX10PLUS-LABEL: s_ssubsat_v4i32:
1709 ; GFX10PLUS: ; %bb.0:
1710 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s4 clamp
1711 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s5 clamp
1712 ; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s6 clamp
1713 ; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s7 clamp
1714 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1715 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1716 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1717 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1718 ; GFX10PLUS-NEXT: ; return to shader part epilog
1719 %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1720 ret <4 x i32> %result
; v_ssubsat_v5i32 -- signed saturating subtract (llvm.ssub.sat) on the
; non-power-of-two vector type <5 x i32>, with both operands passed as
; ordinary (non-inreg) arguments. In the checks below lhs occupies v0..v4
; and rhs occupies v5..v9.
;
; Expected lowering, per check prefix:
;  * GFX6 and GFX8 checks -- no clamped subtract instruction is used. For each
;    element the rhs is clamped with v_max_i32 / v_min_i32 between
;    (max(lhs, -1) - 0x7fffffff) and (min(lhs, -1) - 0x80000000), then a
;    plain subtract is issued. The bounds are supplied three different ways
;    within the same function: from s4 / s5 (s_brev_b32) for elements 0..2,
;    from v11 (v_bfrev_b32) for the first bound of element 3, and as
;    literals for the remaining uses. The two prefixes differ only in the
;    subtract mnemonic (_i32 vs _u32).
;  * GFX9 checks -- one "v_sub_i32 ... clamp" per element.
;  * GFX10PLUS checks -- one "v_sub_nc_i32 ... clamp" per element.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
1723 define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1724 ; GFX6-LABEL: v_ssubsat_v5i32:
1726 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1727 ; GFX6-NEXT: s_brev_b32 s4, -2
1728 ; GFX6-NEXT: v_max_i32_e32 v10, -1, v0
1729 ; GFX6-NEXT: s_brev_b32 s5, 1
1730 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10
1731 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v0
1732 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12
1733 ; GFX6-NEXT: v_max_i32_e32 v5, v10, v5
1734 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v12
1735 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
1736 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1
1737 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
1738 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v1
1739 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10
1740 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6
1741 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10
1742 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
1743 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2
1744 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
1745 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2
1746 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
1747 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
1748 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6
1749 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2
1750 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
1751 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3
1752 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11
1753 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3
1754 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6
1755 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8
1756 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6
1757 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
1758 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v4
1759 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5
1760 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v4
1761 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6
1762 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9
1763 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6
1764 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
1765 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1767 ; GFX8-LABEL: v_ssubsat_v5i32:
1769 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1770 ; GFX8-NEXT: s_brev_b32 s4, -2
1771 ; GFX8-NEXT: v_max_i32_e32 v10, -1, v0
1772 ; GFX8-NEXT: s_brev_b32 s5, 1
1773 ; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10
1774 ; GFX8-NEXT: v_min_i32_e32 v12, -1, v0
1775 ; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s5, v12
1776 ; GFX8-NEXT: v_max_i32_e32 v5, v10, v5
1777 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v12
1778 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5
1779 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v1
1780 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5
1781 ; GFX8-NEXT: v_min_i32_e32 v10, -1, v1
1782 ; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s5, v10
1783 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6
1784 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10
1785 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5
1786 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v2
1787 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5
1788 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v2
1789 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6
1790 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7
1791 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6
1792 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2
1793 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5
1794 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3
1795 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11
1796 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3
1797 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6
1798 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8
1799 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6
1800 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5
1801 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v4
1802 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5
1803 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v4
1804 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6
1805 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9
1806 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6
1807 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
1808 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1810 ; GFX9-LABEL: v_ssubsat_v5i32:
1812 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1813 ; GFX9-NEXT: v_sub_i32 v0, v0, v5 clamp
1814 ; GFX9-NEXT: v_sub_i32 v1, v1, v6 clamp
1815 ; GFX9-NEXT: v_sub_i32 v2, v2, v7 clamp
1816 ; GFX9-NEXT: v_sub_i32 v3, v3, v8 clamp
1817 ; GFX9-NEXT: v_sub_i32 v4, v4, v9 clamp
1818 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1820 ; GFX10PLUS-LABEL: v_ssubsat_v5i32:
1821 ; GFX10PLUS: ; %bb.0:
1822 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1823 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v5 clamp
1824 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v6 clamp
1825 ; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v7 clamp
1826 ; GFX10PLUS-NEXT: v_sub_nc_i32 v3, v3, v8 clamp
1827 ; GFX10PLUS-NEXT: v_sub_nc_i32 v4, v4, v9 clamp
1828 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1829 %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1830 ret <5 x i32> %result
; s_ssubsat_v5i32 -- signed saturating subtract (llvm.ssub.sat) on the
; non-power-of-two vector type <5 x i32> in an amdgpu_ps function with both
; operands marked inreg. In the checks below lhs occupies s0..s4 and rhs
; occupies s5..s9; results are left in s0..s4.
;
; Expected lowering, per check prefix:
;  * GFX6 and GFX8 checks (identical sequences) -- fully scalar. For each
;    element the rhs is clamped with s_max_i32 / s_min_i32 between
;    (max(lhs, -1) - 0x7fffffff) and (min(lhs, -1) - 0x80000000), then
;    subtracted from lhs with s_sub_i32.
;  * GFX9 checks -- rhs is first copied into v0..v4 with v_mov_b32, then one
;    "v_sub_i32 ... clamp" per element, then v_readfirstlane_b32 moves each
;    result back into an SGPR.
;  * GFX10PLUS checks -- "v_sub_nc_i32 ... clamp" takes both SGPR operands
;    directly (no v_mov_b32 copies), followed by v_readfirstlane_b32.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
1833 define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1834 ; GFX6-LABEL: s_ssubsat_v5i32:
1836 ; GFX6-NEXT: s_max_i32 s10, s0, -1
1837 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x7fffffff
1838 ; GFX6-NEXT: s_min_i32 s11, s0, -1
1839 ; GFX6-NEXT: s_sub_i32 s11, s11, 0x80000000
1840 ; GFX6-NEXT: s_max_i32 s5, s10, s5
1841 ; GFX6-NEXT: s_min_i32 s5, s5, s11
1842 ; GFX6-NEXT: s_sub_i32 s0, s0, s5
1843 ; GFX6-NEXT: s_max_i32 s5, s1, -1
1844 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1845 ; GFX6-NEXT: s_min_i32 s10, s1, -1
1846 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000
1847 ; GFX6-NEXT: s_max_i32 s5, s5, s6
1848 ; GFX6-NEXT: s_min_i32 s5, s5, s10
1849 ; GFX6-NEXT: s_sub_i32 s1, s1, s5
1850 ; GFX6-NEXT: s_max_i32 s5, s2, -1
1851 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1852 ; GFX6-NEXT: s_min_i32 s6, s2, -1
1853 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
1854 ; GFX6-NEXT: s_max_i32 s5, s5, s7
1855 ; GFX6-NEXT: s_min_i32 s5, s5, s6
1856 ; GFX6-NEXT: s_sub_i32 s2, s2, s5
1857 ; GFX6-NEXT: s_max_i32 s5, s3, -1
1858 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1859 ; GFX6-NEXT: s_min_i32 s6, s3, -1
1860 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
1861 ; GFX6-NEXT: s_max_i32 s5, s5, s8
1862 ; GFX6-NEXT: s_min_i32 s5, s5, s6
1863 ; GFX6-NEXT: s_sub_i32 s3, s3, s5
1864 ; GFX6-NEXT: s_max_i32 s5, s4, -1
1865 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1866 ; GFX6-NEXT: s_min_i32 s6, s4, -1
1867 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
1868 ; GFX6-NEXT: s_max_i32 s5, s5, s9
1869 ; GFX6-NEXT: s_min_i32 s5, s5, s6
1870 ; GFX6-NEXT: s_sub_i32 s4, s4, s5
1871 ; GFX6-NEXT: ; return to shader part epilog
1873 ; GFX8-LABEL: s_ssubsat_v5i32:
1875 ; GFX8-NEXT: s_max_i32 s10, s0, -1
1876 ; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fffffff
1877 ; GFX8-NEXT: s_min_i32 s11, s0, -1
1878 ; GFX8-NEXT: s_sub_i32 s11, s11, 0x80000000
1879 ; GFX8-NEXT: s_max_i32 s5, s10, s5
1880 ; GFX8-NEXT: s_min_i32 s5, s5, s11
1881 ; GFX8-NEXT: s_sub_i32 s0, s0, s5
1882 ; GFX8-NEXT: s_max_i32 s5, s1, -1
1883 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1884 ; GFX8-NEXT: s_min_i32 s10, s1, -1
1885 ; GFX8-NEXT: s_sub_i32 s10, s10, 0x80000000
1886 ; GFX8-NEXT: s_max_i32 s5, s5, s6
1887 ; GFX8-NEXT: s_min_i32 s5, s5, s10
1888 ; GFX8-NEXT: s_sub_i32 s1, s1, s5
1889 ; GFX8-NEXT: s_max_i32 s5, s2, -1
1890 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1891 ; GFX8-NEXT: s_min_i32 s6, s2, -1
1892 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000
1893 ; GFX8-NEXT: s_max_i32 s5, s5, s7
1894 ; GFX8-NEXT: s_min_i32 s5, s5, s6
1895 ; GFX8-NEXT: s_sub_i32 s2, s2, s5
1896 ; GFX8-NEXT: s_max_i32 s5, s3, -1
1897 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1898 ; GFX8-NEXT: s_min_i32 s6, s3, -1
1899 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000
1900 ; GFX8-NEXT: s_max_i32 s5, s5, s8
1901 ; GFX8-NEXT: s_min_i32 s5, s5, s6
1902 ; GFX8-NEXT: s_sub_i32 s3, s3, s5
1903 ; GFX8-NEXT: s_max_i32 s5, s4, -1
1904 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff
1905 ; GFX8-NEXT: s_min_i32 s6, s4, -1
1906 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000
1907 ; GFX8-NEXT: s_max_i32 s5, s5, s9
1908 ; GFX8-NEXT: s_min_i32 s5, s5, s6
1909 ; GFX8-NEXT: s_sub_i32 s4, s4, s5
1910 ; GFX8-NEXT: ; return to shader part epilog
1912 ; GFX9-LABEL: s_ssubsat_v5i32:
1914 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
1915 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1916 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
1917 ; GFX9-NEXT: v_mov_b32_e32 v3, s8
1918 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
1919 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
1920 ; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
1921 ; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp
1922 ; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp
1923 ; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp
1924 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1925 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1926 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1927 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1928 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1929 ; GFX9-NEXT: ; return to shader part epilog
1931 ; GFX10PLUS-LABEL: s_ssubsat_v5i32:
1932 ; GFX10PLUS: ; %bb.0:
1933 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s5 clamp
1934 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s6 clamp
1935 ; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s7 clamp
1936 ; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s8 clamp
1937 ; GFX10PLUS-NEXT: v_sub_nc_i32 v4, s4, s9 clamp
1938 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1939 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1940 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1941 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1942 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
1943 ; GFX10PLUS-NEXT: ; return to shader part epilog
1944 %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1945 ret <5 x i32> %result
; v_ssubsat_v16i32 -- signed saturating subtract (llvm.ssub.sat) on
; <16 x i32> with both operands passed as ordinary (non-inreg) arguments.
; In the checks below lhs occupies v0..v15 and the first fifteen rhs
; elements occupy v16..v30. The sixteenth rhs element is not in a VGPR on
; entry: every prefix loads it from memory addressed through s32 and waits
; with "s_waitcnt vmcnt(0)" before its first use.
; NOTE(review): s32 is presumably the stack pointer and the load an
; argument passed on the stack -- confirm against the calling convention.
;
; Expected lowering, per check prefix:
;  * GFX6 and GFX8 checks -- no clamped subtract instruction is used. For each
;    element the rhs is clamped with v_max_i32 / v_min_i32 between
;    (max(lhs, -1) - bound0) and (min(lhs, -1) - bound1), then a plain
;    subtract is issued. bound0 / bound1 come from s4 / s5 (s_brev_b32 of
;    -2 and 1) for elements 0..2 and from v16 / v18 (v_bfrev_b32 of -2 and
;    1) for elements 3..15. The buffer_load_dword for the last rhs element
;    is issued part-way through the sequence, well ahead of its use.
;    The two prefixes differ only in the subtract mnemonic (_i32 vs _u32).
;  * GFX9 checks -- one "v_sub_i32 ... clamp" per element; the load reuses
;    v16 once element 0 has consumed it.
;  * GFX10 and GFX11 checks -- one "v_sub_nc_i32 ... clamp" per element.
;    These two targets have separate check blocks here instead of sharing
;    the GFX10PLUS prefix because the load differs: buffer_load_dword
;    with s[0:3] on GFX10 versus scratch_load_b32 on GFX11.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
1948 define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1949 ; GFX6-LABEL: v_ssubsat_v16i32:
1951 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1952 ; GFX6-NEXT: s_brev_b32 s4, -2
1953 ; GFX6-NEXT: v_max_i32_e32 v31, -1, v0
1954 ; GFX6-NEXT: v_subrev_i32_e32 v31, vcc, s4, v31
1955 ; GFX6-NEXT: v_max_i32_e32 v16, v31, v16
1956 ; GFX6-NEXT: s_brev_b32 s5, 1
1957 ; GFX6-NEXT: v_min_i32_e32 v31, -1, v0
1958 ; GFX6-NEXT: v_subrev_i32_e32 v31, vcc, s5, v31
1959 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v31
1960 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
1961 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v1
1962 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16
1963 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v17
1964 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v1
1965 ; GFX6-NEXT: v_subrev_i32_e32 v17, vcc, s5, v17
1966 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
1967 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16
1968 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v2
1969 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16
1970 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v2
1971 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v18
1972 ; GFX6-NEXT: v_subrev_i32_e32 v17, vcc, s5, v17
1973 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
1974 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16
1975 ; GFX6-NEXT: v_bfrev_b32_e32 v16, -2
1976 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3
1977 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
1978 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
1979 ; GFX6-NEXT: v_bfrev_b32_e32 v18, 1
1980 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v3
1981 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
1982 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
1983 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17
1984 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v4
1985 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
1986 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v4
1987 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v20
1988 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
1989 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
1990 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17
1991 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v5
1992 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
1993 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v5
1994 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v21
1995 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
1996 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
1997 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v17
1998 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v6
1999 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2000 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v6
2001 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22
2002 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18
2003 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
2004 ; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32
2005 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17
2006 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v7
2007 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2008 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v7
2009 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23
2010 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2011 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2012 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17
2013 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v8
2014 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2015 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v8
2016 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2017 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v24
2018 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2019 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17
2020 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v9
2021 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2022 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v9
2023 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2024 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v25
2025 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2026 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
2027 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v10
2028 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2029 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v10
2030 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2031 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v26
2032 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2033 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17
2034 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v11
2035 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2036 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v11
2037 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2038 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v27
2039 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2040 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17
2041 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v12
2042 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2043 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v12
2044 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2045 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v28
2046 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2047 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17
2048 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v13
2049 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2050 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v13
2051 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2052 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v29
2053 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2054 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17
2055 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v14
2056 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16
2057 ; GFX6-NEXT: v_min_i32_e32 v20, -1, v14
2058 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v20, v18
2059 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v30
2060 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2061 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17
2062 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v15
2063 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v17, v16
2064 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v15
2065 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v18
2066 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2067 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
2068 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
2069 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16
2070 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2072 ; GFX8-LABEL: v_ssubsat_v16i32:
2074 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2075 ; GFX8-NEXT: s_brev_b32 s4, -2
2076 ; GFX8-NEXT: v_max_i32_e32 v31, -1, v0
2077 ; GFX8-NEXT: v_subrev_u32_e32 v31, vcc, s4, v31
2078 ; GFX8-NEXT: v_max_i32_e32 v16, v31, v16
2079 ; GFX8-NEXT: s_brev_b32 s5, 1
2080 ; GFX8-NEXT: v_min_i32_e32 v31, -1, v0
2081 ; GFX8-NEXT: v_subrev_u32_e32 v31, vcc, s5, v31
2082 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v31
2083 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v16
2084 ; GFX8-NEXT: v_max_i32_e32 v16, -1, v1
2085 ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, s4, v16
2086 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v17
2087 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v1
2088 ; GFX8-NEXT: v_subrev_u32_e32 v17, vcc, s5, v17
2089 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
2090 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v16
2091 ; GFX8-NEXT: v_max_i32_e32 v16, -1, v2
2092 ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, s4, v16
2093 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v2
2094 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v18
2095 ; GFX8-NEXT: v_subrev_u32_e32 v17, vcc, s5, v17
2096 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
2097 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v16
2098 ; GFX8-NEXT: v_bfrev_b32_e32 v16, -2
2099 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3
2100 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2101 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
2102 ; GFX8-NEXT: v_bfrev_b32_e32 v18, 1
2103 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v3
2104 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
2105 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2106 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17
2107 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v4
2108 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2109 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v4
2110 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v20
2111 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
2112 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2113 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v17
2114 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v5
2115 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2116 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v5
2117 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v21
2118 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
2119 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2120 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v17
2121 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v6
2122 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2123 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v6
2124 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22
2125 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18
2126 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2127 ; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32
2128 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17
2129 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v7
2130 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2131 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v7
2132 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23
2133 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2134 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2135 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17
2136 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v8
2137 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2138 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v8
2139 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2140 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v24
2141 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2142 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17
2143 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v9
2144 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2145 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v9
2146 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2147 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v25
2148 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2149 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17
2150 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v10
2151 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2152 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v10
2153 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2154 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v26
2155 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2156 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17
2157 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v11
2158 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2159 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v11
2160 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2161 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v27
2162 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2163 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17
2164 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v12
2165 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2166 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v12
2167 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2168 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v28
2169 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2170 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17
2171 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v13
2172 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2173 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v13
2174 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2175 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v29
2176 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2177 ; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17
2178 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v14
2179 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16
2180 ; GFX8-NEXT: v_min_i32_e32 v20, -1, v14
2181 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v20, v18
2182 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v30
2183 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2184 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17
2185 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v15
2186 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v17, v16
2187 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v15
2188 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v18
2189 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2190 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v19
2191 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
2192 ; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16
2193 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2195 ; GFX9-LABEL: v_ssubsat_v16i32:
2197 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2198 ; GFX9-NEXT: v_sub_i32 v0, v0, v16 clamp
2199 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
2200 ; GFX9-NEXT: v_sub_i32 v1, v1, v17 clamp
2201 ; GFX9-NEXT: v_sub_i32 v2, v2, v18 clamp
2202 ; GFX9-NEXT: v_sub_i32 v3, v3, v19 clamp
2203 ; GFX9-NEXT: v_sub_i32 v4, v4, v20 clamp
2204 ; GFX9-NEXT: v_sub_i32 v5, v5, v21 clamp
2205 ; GFX9-NEXT: v_sub_i32 v6, v6, v22 clamp
2206 ; GFX9-NEXT: v_sub_i32 v7, v7, v23 clamp
2207 ; GFX9-NEXT: v_sub_i32 v8, v8, v24 clamp
2208 ; GFX9-NEXT: v_sub_i32 v9, v9, v25 clamp
2209 ; GFX9-NEXT: v_sub_i32 v10, v10, v26 clamp
2210 ; GFX9-NEXT: v_sub_i32 v11, v11, v27 clamp
2211 ; GFX9-NEXT: v_sub_i32 v12, v12, v28 clamp
2212 ; GFX9-NEXT: v_sub_i32 v13, v13, v29 clamp
2213 ; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp
2214 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2215 ; GFX9-NEXT: v_sub_i32 v15, v15, v16 clamp
2216 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2218 ; GFX10-LABEL: v_ssubsat_v16i32:
2220 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2221 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
2222 ; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp
2223 ; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp
2224 ; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp
2225 ; GFX10-NEXT: v_sub_nc_i32 v3, v3, v19 clamp
2226 ; GFX10-NEXT: v_sub_nc_i32 v4, v4, v20 clamp
2227 ; GFX10-NEXT: v_sub_nc_i32 v5, v5, v21 clamp
2228 ; GFX10-NEXT: v_sub_nc_i32 v6, v6, v22 clamp
2229 ; GFX10-NEXT: v_sub_nc_i32 v7, v7, v23 clamp
2230 ; GFX10-NEXT: v_sub_nc_i32 v8, v8, v24 clamp
2231 ; GFX10-NEXT: v_sub_nc_i32 v9, v9, v25 clamp
2232 ; GFX10-NEXT: v_sub_nc_i32 v10, v10, v26 clamp
2233 ; GFX10-NEXT: v_sub_nc_i32 v11, v11, v27 clamp
2234 ; GFX10-NEXT: v_sub_nc_i32 v12, v12, v28 clamp
2235 ; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp
2236 ; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp
2237 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2238 ; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp
2239 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2241 ; GFX11-LABEL: v_ssubsat_v16i32:
2243 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2244 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
2245 ; GFX11-NEXT: v_sub_nc_i32 v0, v0, v16 clamp
2246 ; GFX11-NEXT: v_sub_nc_i32 v1, v1, v17 clamp
2247 ; GFX11-NEXT: v_sub_nc_i32 v2, v2, v18 clamp
2248 ; GFX11-NEXT: v_sub_nc_i32 v3, v3, v19 clamp
2249 ; GFX11-NEXT: v_sub_nc_i32 v4, v4, v20 clamp
2250 ; GFX11-NEXT: v_sub_nc_i32 v5, v5, v21 clamp
2251 ; GFX11-NEXT: v_sub_nc_i32 v6, v6, v22 clamp
2252 ; GFX11-NEXT: v_sub_nc_i32 v7, v7, v23 clamp
2253 ; GFX11-NEXT: v_sub_nc_i32 v8, v8, v24 clamp
2254 ; GFX11-NEXT: v_sub_nc_i32 v9, v9, v25 clamp
2255 ; GFX11-NEXT: v_sub_nc_i32 v10, v10, v26 clamp
2256 ; GFX11-NEXT: v_sub_nc_i32 v11, v11, v27 clamp
2257 ; GFX11-NEXT: v_sub_nc_i32 v12, v12, v28 clamp
2258 ; GFX11-NEXT: v_sub_nc_i32 v13, v13, v29 clamp
2259 ; GFX11-NEXT: v_sub_nc_i32 v14, v14, v30 clamp
2260 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2261 ; GFX11-NEXT: v_sub_nc_i32 v15, v15, v31 clamp
2262 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2263 %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2264 ret <16 x i32> %result
; Signed saturating subtract of two <16 x i32> vectors, both passed inreg to an
; amdgpu_ps function. Expected code per the checks below:
;  - GFX6 and GFX8 expand every element into a seven-instruction scalar
;    s_max_i32 / s_min_i32 / s_sub_i32 sequence using the 0x7fffffff and
;    0x80000000 limits.
;  - GFX9 copies s16-s31 into VGPRs, uses v_sub_i32 with clamp, and reads each
;    result back with v_readfirstlane_b32.
;  - GFX10 and later use v_sub_nc_i32 with clamp directly on the s-register
;    operands, followed by v_readfirstlane_b32.
2267 define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
2268 ; GFX6-LABEL: s_ssubsat_v16i32:
2270 ; GFX6-NEXT: s_max_i32 s32, s0, -1
2271 ; GFX6-NEXT: s_sub_i32 s32, s32, 0x7fffffff
2272 ; GFX6-NEXT: s_min_i32 s33, s0, -1
2273 ; GFX6-NEXT: s_sub_i32 s33, s33, 0x80000000
2274 ; GFX6-NEXT: s_max_i32 s16, s32, s16
2275 ; GFX6-NEXT: s_min_i32 s16, s16, s33
2276 ; GFX6-NEXT: s_sub_i32 s0, s0, s16
2277 ; GFX6-NEXT: s_max_i32 s16, s1, -1
2278 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2279 ; GFX6-NEXT: s_min_i32 s32, s1, -1
2280 ; GFX6-NEXT: s_sub_i32 s32, s32, 0x80000000
2281 ; GFX6-NEXT: s_max_i32 s16, s16, s17
2282 ; GFX6-NEXT: s_min_i32 s16, s16, s32
2283 ; GFX6-NEXT: s_sub_i32 s1, s1, s16
2284 ; GFX6-NEXT: s_max_i32 s16, s2, -1
2285 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2286 ; GFX6-NEXT: s_min_i32 s17, s2, -1
2287 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2288 ; GFX6-NEXT: s_max_i32 s16, s16, s18
2289 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2290 ; GFX6-NEXT: s_sub_i32 s2, s2, s16
2291 ; GFX6-NEXT: s_max_i32 s16, s3, -1
2292 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2293 ; GFX6-NEXT: s_min_i32 s17, s3, -1
2294 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2295 ; GFX6-NEXT: s_max_i32 s16, s16, s19
2296 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2297 ; GFX6-NEXT: s_sub_i32 s3, s3, s16
2298 ; GFX6-NEXT: s_max_i32 s16, s4, -1
2299 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2300 ; GFX6-NEXT: s_min_i32 s17, s4, -1
2301 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2302 ; GFX6-NEXT: s_max_i32 s16, s16, s20
2303 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2304 ; GFX6-NEXT: s_sub_i32 s4, s4, s16
2305 ; GFX6-NEXT: s_max_i32 s16, s5, -1
2306 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2307 ; GFX6-NEXT: s_min_i32 s17, s5, -1
2308 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2309 ; GFX6-NEXT: s_max_i32 s16, s16, s21
2310 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2311 ; GFX6-NEXT: s_sub_i32 s5, s5, s16
2312 ; GFX6-NEXT: s_max_i32 s16, s6, -1
2313 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2314 ; GFX6-NEXT: s_min_i32 s17, s6, -1
2315 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2316 ; GFX6-NEXT: s_max_i32 s16, s16, s22
2317 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2318 ; GFX6-NEXT: s_sub_i32 s6, s6, s16
2319 ; GFX6-NEXT: s_max_i32 s16, s7, -1
2320 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2321 ; GFX6-NEXT: s_min_i32 s17, s7, -1
2322 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2323 ; GFX6-NEXT: s_max_i32 s16, s16, s23
2324 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2325 ; GFX6-NEXT: s_sub_i32 s7, s7, s16
2326 ; GFX6-NEXT: s_max_i32 s16, s8, -1
2327 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2328 ; GFX6-NEXT: s_min_i32 s17, s8, -1
2329 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2330 ; GFX6-NEXT: s_max_i32 s16, s16, s24
2331 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2332 ; GFX6-NEXT: s_sub_i32 s8, s8, s16
2333 ; GFX6-NEXT: s_max_i32 s16, s9, -1
2334 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2335 ; GFX6-NEXT: s_min_i32 s17, s9, -1
2336 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2337 ; GFX6-NEXT: s_max_i32 s16, s16, s25
2338 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2339 ; GFX6-NEXT: s_sub_i32 s9, s9, s16
2340 ; GFX6-NEXT: s_max_i32 s16, s10, -1
2341 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2342 ; GFX6-NEXT: s_min_i32 s17, s10, -1
2343 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2344 ; GFX6-NEXT: s_max_i32 s16, s16, s26
2345 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2346 ; GFX6-NEXT: s_sub_i32 s10, s10, s16
2347 ; GFX6-NEXT: s_max_i32 s16, s11, -1
2348 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2349 ; GFX6-NEXT: s_min_i32 s17, s11, -1
2350 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2351 ; GFX6-NEXT: s_max_i32 s16, s16, s27
2352 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2353 ; GFX6-NEXT: s_sub_i32 s11, s11, s16
2354 ; GFX6-NEXT: s_max_i32 s16, s12, -1
2355 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2356 ; GFX6-NEXT: s_min_i32 s17, s12, -1
2357 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2358 ; GFX6-NEXT: s_max_i32 s16, s16, s28
2359 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2360 ; GFX6-NEXT: s_sub_i32 s12, s12, s16
2361 ; GFX6-NEXT: s_max_i32 s16, s13, -1
2362 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2363 ; GFX6-NEXT: s_min_i32 s17, s13, -1
2364 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2365 ; GFX6-NEXT: s_max_i32 s16, s16, s29
2366 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2367 ; GFX6-NEXT: s_sub_i32 s13, s13, s16
2368 ; GFX6-NEXT: s_max_i32 s16, s14, -1
2369 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2370 ; GFX6-NEXT: s_min_i32 s17, s14, -1
2371 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2372 ; GFX6-NEXT: s_max_i32 s16, s16, s30
2373 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2374 ; GFX6-NEXT: s_sub_i32 s14, s14, s16
2375 ; GFX6-NEXT: s_max_i32 s16, s15, -1
2376 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2377 ; GFX6-NEXT: s_min_i32 s17, s15, -1
2378 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
2379 ; GFX6-NEXT: s_max_i32 s16, s16, s31
2380 ; GFX6-NEXT: s_min_i32 s16, s16, s17
2381 ; GFX6-NEXT: s_sub_i32 s15, s15, s16
2382 ; GFX6-NEXT: ; return to shader part epilog
2384 ; GFX8-LABEL: s_ssubsat_v16i32:
2386 ; GFX8-NEXT: s_max_i32 s32, s0, -1
2387 ; GFX8-NEXT: s_sub_i32 s32, s32, 0x7fffffff
2388 ; GFX8-NEXT: s_min_i32 s33, s0, -1
2389 ; GFX8-NEXT: s_sub_i32 s33, s33, 0x80000000
2390 ; GFX8-NEXT: s_max_i32 s16, s32, s16
2391 ; GFX8-NEXT: s_min_i32 s16, s16, s33
2392 ; GFX8-NEXT: s_sub_i32 s0, s0, s16
2393 ; GFX8-NEXT: s_max_i32 s16, s1, -1
2394 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2395 ; GFX8-NEXT: s_min_i32 s32, s1, -1
2396 ; GFX8-NEXT: s_sub_i32 s32, s32, 0x80000000
2397 ; GFX8-NEXT: s_max_i32 s16, s16, s17
2398 ; GFX8-NEXT: s_min_i32 s16, s16, s32
2399 ; GFX8-NEXT: s_sub_i32 s1, s1, s16
2400 ; GFX8-NEXT: s_max_i32 s16, s2, -1
2401 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2402 ; GFX8-NEXT: s_min_i32 s17, s2, -1
2403 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2404 ; GFX8-NEXT: s_max_i32 s16, s16, s18
2405 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2406 ; GFX8-NEXT: s_sub_i32 s2, s2, s16
2407 ; GFX8-NEXT: s_max_i32 s16, s3, -1
2408 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2409 ; GFX8-NEXT: s_min_i32 s17, s3, -1
2410 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2411 ; GFX8-NEXT: s_max_i32 s16, s16, s19
2412 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2413 ; GFX8-NEXT: s_sub_i32 s3, s3, s16
2414 ; GFX8-NEXT: s_max_i32 s16, s4, -1
2415 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2416 ; GFX8-NEXT: s_min_i32 s17, s4, -1
2417 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2418 ; GFX8-NEXT: s_max_i32 s16, s16, s20
2419 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2420 ; GFX8-NEXT: s_sub_i32 s4, s4, s16
2421 ; GFX8-NEXT: s_max_i32 s16, s5, -1
2422 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2423 ; GFX8-NEXT: s_min_i32 s17, s5, -1
2424 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2425 ; GFX8-NEXT: s_max_i32 s16, s16, s21
2426 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2427 ; GFX8-NEXT: s_sub_i32 s5, s5, s16
2428 ; GFX8-NEXT: s_max_i32 s16, s6, -1
2429 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2430 ; GFX8-NEXT: s_min_i32 s17, s6, -1
2431 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2432 ; GFX8-NEXT: s_max_i32 s16, s16, s22
2433 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2434 ; GFX8-NEXT: s_sub_i32 s6, s6, s16
2435 ; GFX8-NEXT: s_max_i32 s16, s7, -1
2436 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2437 ; GFX8-NEXT: s_min_i32 s17, s7, -1
2438 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2439 ; GFX8-NEXT: s_max_i32 s16, s16, s23
2440 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2441 ; GFX8-NEXT: s_sub_i32 s7, s7, s16
2442 ; GFX8-NEXT: s_max_i32 s16, s8, -1
2443 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2444 ; GFX8-NEXT: s_min_i32 s17, s8, -1
2445 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2446 ; GFX8-NEXT: s_max_i32 s16, s16, s24
2447 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2448 ; GFX8-NEXT: s_sub_i32 s8, s8, s16
2449 ; GFX8-NEXT: s_max_i32 s16, s9, -1
2450 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2451 ; GFX8-NEXT: s_min_i32 s17, s9, -1
2452 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2453 ; GFX8-NEXT: s_max_i32 s16, s16, s25
2454 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2455 ; GFX8-NEXT: s_sub_i32 s9, s9, s16
2456 ; GFX8-NEXT: s_max_i32 s16, s10, -1
2457 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2458 ; GFX8-NEXT: s_min_i32 s17, s10, -1
2459 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2460 ; GFX8-NEXT: s_max_i32 s16, s16, s26
2461 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2462 ; GFX8-NEXT: s_sub_i32 s10, s10, s16
2463 ; GFX8-NEXT: s_max_i32 s16, s11, -1
2464 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2465 ; GFX8-NEXT: s_min_i32 s17, s11, -1
2466 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2467 ; GFX8-NEXT: s_max_i32 s16, s16, s27
2468 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2469 ; GFX8-NEXT: s_sub_i32 s11, s11, s16
2470 ; GFX8-NEXT: s_max_i32 s16, s12, -1
2471 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2472 ; GFX8-NEXT: s_min_i32 s17, s12, -1
2473 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2474 ; GFX8-NEXT: s_max_i32 s16, s16, s28
2475 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2476 ; GFX8-NEXT: s_sub_i32 s12, s12, s16
2477 ; GFX8-NEXT: s_max_i32 s16, s13, -1
2478 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2479 ; GFX8-NEXT: s_min_i32 s17, s13, -1
2480 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2481 ; GFX8-NEXT: s_max_i32 s16, s16, s29
2482 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2483 ; GFX8-NEXT: s_sub_i32 s13, s13, s16
2484 ; GFX8-NEXT: s_max_i32 s16, s14, -1
2485 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2486 ; GFX8-NEXT: s_min_i32 s17, s14, -1
2487 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2488 ; GFX8-NEXT: s_max_i32 s16, s16, s30
2489 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2490 ; GFX8-NEXT: s_sub_i32 s14, s14, s16
2491 ; GFX8-NEXT: s_max_i32 s16, s15, -1
2492 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff
2493 ; GFX8-NEXT: s_min_i32 s17, s15, -1
2494 ; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000
2495 ; GFX8-NEXT: s_max_i32 s16, s16, s31
2496 ; GFX8-NEXT: s_min_i32 s16, s16, s17
2497 ; GFX8-NEXT: s_sub_i32 s15, s15, s16
2498 ; GFX8-NEXT: ; return to shader part epilog
2500 ; GFX9-LABEL: s_ssubsat_v16i32:
2502 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
2503 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
2504 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
2505 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
2506 ; GFX9-NEXT: v_mov_b32_e32 v4, s20
2507 ; GFX9-NEXT: v_mov_b32_e32 v5, s21
2508 ; GFX9-NEXT: v_mov_b32_e32 v6, s22
2509 ; GFX9-NEXT: v_mov_b32_e32 v7, s23
2510 ; GFX9-NEXT: v_mov_b32_e32 v8, s24
2511 ; GFX9-NEXT: v_mov_b32_e32 v9, s25
2512 ; GFX9-NEXT: v_mov_b32_e32 v10, s26
2513 ; GFX9-NEXT: v_mov_b32_e32 v11, s27
2514 ; GFX9-NEXT: v_mov_b32_e32 v12, s28
2515 ; GFX9-NEXT: v_mov_b32_e32 v13, s29
2516 ; GFX9-NEXT: v_mov_b32_e32 v14, s30
2517 ; GFX9-NEXT: v_mov_b32_e32 v15, s31
2518 ; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
2519 ; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp
2520 ; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp
2521 ; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp
2522 ; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp
2523 ; GFX9-NEXT: v_sub_i32 v5, s5, v5 clamp
2524 ; GFX9-NEXT: v_sub_i32 v6, s6, v6 clamp
2525 ; GFX9-NEXT: v_sub_i32 v7, s7, v7 clamp
2526 ; GFX9-NEXT: v_sub_i32 v8, s8, v8 clamp
2527 ; GFX9-NEXT: v_sub_i32 v9, s9, v9 clamp
2528 ; GFX9-NEXT: v_sub_i32 v10, s10, v10 clamp
2529 ; GFX9-NEXT: v_sub_i32 v11, s11, v11 clamp
2530 ; GFX9-NEXT: v_sub_i32 v12, s12, v12 clamp
2531 ; GFX9-NEXT: v_sub_i32 v13, s13, v13 clamp
2532 ; GFX9-NEXT: v_sub_i32 v14, s14, v14 clamp
2533 ; GFX9-NEXT: v_sub_i32 v15, s15, v15 clamp
2534 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2535 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2536 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2537 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
2538 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
2539 ; GFX9-NEXT: v_readfirstlane_b32 s5, v5
2540 ; GFX9-NEXT: v_readfirstlane_b32 s6, v6
2541 ; GFX9-NEXT: v_readfirstlane_b32 s7, v7
2542 ; GFX9-NEXT: v_readfirstlane_b32 s8, v8
2543 ; GFX9-NEXT: v_readfirstlane_b32 s9, v9
2544 ; GFX9-NEXT: v_readfirstlane_b32 s10, v10
2545 ; GFX9-NEXT: v_readfirstlane_b32 s11, v11
2546 ; GFX9-NEXT: v_readfirstlane_b32 s12, v12
2547 ; GFX9-NEXT: v_readfirstlane_b32 s13, v13
2548 ; GFX9-NEXT: v_readfirstlane_b32 s14, v14
2549 ; GFX9-NEXT: v_readfirstlane_b32 s15, v15
2550 ; GFX9-NEXT: ; return to shader part epilog
2552 ; GFX10PLUS-LABEL: s_ssubsat_v16i32:
2553 ; GFX10PLUS: ; %bb.0:
2554 ; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s16 clamp
2555 ; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s17 clamp
2556 ; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s18 clamp
2557 ; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s19 clamp
2558 ; GFX10PLUS-NEXT: v_sub_nc_i32 v4, s4, s20 clamp
2559 ; GFX10PLUS-NEXT: v_sub_nc_i32 v5, s5, s21 clamp
2560 ; GFX10PLUS-NEXT: v_sub_nc_i32 v6, s6, s22 clamp
2561 ; GFX10PLUS-NEXT: v_sub_nc_i32 v7, s7, s23 clamp
2562 ; GFX10PLUS-NEXT: v_sub_nc_i32 v8, s8, s24 clamp
2563 ; GFX10PLUS-NEXT: v_sub_nc_i32 v9, s9, s25 clamp
2564 ; GFX10PLUS-NEXT: v_sub_nc_i32 v10, s10, s26 clamp
2565 ; GFX10PLUS-NEXT: v_sub_nc_i32 v11, s11, s27 clamp
2566 ; GFX10PLUS-NEXT: v_sub_nc_i32 v12, s12, s28 clamp
2567 ; GFX10PLUS-NEXT: v_sub_nc_i32 v13, s13, s29 clamp
2568 ; GFX10PLUS-NEXT: v_sub_nc_i32 v14, s14, s30 clamp
2569 ; GFX10PLUS-NEXT: v_sub_nc_i32 v15, s15, s31 clamp
2570 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2571 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2572 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
2573 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
2574 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
2575 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5
2576 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6
2577 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7
2578 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8
2579 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9
2580 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10
2581 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11
2582 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12
2583 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13
2584 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14
2585 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15
2586 ; GFX10PLUS-NEXT: ; return to shader part epilog
2587 %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2588 ret <16 x i32> %result
; Signed saturating subtract of two i16 values in a default-convention function
; (operands arrive in v0 and v1). Expected code per the checks below:
;  - GFX6 shifts both operands left by 16, clamps with 32-bit min/max against
;    0x7fffffff / 0x80000000, subtracts, and shifts the result back with an
;    arithmetic right shift by 16.
;  - GFX8 does the same clamp with 16-bit v_max_i16 / v_min_i16 and the
;    0x7fff / 0x8000 limits, with no shifts.
;  - GFX9 uses a single v_sub_i16 with clamp; GFX10 and later use v_sub_nc_i16
;    with clamp.
2591 define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
2592 ; GFX6-LABEL: v_ssubsat_i16:
2594 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2595 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2596 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
2597 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2598 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
2599 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0
2600 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3
2601 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
2602 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3
2603 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
2604 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2605 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2607 ; GFX8-LABEL: v_ssubsat_i16:
2609 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2610 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0
2611 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
2612 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0
2613 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
2614 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1
2615 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3
2616 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
2617 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2619 ; GFX9-LABEL: v_ssubsat_i16:
2621 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2622 ; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
2623 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2625 ; GFX10PLUS-LABEL: v_ssubsat_i16:
2626 ; GFX10PLUS: ; %bb.0:
2627 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2628 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
2629 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2630 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
; Signed saturating subtract of two i16 values, both passed inreg to an
; amdgpu_ps function (operands arrive in s0 and s1). Expected code per the
; checks below:
;  - GFX6 shifts into the high half with s_lshl_b32, clamps with 32-bit scalar
;    min/max, subtracts, and shifts back with s_ashr_i32.
;  - GFX8 sign-extends with s_sext_i32_i16 and clamps with 32-bit scalar
;    min/max against 0x7fff / 0xffff8000.
;  - GFX9 moves s1 into a VGPR and uses v_sub_i16 with clamp; GFX10 and later
;    use v_sub_nc_i16 with clamp on s0 and s1 directly. Both read the result
;    back with v_readfirstlane_b32.
2634 define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
2635 ; GFX6-LABEL: s_ssubsat_i16:
2637 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2638 ; GFX6-NEXT: s_max_i32 s2, s0, -1
2639 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2640 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff
2641 ; GFX6-NEXT: s_min_i32 s3, s0, -1
2642 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000
2643 ; GFX6-NEXT: s_max_i32 s1, s2, s1
2644 ; GFX6-NEXT: s_min_i32 s1, s1, s3
2645 ; GFX6-NEXT: s_sub_i32 s0, s0, s1
2646 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
2647 ; GFX6-NEXT: ; return to shader part epilog
2649 ; GFX8-LABEL: s_ssubsat_i16:
2651 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
2652 ; GFX8-NEXT: s_sext_i32_i16 s3, -1
2653 ; GFX8-NEXT: s_max_i32 s4, s2, s3
2654 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
2655 ; GFX8-NEXT: s_min_i32 s2, s2, s3
2656 ; GFX8-NEXT: s_sext_i32_i16 s3, s4
2657 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2658 ; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
2659 ; GFX8-NEXT: s_max_i32 s1, s3, s1
2660 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2661 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
2662 ; GFX8-NEXT: s_min_i32 s1, s1, s2
2663 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
2664 ; GFX8-NEXT: ; return to shader part epilog
2666 ; GFX9-LABEL: s_ssubsat_i16:
2668 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
2669 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
2670 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2671 ; GFX9-NEXT: ; return to shader part epilog
2673 ; GFX10PLUS-LABEL: s_ssubsat_i16:
2674 ; GFX10PLUS: ; %bb.0:
2675 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
2676 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2677 ; GFX10PLUS-NEXT: ; return to shader part epilog
2678 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
; Mixed-operand i16 signed saturating subtract: %lhs is inreg (s0 in the checks)
; and %rhs is a plain argument (v0). The i16 result is bitcast to half, which is
; the function's return type. Expected code per the checks below:
;  - GFX6 and GFX8 compute the clamp bounds from s0 with scalar instructions
;    and then apply them to v0 with vector min/max before the final subtract.
;  - GFX9 uses v_sub_i16 with clamp; GFX10 and later use v_sub_nc_i16 with
;    clamp, both taking s0 as the first source operand.
2682 define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
2683 ; GFX6-LABEL: ssubsat_i16_sv:
2685 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2686 ; GFX6-NEXT: s_max_i32 s1, s0, -1
2687 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2688 ; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff
2689 ; GFX6-NEXT: s_min_i32 s2, s0, -1
2690 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000
2691 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0
2692 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0
2693 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
2694 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2695 ; GFX6-NEXT: ; return to shader part epilog
2697 ; GFX8-LABEL: ssubsat_i16_sv:
2699 ; GFX8-NEXT: s_sext_i32_i16 s1, s0
2700 ; GFX8-NEXT: s_sext_i32_i16 s2, -1
2701 ; GFX8-NEXT: s_max_i32 s3, s1, s2
2702 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fff
2703 ; GFX8-NEXT: s_min_i32 s1, s1, s2
2704 ; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000
2705 ; GFX8-NEXT: v_max_i16_e32 v0, s3, v0
2706 ; GFX8-NEXT: v_min_i16_e32 v0, s1, v0
2707 ; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0
2708 ; GFX8-NEXT: ; return to shader part epilog
2710 ; GFX9-LABEL: ssubsat_i16_sv:
2712 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
2713 ; GFX9-NEXT: ; return to shader part epilog
2715 ; GFX10PLUS-LABEL: ssubsat_i16_sv:
2716 ; GFX10PLUS: ; %bb.0:
2717 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, v0 clamp
2718 ; GFX10PLUS-NEXT: ; return to shader part epilog
2719 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2720 %cast = bitcast i16 %result to half
; Mixed-operand i16 signed saturating subtract with the operand kinds swapped
; relative to ssubsat_i16_sv: %lhs is a plain argument (v0 in the checks) and
; %rhs is inreg (s0). The i16 result is bitcast to half. Expected code per the
; checks below:
;  - GFX6 and GFX8 derive the clamp bounds from v0 with vector min/max and use
;    s0 as an operand of the v_max that applies the lower bound.
;  - GFX9 uses v_sub_i16 with clamp; GFX10 and later use v_sub_nc_i16 with
;    clamp, both taking s0 as the second source operand.
2724 define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
2725 ; GFX6-LABEL: ssubsat_i16_vs:
2727 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2728 ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0
2729 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2730 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1
2731 ; GFX6-NEXT: v_min_i32_e32 v2, -1, v0
2732 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2
2733 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1
2734 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2
2735 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
2736 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2737 ; GFX6-NEXT: ; return to shader part epilog
2739 ; GFX8-LABEL: ssubsat_i16_vs:
2741 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v0
2742 ; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1
2743 ; GFX8-NEXT: v_min_i16_e32 v2, -1, v0
2744 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2
2745 ; GFX8-NEXT: v_max_i16_e32 v1, s0, v1
2746 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2
2747 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
2748 ; GFX8-NEXT: ; return to shader part epilog
2750 ; GFX9-LABEL: ssubsat_i16_vs:
2752 ; GFX9-NEXT: v_sub_i16 v0, v0, s0 clamp
2753 ; GFX9-NEXT: ; return to shader part epilog
2755 ; GFX10PLUS-LABEL: ssubsat_i16_vs:
2756 ; GFX10PLUS: ; %bb.0:
2757 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, s0 clamp
2758 ; GFX10PLUS-NEXT: ; return to shader part epilog
2759 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2760 %cast = bitcast i16 %result to half
; Signed saturating subtract of two <2 x i16> vectors in a default-convention
; function. Expected code per the checks below:
;  - GFX6 handles each element in its own 32-bit register (v0/v1 against
;    v2/v3): shift left by 16, 32-bit min/max clamp, subtract, arithmetic
;    shift right by 16.
;  - GFX8 works on packed registers: the low half uses plain 16-bit min/max,
;    the high half is extracted with a 16-bit right shift and SDWA operand
;    selects, and the two halves are merged with v_or_b32.
;  - GFX9 and GFX10 and later use a single v_pk_sub_i16 with clamp.
2764 define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
2765 ; GFX6-LABEL: v_ssubsat_v2i16:
2767 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2768 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2769 ; GFX6-NEXT: s_brev_b32 s4, -2
2770 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
2771 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2772 ; GFX6-NEXT: s_brev_b32 s5, 1
2773 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
2774 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
2775 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
2776 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
2777 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
2778 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2779 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
2780 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
2781 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
2782 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
2783 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
2784 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
2785 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
2786 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
2787 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
2788 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2789 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2790 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2792 ; GFX8-LABEL: v_ssubsat_v2i16:
2794 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2795 ; GFX8-NEXT: v_max_i16_e32 v3, -1, v0
2796 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3
2797 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v0
2798 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2799 ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4
2800 ; GFX8-NEXT: v_max_i16_e32 v3, v3, v1
2801 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4
2802 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2
2803 ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4
2804 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v2
2805 ; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5
2806 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2807 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5
2808 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3
2809 ; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2810 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2811 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2813 ; GFX9-LABEL: v_ssubsat_v2i16:
2815 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2816 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
2817 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2819 ; GFX10PLUS-LABEL: v_ssubsat_v2i16:
2820 ; GFX10PLUS: ; %bb.0:
2821 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2822 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
2823 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2824 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2825 ret <2 x i16> %result
; Signed saturating subtract of two <2 x i16> vectors, both passed inreg to an
; amdgpu_ps function. The vector result is bitcast to i32, which is the
; function's return type. Expected code per the checks below:
;  - GFX6 clamps each element separately with 32-bit scalar min/max on values
;    shifted left by 16, then repacks the halves with s_and_b32 / s_lshl_b32 /
;    s_or_b32.
;  - GFX8 splits the packed inputs with s_lshr_b32, sign-extends with
;    s_sext_i32_i16, clamps against 0x7fff / 0xffff8000, and repacks.
;  - GFX9 moves s1 into a VGPR and uses v_pk_sub_i16 with clamp; GFX10 and
;    later use v_pk_sub_i16 with clamp on s0 and s1 directly. Both read the
;    result back with v_readfirstlane_b32.
2828 define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
2829 ; GFX6-LABEL: s_ssubsat_v2i16:
2831 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2832 ; GFX6-NEXT: s_max_i32 s4, s0, -1
2833 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2834 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff
2835 ; GFX6-NEXT: s_min_i32 s5, s0, -1
2836 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000
2837 ; GFX6-NEXT: s_max_i32 s2, s4, s2
2838 ; GFX6-NEXT: s_min_i32 s2, s2, s5
2839 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2840 ; GFX6-NEXT: s_sub_i32 s0, s0, s2
2841 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16
2842 ; GFX6-NEXT: s_max_i32 s3, s1, -1
2843 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff
2844 ; GFX6-NEXT: s_min_i32 s4, s1, -1
2845 ; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000
2846 ; GFX6-NEXT: s_max_i32 s2, s3, s2
2847 ; GFX6-NEXT: s_min_i32 s2, s2, s4
2848 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
2849 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
2850 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
2851 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2852 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
2853 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2854 ; GFX6-NEXT: s_or_b32 s0, s0, s1
2855 ; GFX6-NEXT: ; return to shader part epilog
2857 ; GFX8-LABEL: s_ssubsat_v2i16:
2859 ; GFX8-NEXT: s_sext_i32_i16 s4, s0
2860 ; GFX8-NEXT: s_sext_i32_i16 s5, -1
2861 ; GFX8-NEXT: s_max_i32 s6, s4, s5
2862 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
2863 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
2864 ; GFX8-NEXT: s_min_i32 s4, s4, s5
2865 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
2866 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2867 ; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000
2868 ; GFX8-NEXT: s_max_i32 s1, s6, s1
2869 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2870 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
2871 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
2872 ; GFX8-NEXT: s_min_i32 s1, s1, s4
2873 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
2874 ; GFX8-NEXT: s_sext_i32_i16 s1, s2
2875 ; GFX8-NEXT: s_max_i32 s4, s1, s5
2876 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
2877 ; GFX8-NEXT: s_min_i32 s1, s1, s5
2878 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
2879 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
2880 ; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000
2881 ; GFX8-NEXT: s_max_i32 s3, s4, s3
2882 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
2883 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2884 ; GFX8-NEXT: s_min_i32 s1, s3, s1
2885 ; GFX8-NEXT: s_sub_i32 s1, s2, s1
2886 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
2887 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
2888 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
2889 ; GFX8-NEXT: s_or_b32 s0, s0, s1
2890 ; GFX8-NEXT: ; return to shader part epilog
2892 ; GFX9-LABEL: s_ssubsat_v2i16:
2894 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
2895 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
2896 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2897 ; GFX9-NEXT: ; return to shader part epilog
2899 ; GFX10PLUS-LABEL: s_ssubsat_v2i16:
2900 ; GFX10PLUS: ; %bb.0:
2901 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
2902 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2903 ; GFX10PLUS-NEXT: ; return to shader part epilog
2904 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2905 %cast = bitcast <2 x i16> %result to i32
; Mixed-operand <2 x i16> signed saturating subtract: %lhs is inreg and %rhs is
; a plain argument. The vector result is bitcast to float, which is the
; function's return type. Expected code per the checks below:
;  - GFX6 computes each element's clamp bounds with scalar instructions,
;    applies them with 32-bit vector min/max, and repacks the two halves with
;    v_and_b32 / v_lshlrev_b32 / v_or_b32.
;  - GFX8 computes the bounds with scalar instructions, uses SDWA operand
;    selects for the high half, and merges the halves with v_or_b32.
;  - GFX9 and GFX10 and later use a single v_pk_sub_i16 with clamp, taking s0
;    as the first source operand.
2909 define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
2910 ; GFX6-LABEL: ssubsat_v2i16_sv:
2912 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2913 ; GFX6-NEXT: s_max_i32 s2, s0, -1
2914 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2915 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff
2916 ; GFX6-NEXT: s_min_i32 s3, s0, -1
2917 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000
2918 ; GFX6-NEXT: v_max_i32_e32 v0, s2, v0
2919 ; GFX6-NEXT: v_min_i32_e32 v0, s3, v0
2920 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
2921 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
2922 ; GFX6-NEXT: s_max_i32 s1, s0, -1
2923 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2924 ; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff
2925 ; GFX6-NEXT: s_min_i32 s2, s0, -1
2926 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000
2927 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1
2928 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1
2929 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
2930 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2931 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2932 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2933 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2934 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2935 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
2936 ; GFX6-NEXT: ; return to shader part epilog
2938 ; GFX8-LABEL: ssubsat_v2i16_sv:
2940 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
2941 ; GFX8-NEXT: s_sext_i32_i16 s3, -1
2942 ; GFX8-NEXT: s_max_i32 s4, s2, s3
2943 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
2944 ; GFX8-NEXT: s_min_i32 s2, s2, s3
2945 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
2946 ; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
2947 ; GFX8-NEXT: v_max_i16_e32 v1, s4, v0
2948 ; GFX8-NEXT: v_min_i16_e32 v1, s2, v1
2949 ; GFX8-NEXT: s_sext_i32_i16 s2, s1
2950 ; GFX8-NEXT: s_max_i32 s4, s2, s3
2951 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
2952 ; GFX8-NEXT: s_min_i32 s2, s2, s3
2953 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2954 ; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
2955 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2956 ; GFX8-NEXT: v_min_i16_e32 v0, s2, v0
2957 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2958 ; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1
2959 ; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2960 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
2961 ; GFX8-NEXT: ; return to shader part epilog
2963 ; GFX9-LABEL: ssubsat_v2i16_sv:
2965 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
2966 ; GFX9-NEXT: ; return to shader part epilog
2968 ; GFX10PLUS-LABEL: ssubsat_v2i16_sv:
2969 ; GFX10PLUS: ; %bb.0:
2970 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
2971 ; GFX10PLUS-NEXT: ; return to shader part epilog
2972 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2973 %cast = bitcast <2 x i16> %result to float
; Mixed-operand <2 x i16> signed saturating subtract with the operand kinds
; swapped relative to ssubsat_v2i16_sv: %lhs is a plain argument and %rhs is
; inreg. The vector result is bitcast to float. Expected code per the checks
; below:
;  - GFX6 materialises the two limits with s_brev_b32, clamps each element
;    with 32-bit vector min/max, and repacks the halves with v_and_b32 /
;    v_lshlrev_b32 / v_or_b32.
;  - GFX8 extracts the high halves with v_lshrrev_b32 / s_lshr_b32, clamps
;    with 16-bit min/max, and merges via an SDWA subtract plus v_or_b32.
;  - GFX9 and GFX10 and later use a single v_pk_sub_i16 with clamp, taking s0
;    as the second source operand.
2977 define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
2978 ; GFX6-LABEL: ssubsat_v2i16_vs:
2980 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2981 ; GFX6-NEXT: s_brev_b32 s2, -2
2982 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
2983 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2984 ; GFX6-NEXT: s_brev_b32 s3, 1
2985 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2
2986 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0
2987 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3
2988 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2
2989 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
2990 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2991 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
2992 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1
2993 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
2994 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2
2995 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v1
2996 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3
2997 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2
2998 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
2999 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
3000 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3001 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3002 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3003 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3004 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3005 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3006 ; GFX6-NEXT: ; return to shader part epilog
3008 ; GFX8-LABEL: ssubsat_v2i16_vs:
3010 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0
3011 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
3012 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0
3013 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
3014 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
3015 ; GFX8-NEXT: v_max_i16_e32 v2, s0, v2
3016 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3
3017 ; GFX8-NEXT: v_max_i16_e32 v3, -1, v1
3018 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
3019 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3
3020 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v1
3021 ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4
3022 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3
3023 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4
3024 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2
3025 ; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3026 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
3027 ; GFX8-NEXT: ; return to shader part epilog
3029 ; GFX9-LABEL: ssubsat_v2i16_vs:
3031 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 clamp
3032 ; GFX9-NEXT: ; return to shader part epilog
3034 ; GFX10PLUS-LABEL: ssubsat_v2i16_vs:
3035 ; GFX10PLUS: ; %bb.0:
3036 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, s0 clamp
3037 ; GFX10PLUS-NEXT: ; return to shader part epilog
3038 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
3039 %cast = bitcast <2 x i16> %result to float
3043 ; FIXME: v3i16 insert/extract. The v3i16 tests below are commented out until that is addressed.
3044 ; define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
3045 ; %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3046 ; ret <3 x i16> %result
3049 ; define amdgpu_ps <3 x i16> @s_ssubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
3050 ; %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3051 ; ret <3 x i16> %result
3054 define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
3055 ; GFX6-LABEL: v_ssubsat_v4i16:
3057 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3058 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3059 ; GFX6-NEXT: s_brev_b32 s4, -2
3060 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0
3061 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3062 ; GFX6-NEXT: s_brev_b32 s5, 1
3063 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8
3064 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0
3065 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10
3066 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
3067 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v10
3068 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3069 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
3070 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
3071 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1
3072 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5
3073 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1
3074 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8
3075 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
3076 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3077 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2
3078 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8
3079 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2
3080 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
3081 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
3082 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
3083 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2
3084 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6
3085 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
3086 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3087 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6
3088 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3
3089 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
3090 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
3091 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
3092 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
3093 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3
3094 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11
3095 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4
3096 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3097 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6
3098 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3099 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
3100 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3101 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
3102 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
3103 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3104 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3105 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3106 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
3107 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
3108 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3109 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
3110 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3112 ; GFX8-LABEL: v_ssubsat_v4i16:
3114 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3115 ; GFX8-NEXT: v_max_i16_e32 v6, -1, v0
3116 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6
3117 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v0
3118 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
3119 ; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7
3120 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v2
3121 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7
3122 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v4
3123 ; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7
3124 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v4
3125 ; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8
3126 ; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3127 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v1
3128 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v8
3129 ; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7
3130 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v1
3131 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3132 ; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8
3133 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v3
3134 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v8
3135 ; GFX8-NEXT: v_max_i16_e32 v8, -1, v5
3136 ; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8
3137 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v5
3138 ; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9
3139 ; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3140 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v9
3141 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6
3142 ; GFX8-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3143 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
3144 ; GFX8-NEXT: v_sub_u16_e32 v1, v1, v7
3145 ; GFX8-NEXT: v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3146 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
3147 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3149 ; GFX9-LABEL: v_ssubsat_v4i16:
3151 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3152 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp
3153 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp
3154 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3156 ; GFX10PLUS-LABEL: v_ssubsat_v4i16:
3157 ; GFX10PLUS: ; %bb.0:
3158 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3159 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v2 clamp
3160 ; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v3 clamp
3161 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
3162 %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3163 %cast = bitcast <4 x i16> %result to <2 x float>
3164 ret <2 x float> %cast
; s_ssubsat_v4i16: signed saturating subtract of two <4 x i16> vectors passed
; as `inreg` arguments to an amdgpu_ps function; the result is bitcast to
; <2 x i32>.
; Expected code shapes recorded by the autogenerated assertions below:
;   * tahiti: each lane is shifted left by 16, clamped with s_max_i32 /
;     s_min_i32 against bounds built from 0x7fffffff and 0x80000000,
;     subtracted, shifted back with s_ashr_i32, and the halves are repacked
;     with s_and_b32 / s_lshl_b32 / s_or_b32.
;   * fiji: lanes are sign-extended with s_sext_i32_i16 (high halves are
;     extracted with s_lshr_b32) and clamped against 0x7fff / 0xffff8000.
;   * gfx900: the rhs is copied to VGPRs, v_pk_sub_i16 with clamp is used,
;     and v_readfirstlane_b32 moves the results back to SGPRs.
;   * gfx1010 / gfx1100: v_pk_sub_i16 with clamp takes both SGPR operands
;     directly, followed by v_readfirstlane_b32.
3167 define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
3168 ; GFX6-LABEL: s_ssubsat_v4i16:
3170 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
3171 ; GFX6-NEXT: s_max_i32 s8, s0, -1
3172 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3173 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff
3174 ; GFX6-NEXT: s_min_i32 s9, s0, -1
3175 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000
3176 ; GFX6-NEXT: s_max_i32 s4, s8, s4
3177 ; GFX6-NEXT: s_min_i32 s4, s4, s9
3178 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3179 ; GFX6-NEXT: s_sub_i32 s0, s0, s4
3180 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16
3181 ; GFX6-NEXT: s_max_i32 s5, s1, -1
3182 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
3183 ; GFX6-NEXT: s_min_i32 s8, s1, -1
3184 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000
3185 ; GFX6-NEXT: s_max_i32 s4, s5, s4
3186 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3187 ; GFX6-NEXT: s_min_i32 s4, s4, s8
3188 ; GFX6-NEXT: s_max_i32 s5, s2, -1
3189 ; GFX6-NEXT: s_sub_i32 s1, s1, s4
3190 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16
3191 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
3192 ; GFX6-NEXT: s_min_i32 s6, s2, -1
3193 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
3194 ; GFX6-NEXT: s_max_i32 s4, s5, s4
3195 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3196 ; GFX6-NEXT: s_min_i32 s4, s4, s6
3197 ; GFX6-NEXT: s_max_i32 s5, s3, -1
3198 ; GFX6-NEXT: s_sub_i32 s2, s2, s4
3199 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16
3200 ; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff
3201 ; GFX6-NEXT: s_min_i32 s6, s3, -1
3202 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000
3203 ; GFX6-NEXT: s_max_i32 s4, s5, s4
3204 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
3205 ; GFX6-NEXT: s_min_i32 s4, s4, s6
3206 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
3207 ; GFX6-NEXT: s_sub_i32 s3, s3, s4
3208 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
3209 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16
3210 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16
3211 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
3212 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3213 ; GFX6-NEXT: s_or_b32 s0, s0, s1
3214 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
3215 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
3216 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3217 ; GFX6-NEXT: s_or_b32 s1, s1, s2
3218 ; GFX6-NEXT: ; return to shader part epilog
3220 ; GFX8-LABEL: s_ssubsat_v4i16:
3222 ; GFX8-NEXT: s_sext_i32_i16 s8, s0
3223 ; GFX8-NEXT: s_sext_i32_i16 s9, -1
3224 ; GFX8-NEXT: s_max_i32 s10, s8, s9
3225 ; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff
3226 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16
3227 ; GFX8-NEXT: s_min_i32 s8, s8, s9
3228 ; GFX8-NEXT: s_sext_i32_i16 s10, s10
3229 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3230 ; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000
3231 ; GFX8-NEXT: s_max_i32 s2, s10, s2
3232 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3233 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
3234 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16
3235 ; GFX8-NEXT: s_min_i32 s2, s2, s8
3236 ; GFX8-NEXT: s_sub_i32 s0, s0, s2
3237 ; GFX8-NEXT: s_sext_i32_i16 s2, s4
3238 ; GFX8-NEXT: s_max_i32 s8, s2, s9
3239 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff
3240 ; GFX8-NEXT: s_min_i32 s2, s2, s9
3241 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
3242 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3243 ; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
3244 ; GFX8-NEXT: s_max_i32 s6, s8, s6
3245 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3246 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3247 ; GFX8-NEXT: s_min_i32 s2, s6, s2
3248 ; GFX8-NEXT: s_sub_i32 s2, s4, s2
3249 ; GFX8-NEXT: s_sext_i32_i16 s4, s1
3250 ; GFX8-NEXT: s_max_i32 s6, s4, s9
3251 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
3252 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16
3253 ; GFX8-NEXT: s_min_i32 s4, s4, s9
3254 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3255 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3256 ; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000
3257 ; GFX8-NEXT: s_max_i32 s3, s6, s3
3258 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3259 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3260 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
3261 ; GFX8-NEXT: s_min_i32 s3, s3, s4
3262 ; GFX8-NEXT: s_sub_i32 s1, s1, s3
3263 ; GFX8-NEXT: s_sext_i32_i16 s3, s5
3264 ; GFX8-NEXT: s_max_i32 s4, s3, s9
3265 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
3266 ; GFX8-NEXT: s_min_i32 s3, s3, s9
3267 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3268 ; GFX8-NEXT: s_sext_i32_i16 s6, s7
3269 ; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
3270 ; GFX8-NEXT: s_max_i32 s4, s4, s6
3271 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3272 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3273 ; GFX8-NEXT: s_min_i32 s3, s4, s3
3274 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
3275 ; GFX8-NEXT: s_sub_i32 s3, s5, s3
3276 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
3277 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
3278 ; GFX8-NEXT: s_or_b32 s0, s0, s2
3279 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
3280 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
3281 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
3282 ; GFX8-NEXT: s_or_b32 s1, s1, s2
3283 ; GFX8-NEXT: ; return to shader part epilog
3285 ; GFX9-LABEL: s_ssubsat_v4i16:
3287 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3288 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3289 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
3290 ; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp
3291 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3292 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3293 ; GFX9-NEXT: ; return to shader part epilog
3295 ; GFX10PLUS-LABEL: s_ssubsat_v4i16:
3296 ; GFX10PLUS: ; %bb.0:
3297 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s2 clamp
3298 ; GFX10PLUS-NEXT: v_pk_sub_i16 v1, s1, s3 clamp
3299 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
3300 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
3301 ; GFX10PLUS-NEXT: ; return to shader part epilog
; Saturating subtract on all four lanes, then reinterpret the packed result
; as two 32-bit integers.
3302 %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3303 %cast = bitcast <4 x i16> %result to <2 x i32>
3308 ; define <5 x i16> @v_ssubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
3309 ; %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3310 ; ret <5 x i16> %result
3313 ; define amdgpu_ps <5 x i16> @s_ssubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
3314 ; %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3315 ; ret <5 x i16> %result
; v_ssubsat_v6i16: signed saturating subtract of two <6 x i16> vectors in a
; default-calling-convention function; the result is bitcast to <3 x float>
; and returned.
; Expected code shapes recorded by the autogenerated assertions below:
;   * tahiti: every lane is shifted left by 16 and clamped with 32-bit
;     v_max_i32 / v_min_i32; the bounds come from s_brev_b32 / v_bfrev_b32
;     of -2 and 1. Results are shifted back with v_ashrrev_i32 and repacked
;     with v_and_b32 / v_lshlrev_b32 / v_or_b32.
;   * fiji: 16-bit VALU min/max/sub are used, with SDWA forms selecting
;     WORD_1 for the high halves, clamped against 0x7fff / 0x8000.
;   * gfx900, gfx1010 and gfx1100: three v_pk_sub_i16 instructions with
;     clamp, one per packed dword.
3318 define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
3319 ; GFX6-LABEL: v_ssubsat_v6i16:
3321 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3323 ; GFX6-NEXT: s_brev_b32 s4, -2
3324 ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0
3325 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
3326 ; GFX6-NEXT: s_brev_b32 s5, 1
3327 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12
3328 ; GFX6-NEXT: v_min_i32_e32 v14, -1, v0
3329 ; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, s5, v14
3330 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6
3331 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v14
3332 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3333 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
3334 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7
3335 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v1
3336 ; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s4, v7
3337 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v1
3338 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12
3339 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6
3340 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3341 ; GFX6-NEXT: v_bfrev_b32_e32 v13, -2
3342 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v12
3343 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v2
3344 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
3345 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8
3346 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13
3347 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v2
3348 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8
3349 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6
3350 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3351 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8
3352 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3
3353 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1
3354 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
3355 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9
3356 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13
3357 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3
3358 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15
3359 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6
3360 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3361 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8
3362 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4
3363 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
3364 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10
3365 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13
3366 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v4
3367 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15
3368 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6
3369 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
3370 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8
3371 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5
3372 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
3373 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11
3374 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13
3375 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5
3376 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3377 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15
3378 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6
3379 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3380 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8
3381 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3382 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
3383 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
3384 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6
3385 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3386 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3387 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5
3388 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3389 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
3390 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
3391 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4
3392 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3393 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5
3394 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
3395 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4
3396 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3397 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
3398 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3400 ; GFX8-LABEL: v_ssubsat_v6i16:
3402 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3403 ; GFX8-NEXT: v_max_i16_e32 v9, -1, v0
3404 ; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9
3405 ; GFX8-NEXT: v_min_i16_e32 v10, -1, v0
3406 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
3407 ; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10
3408 ; GFX8-NEXT: v_max_i16_e32 v9, v9, v3
3409 ; GFX8-NEXT: v_min_i16_e32 v9, v9, v10
3410 ; GFX8-NEXT: v_max_i16_e32 v10, -1, v6
3411 ; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10
3412 ; GFX8-NEXT: v_min_i16_e32 v11, -1, v6
3413 ; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11
3414 ; GFX8-NEXT: v_max_i16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3415 ; GFX8-NEXT: v_max_i16_e32 v10, -1, v1
3416 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v11
3417 ; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10
3418 ; GFX8-NEXT: v_min_i16_e32 v11, -1, v1
3419 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
3420 ; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11
3421 ; GFX8-NEXT: v_max_i16_e32 v10, v10, v4
3422 ; GFX8-NEXT: v_min_i16_e32 v10, v10, v11
3423 ; GFX8-NEXT: v_max_i16_e32 v11, -1, v7
3424 ; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11
3425 ; GFX8-NEXT: v_min_i16_e32 v12, -1, v7
3426 ; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12
3427 ; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3428 ; GFX8-NEXT: v_max_i16_e32 v11, -1, v2
3429 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v12
3430 ; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11
3431 ; GFX8-NEXT: v_min_i16_e32 v12, -1, v2
3432 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
3433 ; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12
3434 ; GFX8-NEXT: v_max_i16_e32 v11, v11, v5
3435 ; GFX8-NEXT: v_min_i16_e32 v11, v11, v12
3436 ; GFX8-NEXT: v_max_i16_e32 v12, -1, v8
3437 ; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12
3438 ; GFX8-NEXT: v_min_i16_e32 v13, -1, v8
3439 ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13
3440 ; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3441 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9
3442 ; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3443 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v13
3444 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
3445 ; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10
3446 ; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3447 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
3448 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11
3449 ; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3450 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
3451 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3453 ; GFX9-LABEL: v_ssubsat_v6i16:
3455 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3456 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 clamp
3457 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 clamp
3458 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v5 clamp
3459 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3461 ; GFX10PLUS-LABEL: v_ssubsat_v6i16:
3462 ; GFX10PLUS: ; %bb.0:
3463 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3464 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v3 clamp
3465 ; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v4 clamp
3466 ; GFX10PLUS-NEXT: v_pk_sub_i16 v2, v2, v5 clamp
3467 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
; Saturating subtract on all six lanes, then reinterpret the packed result
; as three floats for the return value.
3468 %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3469 %cast = bitcast <6 x i16> %result to <3 x float>
3470 ret <3 x float> %cast
; s_ssubsat_v6i16: signed saturating subtract of two <6 x i16> vectors passed
; as `inreg` arguments to an amdgpu_ps function; the result is bitcast to
; <3 x i32>.
; Expected code shapes recorded by the autogenerated assertions below:
;   * tahiti: each lane is shifted left by 16, clamped with s_max_i32 /
;     s_min_i32 against bounds built from 0x7fffffff and 0x80000000,
;     subtracted, shifted back with s_ashr_i32, and the halves are repacked
;     with s_and_b32 / s_lshl_b32 / s_or_b32.
;   * fiji: lanes are sign-extended with s_sext_i32_i16 (high halves are
;     extracted with s_lshr_b32) and clamped against 0x7fff / 0xffff8000.
;   * gfx900: the rhs is copied to VGPRs, v_pk_sub_i16 with clamp is used,
;     and v_readfirstlane_b32 moves the results back to SGPRs.
;   * gfx1010 / gfx1100: v_pk_sub_i16 with clamp takes both SGPR operands
;     directly, followed by v_readfirstlane_b32.
3473 define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
3474 ; GFX6-LABEL: s_ssubsat_v6i16:
3476 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
3477 ; GFX6-NEXT: s_max_i32 s12, s0, -1
3478 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
3479 ; GFX6-NEXT: s_sub_i32 s12, s12, 0x7fffffff
3480 ; GFX6-NEXT: s_min_i32 s13, s0, -1
3481 ; GFX6-NEXT: s_sub_i32 s13, s13, 0x80000000
3482 ; GFX6-NEXT: s_max_i32 s6, s12, s6
3483 ; GFX6-NEXT: s_min_i32 s6, s6, s13
3484 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3485 ; GFX6-NEXT: s_sub_i32 s0, s0, s6
3486 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16
3487 ; GFX6-NEXT: s_max_i32 s7, s1, -1
3488 ; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff
3489 ; GFX6-NEXT: s_min_i32 s12, s1, -1
3490 ; GFX6-NEXT: s_sub_i32 s12, s12, 0x80000000
3491 ; GFX6-NEXT: s_max_i32 s6, s7, s6
3492 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3493 ; GFX6-NEXT: s_min_i32 s6, s6, s12
3494 ; GFX6-NEXT: s_max_i32 s7, s2, -1
3495 ; GFX6-NEXT: s_sub_i32 s1, s1, s6
3496 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16
3497 ; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff
3498 ; GFX6-NEXT: s_min_i32 s8, s2, -1
3499 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000
3500 ; GFX6-NEXT: s_max_i32 s6, s7, s6
3501 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3502 ; GFX6-NEXT: s_min_i32 s6, s6, s8
3503 ; GFX6-NEXT: s_max_i32 s7, s3, -1
3504 ; GFX6-NEXT: s_sub_i32 s2, s2, s6
3505 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16
3506 ; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff
3507 ; GFX6-NEXT: s_min_i32 s8, s3, -1
3508 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000
3509 ; GFX6-NEXT: s_max_i32 s6, s7, s6
3510 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3511 ; GFX6-NEXT: s_min_i32 s6, s6, s8
3512 ; GFX6-NEXT: s_max_i32 s7, s4, -1
3513 ; GFX6-NEXT: s_sub_i32 s3, s3, s6
3514 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16
3515 ; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff
3516 ; GFX6-NEXT: s_min_i32 s8, s4, -1
3517 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000
3518 ; GFX6-NEXT: s_max_i32 s6, s7, s6
3519 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
3520 ; GFX6-NEXT: s_min_i32 s6, s6, s8
3521 ; GFX6-NEXT: s_max_i32 s7, s5, -1
3522 ; GFX6-NEXT: s_sub_i32 s4, s4, s6
3523 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16
3524 ; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff
3525 ; GFX6-NEXT: s_min_i32 s8, s5, -1
3526 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
3527 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000
3528 ; GFX6-NEXT: s_max_i32 s6, s7, s6
3529 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
3530 ; GFX6-NEXT: s_min_i32 s6, s6, s8
3531 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
3532 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16
3533 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16
3534 ; GFX6-NEXT: s_sub_i32 s5, s5, s6
3535 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
3536 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3537 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16
3538 ; GFX6-NEXT: s_or_b32 s0, s0, s1
3539 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
3540 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
3541 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16
3542 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3543 ; GFX6-NEXT: s_and_b32 s3, s5, 0xffff
3544 ; GFX6-NEXT: s_or_b32 s1, s1, s2
3545 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
3546 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3547 ; GFX6-NEXT: s_or_b32 s2, s2, s3
3548 ; GFX6-NEXT: ; return to shader part epilog
3550 ; GFX8-LABEL: s_ssubsat_v6i16:
3552 ; GFX8-NEXT: s_sext_i32_i16 s12, s0
3553 ; GFX8-NEXT: s_sext_i32_i16 s13, -1
3554 ; GFX8-NEXT: s_max_i32 s14, s12, s13
3555 ; GFX8-NEXT: s_sub_i32 s14, s14, 0x7fff
3556 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16
3557 ; GFX8-NEXT: s_min_i32 s12, s12, s13
3558 ; GFX8-NEXT: s_sext_i32_i16 s14, s14
3559 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3560 ; GFX8-NEXT: s_sub_i32 s12, s12, 0xffff8000
3561 ; GFX8-NEXT: s_max_i32 s3, s14, s3
3562 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3563 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
3564 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16
3565 ; GFX8-NEXT: s_min_i32 s3, s3, s12
3566 ; GFX8-NEXT: s_sub_i32 s0, s0, s3
3567 ; GFX8-NEXT: s_sext_i32_i16 s3, s6
3568 ; GFX8-NEXT: s_max_i32 s12, s3, s13
3569 ; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff
3570 ; GFX8-NEXT: s_min_i32 s3, s3, s13
3571 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
3572 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
3573 ; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
3574 ; GFX8-NEXT: s_max_i32 s9, s12, s9
3575 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
3576 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3577 ; GFX8-NEXT: s_min_i32 s3, s9, s3
3578 ; GFX8-NEXT: s_sub_i32 s3, s6, s3
3579 ; GFX8-NEXT: s_sext_i32_i16 s6, s1
3580 ; GFX8-NEXT: s_max_i32 s9, s6, s13
3581 ; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff
3582 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16
3583 ; GFX8-NEXT: s_min_i32 s6, s6, s13
3584 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
3585 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3586 ; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000
3587 ; GFX8-NEXT: s_max_i32 s4, s9, s4
3588 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3589 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3590 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16
3591 ; GFX8-NEXT: s_min_i32 s4, s4, s6
3592 ; GFX8-NEXT: s_sub_i32 s1, s1, s4
3593 ; GFX8-NEXT: s_sext_i32_i16 s4, s7
3594 ; GFX8-NEXT: s_max_i32 s6, s4, s13
3595 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
3596 ; GFX8-NEXT: s_min_i32 s4, s4, s13
3597 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3598 ; GFX8-NEXT: s_sext_i32_i16 s9, s10
3599 ; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000
3600 ; GFX8-NEXT: s_max_i32 s6, s6, s9
3601 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3602 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3603 ; GFX8-NEXT: s_min_i32 s4, s6, s4
3604 ; GFX8-NEXT: s_sext_i32_i16 s6, s2
3605 ; GFX8-NEXT: s_sub_i32 s4, s7, s4
3606 ; GFX8-NEXT: s_max_i32 s7, s6, s13
3607 ; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff
3608 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16
3609 ; GFX8-NEXT: s_min_i32 s6, s6, s13
3610 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
3611 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
3612 ; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000
3613 ; GFX8-NEXT: s_max_i32 s5, s7, s5
3614 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
3615 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3616 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16
3617 ; GFX8-NEXT: s_min_i32 s5, s5, s6
3618 ; GFX8-NEXT: s_sub_i32 s2, s2, s5
3619 ; GFX8-NEXT: s_sext_i32_i16 s5, s8
3620 ; GFX8-NEXT: s_max_i32 s6, s5, s13
3621 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
3622 ; GFX8-NEXT: s_min_i32 s5, s5, s13
3623 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3624 ; GFX8-NEXT: s_sext_i32_i16 s7, s11
3625 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
3626 ; GFX8-NEXT: s_max_i32 s6, s6, s7
3627 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
3628 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3629 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
3630 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
3631 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16
3632 ; GFX8-NEXT: s_min_i32 s5, s6, s5
3633 ; GFX8-NEXT: s_or_b32 s0, s0, s3
3634 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
3635 ; GFX8-NEXT: s_sub_i32 s5, s8, s5
3636 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
3637 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16
3638 ; GFX8-NEXT: s_or_b32 s1, s1, s3
3639 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s5
3640 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
3641 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16
3642 ; GFX8-NEXT: s_or_b32 s2, s2, s3
3643 ; GFX8-NEXT: ; return to shader part epilog
3645 ; GFX9-LABEL: s_ssubsat_v6i16:
3647 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
3648 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3649 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
3650 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
3651 ; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp
3652 ; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp
3653 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3654 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3655 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
3656 ; GFX9-NEXT: ; return to shader part epilog
3658 ; GFX10PLUS-LABEL: s_ssubsat_v6i16:
3659 ; GFX10PLUS: ; %bb.0:
3660 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s3 clamp
3661 ; GFX10PLUS-NEXT: v_pk_sub_i16 v1, s1, s4 clamp
3662 ; GFX10PLUS-NEXT: v_pk_sub_i16 v2, s2, s5 clamp
3663 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
3664 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
3665 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
3666 ; GFX10PLUS-NEXT: ; return to shader part epilog
; Saturating subtract on all six lanes, then reinterpret the packed result
; as three 32-bit integers.
3667 %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3668 %cast = bitcast <6 x i16> %result to <3 x i32>
; v_ssubsat_v8i16: signed saturating subtract of two <8 x i16> vectors in a
; default-calling-convention function; the result is bitcast to <4 x float>
; and returned.
; Expected code shapes recorded by the autogenerated assertions below:
;   * tahiti: every lane is shifted left by 16 and clamped with 32-bit
;     v_max_i32 / v_min_i32; the bounds come from s_brev_b32 / v_bfrev_b32
;     of -2 and 1. Results are shifted back with v_ashrrev_i32 and repacked
;     with v_and_b32 / v_lshlrev_b32 / v_or_b32.
;   * fiji: 16-bit VALU min/max/sub are used, with SDWA forms selecting
;     WORD_1 for the high halves, clamped against 0x7fff / 0x8000.
;   * gfx900, gfx1010 and gfx1100: four v_pk_sub_i16 instructions with
;     clamp, one per packed dword.
3672 define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
3673 ; GFX6-LABEL: v_ssubsat_v8i16:
3675 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3676 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3677 ; GFX6-NEXT: s_brev_b32 s4, -2
3678 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0
3679 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
3680 ; GFX6-NEXT: s_brev_b32 s5, 1
3681 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16
3682 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0
3683 ; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, s5, v18
3684 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8
3685 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v18
3686 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3687 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
3688 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9
3689 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v1
3690 ; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s4, v9
3691 ; GFX6-NEXT: v_min_i32_e32 v16, -1, v1
3692 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s5, v16
3693 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8
3694 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3695 ; GFX6-NEXT: v_bfrev_b32_e32 v17, -2
3696 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v16
3697 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v2
3698 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
3699 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10
3700 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
3701 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v2
3702 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10
3703 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8
3704 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3705 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10
3706 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3
3707 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1
3708 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
3709 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11
3710 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
3711 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3
3712 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
3713 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8
3714 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3715 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10
3716 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v4
3717 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8
3718 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12
3719 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
3720 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v4
3721 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
3722 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8
3723 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
3724 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10
3725 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5
3726 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
3727 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13
3728 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
3729 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v5
3730 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
3731 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8
3732 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
3733 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10
3734 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6
3735 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
3736 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14
3737 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
3738 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v6
3739 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
3740 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8
3741 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
3742 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10
3743 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v7
3744 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3745 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
3746 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15
3747 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17
3748 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7
3749 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3750 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
3751 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8
3752 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3753 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
3754 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
3755 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10
3756 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3757 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3758 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5
3759 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8
3760 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3761 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
3762 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
3763 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4
3764 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7
3765 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3766 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5
3767 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6
3768 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
3769 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4
3770 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3771 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7
3772 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
3773 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6
3774 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3775 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
3776 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3778 ; GFX8-LABEL: v_ssubsat_v8i16:
3780 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3781 ; GFX8-NEXT: v_max_i16_e32 v12, -1, v0
3782 ; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12
3783 ; GFX8-NEXT: v_min_i16_e32 v13, -1, v0
3784 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0
3785 ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13
3786 ; GFX8-NEXT: v_max_i16_e32 v12, v12, v4
3787 ; GFX8-NEXT: v_min_i16_e32 v12, v12, v13
3788 ; GFX8-NEXT: v_max_i16_e32 v13, -1, v8
3789 ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13
3790 ; GFX8-NEXT: v_min_i16_e32 v14, -1, v8
3791 ; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14
3792 ; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3793 ; GFX8-NEXT: v_max_i16_e32 v13, -1, v1
3794 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v14
3795 ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13
3796 ; GFX8-NEXT: v_min_i16_e32 v14, -1, v1
3797 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
3798 ; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14
3799 ; GFX8-NEXT: v_max_i16_e32 v13, v13, v5
3800 ; GFX8-NEXT: v_min_i16_e32 v13, v13, v14
3801 ; GFX8-NEXT: v_max_i16_e32 v14, -1, v9
3802 ; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14
3803 ; GFX8-NEXT: v_min_i16_e32 v15, -1, v9
3804 ; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15
3805 ; GFX8-NEXT: v_max_i16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3806 ; GFX8-NEXT: v_max_i16_e32 v14, -1, v2
3807 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v15
3808 ; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14
3809 ; GFX8-NEXT: v_min_i16_e32 v15, -1, v2
3810 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2
3811 ; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15
3812 ; GFX8-NEXT: v_max_i16_e32 v14, v14, v6
3813 ; GFX8-NEXT: v_min_i16_e32 v14, v14, v15
3814 ; GFX8-NEXT: v_max_i16_e32 v15, -1, v10
3815 ; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15
3816 ; GFX8-NEXT: v_min_i16_e32 v16, -1, v10
3817 ; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16
3818 ; GFX8-NEXT: v_max_i16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3819 ; GFX8-NEXT: v_max_i16_e32 v15, -1, v3
3820 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v16
3821 ; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15
3822 ; GFX8-NEXT: v_min_i16_e32 v16, -1, v3
3823 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3
3824 ; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16
3825 ; GFX8-NEXT: v_max_i16_e32 v15, v15, v7
3826 ; GFX8-NEXT: v_min_i16_e32 v15, v15, v16
3827 ; GFX8-NEXT: v_max_i16_e32 v16, -1, v11
3828 ; GFX8-NEXT: v_subrev_u16_e32 v16, 0x7fff, v16
3829 ; GFX8-NEXT: v_min_i16_e32 v17, -1, v11
3830 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12
3831 ; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3832 ; GFX8-NEXT: v_subrev_u16_e32 v17, 0x8000, v17
3833 ; GFX8-NEXT: v_max_i16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3834 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
3835 ; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13
3836 ; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3837 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v17
3838 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
3839 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14
3840 ; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3841 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
3842 ; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15
3843 ; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3844 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
3845 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3847 ; GFX9-LABEL: v_ssubsat_v8i16:
3849 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3850 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 clamp
3851 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v5 clamp
3852 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v6 clamp
3853 ; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7 clamp
3854 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3856 ; GFX10PLUS-LABEL: v_ssubsat_v8i16:
3857 ; GFX10PLUS: ; %bb.0:
3858 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3859 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v4 clamp
3860 ; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v5 clamp
3861 ; GFX10PLUS-NEXT: v_pk_sub_i16 v2, v2, v6 clamp
3862 ; GFX10PLUS-NEXT: v_pk_sub_i16 v3, v3, v7 clamp
3863 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
; Saturating subtract on all eight lanes, then reinterpret the packed result
; as four floats for the return value.
3864 %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
3865 %cast = bitcast <8 x i16> %result to <4 x float>
3866 ret <4 x float> %cast
; Signed saturating subtract of two <8 x i16> vectors, both passed inreg to an
; amdgpu_ps function. The result is bitcast to <4 x i32>, the declared return
; type. The GFX6 and GFX8 check blocks clamp with scalar s_max_i32/s_min_i32
; sequences and repack with s_and/s_lshl/s_or; the GFX9 and GFX10PLUS blocks
; use v_pk_sub_i16 with clamp followed by v_readfirstlane_b32.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
3869 define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
3870 ; GFX6-LABEL: s_ssubsat_v8i16:
3872 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
3873 ; GFX6-NEXT: s_max_i32 s16, s0, -1
3874 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16
3875 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff
3876 ; GFX6-NEXT: s_min_i32 s17, s0, -1
3877 ; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000
3878 ; GFX6-NEXT: s_max_i32 s8, s16, s8
3879 ; GFX6-NEXT: s_min_i32 s8, s8, s17
3880 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3881 ; GFX6-NEXT: s_sub_i32 s0, s0, s8
3882 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16
3883 ; GFX6-NEXT: s_max_i32 s9, s1, -1
3884 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff
3885 ; GFX6-NEXT: s_min_i32 s16, s1, -1
3886 ; GFX6-NEXT: s_sub_i32 s16, s16, 0x80000000
3887 ; GFX6-NEXT: s_max_i32 s8, s9, s8
3888 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3889 ; GFX6-NEXT: s_min_i32 s8, s8, s16
3890 ; GFX6-NEXT: s_max_i32 s9, s2, -1
3891 ; GFX6-NEXT: s_sub_i32 s1, s1, s8
3892 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16
3893 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff
3894 ; GFX6-NEXT: s_min_i32 s10, s2, -1
3895 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000
3896 ; GFX6-NEXT: s_max_i32 s8, s9, s8
3897 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3898 ; GFX6-NEXT: s_min_i32 s8, s8, s10
3899 ; GFX6-NEXT: s_max_i32 s9, s3, -1
3900 ; GFX6-NEXT: s_sub_i32 s2, s2, s8
3901 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16
3902 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff
3903 ; GFX6-NEXT: s_min_i32 s10, s3, -1
3904 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000
3905 ; GFX6-NEXT: s_max_i32 s8, s9, s8
3906 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3907 ; GFX6-NEXT: s_min_i32 s8, s8, s10
3908 ; GFX6-NEXT: s_max_i32 s9, s4, -1
3909 ; GFX6-NEXT: s_sub_i32 s3, s3, s8
3910 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16
3911 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff
3912 ; GFX6-NEXT: s_min_i32 s10, s4, -1
3913 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000
3914 ; GFX6-NEXT: s_max_i32 s8, s9, s8
3915 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
3916 ; GFX6-NEXT: s_min_i32 s8, s8, s10
3917 ; GFX6-NEXT: s_max_i32 s9, s5, -1
3918 ; GFX6-NEXT: s_sub_i32 s4, s4, s8
3919 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16
3920 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff
3921 ; GFX6-NEXT: s_min_i32 s10, s5, -1
3922 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000
3923 ; GFX6-NEXT: s_max_i32 s8, s9, s8
3924 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
3925 ; GFX6-NEXT: s_min_i32 s8, s8, s10
3926 ; GFX6-NEXT: s_max_i32 s9, s6, -1
3927 ; GFX6-NEXT: s_sub_i32 s5, s5, s8
3928 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16
3929 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff
3930 ; GFX6-NEXT: s_min_i32 s10, s6, -1
3931 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000
3932 ; GFX6-NEXT: s_max_i32 s8, s9, s8
3933 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16
3934 ; GFX6-NEXT: s_min_i32 s8, s8, s10
3935 ; GFX6-NEXT: s_max_i32 s9, s7, -1
3936 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
3937 ; GFX6-NEXT: s_sub_i32 s6, s6, s8
3938 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16
3939 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff
3940 ; GFX6-NEXT: s_min_i32 s10, s7, -1
3941 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
3942 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000
3943 ; GFX6-NEXT: s_max_i32 s8, s9, s8
3944 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
3945 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16
3946 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16
3947 ; GFX6-NEXT: s_min_i32 s8, s8, s10
3948 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
3949 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3950 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16
3951 ; GFX6-NEXT: s_sub_i32 s7, s7, s8
3952 ; GFX6-NEXT: s_or_b32 s0, s0, s1
3953 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
3954 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
3955 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16
3956 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16
3957 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3958 ; GFX6-NEXT: s_and_b32 s3, s5, 0xffff
3959 ; GFX6-NEXT: s_ashr_i32 s6, s6, 16
3960 ; GFX6-NEXT: s_or_b32 s1, s1, s2
3961 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
3962 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3963 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
3964 ; GFX6-NEXT: s_or_b32 s2, s2, s3
3965 ; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
3966 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3967 ; GFX6-NEXT: s_or_b32 s3, s3, s4
3968 ; GFX6-NEXT: ; return to shader part epilog
3970 ; GFX8-LABEL: s_ssubsat_v8i16:
3972 ; GFX8-NEXT: s_sext_i32_i16 s16, s0
3973 ; GFX8-NEXT: s_sext_i32_i16 s17, -1
3974 ; GFX8-NEXT: s_max_i32 s18, s16, s17
3975 ; GFX8-NEXT: s_sub_i32 s18, s18, 0x7fff
3976 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16
3977 ; GFX8-NEXT: s_min_i32 s16, s16, s17
3978 ; GFX8-NEXT: s_sext_i32_i16 s18, s18
3979 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3980 ; GFX8-NEXT: s_sub_i32 s16, s16, 0xffff8000
3981 ; GFX8-NEXT: s_max_i32 s4, s18, s4
3982 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3983 ; GFX8-NEXT: s_sext_i32_i16 s16, s16
3984 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16
3985 ; GFX8-NEXT: s_min_i32 s4, s4, s16
3986 ; GFX8-NEXT: s_sub_i32 s0, s0, s4
3987 ; GFX8-NEXT: s_sext_i32_i16 s4, s8
3988 ; GFX8-NEXT: s_max_i32 s16, s4, s17
3989 ; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fff
3990 ; GFX8-NEXT: s_min_i32 s4, s4, s17
3991 ; GFX8-NEXT: s_sext_i32_i16 s16, s16
3992 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
3993 ; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000
3994 ; GFX8-NEXT: s_max_i32 s12, s16, s12
3995 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
3996 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3997 ; GFX8-NEXT: s_min_i32 s4, s12, s4
3998 ; GFX8-NEXT: s_sub_i32 s4, s8, s4
3999 ; GFX8-NEXT: s_sext_i32_i16 s8, s1
4000 ; GFX8-NEXT: s_max_i32 s12, s8, s17
4001 ; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff
4002 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16
4003 ; GFX8-NEXT: s_min_i32 s8, s8, s17
4004 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
4005 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4006 ; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000
4007 ; GFX8-NEXT: s_max_i32 s5, s12, s5
4008 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4009 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4010 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16
4011 ; GFX8-NEXT: s_min_i32 s5, s5, s8
4012 ; GFX8-NEXT: s_sub_i32 s1, s1, s5
4013 ; GFX8-NEXT: s_sext_i32_i16 s5, s9
4014 ; GFX8-NEXT: s_max_i32 s8, s5, s17
4015 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff
4016 ; GFX8-NEXT: s_min_i32 s5, s5, s17
4017 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4018 ; GFX8-NEXT: s_sext_i32_i16 s12, s13
4019 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
4020 ; GFX8-NEXT: s_max_i32 s8, s8, s12
4021 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4022 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4023 ; GFX8-NEXT: s_min_i32 s5, s8, s5
4024 ; GFX8-NEXT: s_sext_i32_i16 s8, s2
4025 ; GFX8-NEXT: s_sub_i32 s5, s9, s5
4026 ; GFX8-NEXT: s_max_i32 s9, s8, s17
4027 ; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff
4028 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16
4029 ; GFX8-NEXT: s_min_i32 s8, s8, s17
4030 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
4031 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
4032 ; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000
4033 ; GFX8-NEXT: s_max_i32 s6, s9, s6
4034 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
4035 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4036 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16
4037 ; GFX8-NEXT: s_min_i32 s6, s6, s8
4038 ; GFX8-NEXT: s_sub_i32 s2, s2, s6
4039 ; GFX8-NEXT: s_sext_i32_i16 s6, s10
4040 ; GFX8-NEXT: s_max_i32 s8, s6, s17
4041 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff
4042 ; GFX8-NEXT: s_min_i32 s6, s6, s17
4043 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4044 ; GFX8-NEXT: s_sext_i32_i16 s9, s14
4045 ; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000
4046 ; GFX8-NEXT: s_max_i32 s8, s8, s9
4047 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4048 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
4049 ; GFX8-NEXT: s_min_i32 s6, s8, s6
4050 ; GFX8-NEXT: s_sext_i32_i16 s8, s3
4051 ; GFX8-NEXT: s_max_i32 s9, s8, s17
4052 ; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff
4053 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16
4054 ; GFX8-NEXT: s_min_i32 s8, s8, s17
4055 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
4056 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
4057 ; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000
4058 ; GFX8-NEXT: s_max_i32 s7, s9, s7
4059 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
4060 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4061 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16
4062 ; GFX8-NEXT: s_min_i32 s7, s7, s8
4063 ; GFX8-NEXT: s_sub_i32 s3, s3, s7
4064 ; GFX8-NEXT: s_sext_i32_i16 s7, s11
4065 ; GFX8-NEXT: s_max_i32 s8, s7, s17
4066 ; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff
4067 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
4068 ; GFX8-NEXT: s_min_i32 s7, s7, s17
4069 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4070 ; GFX8-NEXT: s_sext_i32_i16 s9, s15
4071 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
4072 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4073 ; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000
4074 ; GFX8-NEXT: s_max_i32 s8, s8, s9
4075 ; GFX8-NEXT: s_or_b32 s0, s0, s4
4076 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
4077 ; GFX8-NEXT: s_sub_i32 s6, s10, s6
4078 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
4079 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
4080 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
4081 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4082 ; GFX8-NEXT: s_min_i32 s7, s8, s7
4083 ; GFX8-NEXT: s_or_b32 s1, s1, s4
4084 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
4085 ; GFX8-NEXT: s_sub_i32 s7, s11, s7
4086 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
4087 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4088 ; GFX8-NEXT: s_or_b32 s2, s2, s4
4089 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
4090 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
4091 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4092 ; GFX8-NEXT: s_or_b32 s3, s3, s4
4093 ; GFX8-NEXT: ; return to shader part epilog
4095 ; GFX9-LABEL: s_ssubsat_v8i16:
4097 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
4098 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
4099 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
4100 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
4101 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
4102 ; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp
4103 ; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp
4104 ; GFX9-NEXT: v_pk_sub_i16 v3, s3, v3 clamp
4105 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
4106 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
4107 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
4108 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
4109 ; GFX9-NEXT: ; return to shader part epilog
4111 ; GFX10PLUS-LABEL: s_ssubsat_v8i16:
4112 ; GFX10PLUS: ; %bb.0:
4113 ; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s4 clamp
4114 ; GFX10PLUS-NEXT: v_pk_sub_i16 v1, s1, s5 clamp
4115 ; GFX10PLUS-NEXT: v_pk_sub_i16 v2, s2, s6 clamp
4116 ; GFX10PLUS-NEXT: v_pk_sub_i16 v3, s3, s7 clamp
4117 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
4118 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
4119 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
4120 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
4121 ; GFX10PLUS-NEXT: ; return to shader part epilog
4122 %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
4123 %cast = bitcast <8 x i16> %result to <4 x i32>
; Signed saturating subtract on the non-power-of-two width i48, using the
; default calling convention (operands arrive in v[0:1] and v[2:3]).
; The GFX6 and GFX8 check blocks subtract first and use v_bfe_i32 ..., 0, 16
; sign extractions before the 64-bit compares. The GFX9, GFX10 and GFX11 blocks
; shift both operands left by 16, run the 64-bit overflow check and select, and
; shift the result back with v_ashrrev_i64.
; GFX10 and GFX11 have separate check blocks here; they differ in the carry-out
; operand of v_add_co_u32 and in the GFX11 v_dual_cndmask_b32.
; Autogenerated assertions: regenerate with utils/update_llc_test_checks.py.
4127 define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
4128 ; GFX6-LABEL: v_ssubsat_i48:
4130 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4131 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
4132 ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc
4133 ; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16
4134 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
4135 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
4136 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4137 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4138 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4139 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0
4140 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5
4141 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4142 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
4143 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
4144 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4146 ; GFX8-LABEL: v_ssubsat_i48:
4148 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4149 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
4150 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc
4151 ; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16
4152 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
4153 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
4154 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4155 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4156 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4157 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0
4158 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5
4159 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4160 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
4161 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
4162 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4164 ; GFX9-LABEL: v_ssubsat_i48:
4166 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4167 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4168 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
4169 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
4170 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
4171 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4172 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4173 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4174 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4175 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4176 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4177 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4178 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4179 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4181 ; GFX10-LABEL: v_ssubsat_i48:
4183 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4184 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4185 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
4186 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
4187 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4188 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
4189 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4190 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
4191 ; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
4192 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
4193 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4194 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4195 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4196 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4198 ; GFX11-LABEL: v_ssubsat_i48:
4200 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4201 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4202 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
4203 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
4204 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4205 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
4206 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4207 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
4208 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
4209 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
4210 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4211 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4212 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4213 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
; Signed saturating subtract on i48 with both operands passed inreg to an
; amdgpu_ps function (they arrive in s[0:1] and s[2:3]).
; The GFX6 and GFX8 check blocks sign-extend with s_bfe_i64 ..., 0x300000; the
; GFX9, GFX10 and GFX11 blocks shift left by 16 with s_lshl_b64 and shift the
; selected result back with v_ashrrev_i64. Every block does the final select
; with v_cndmask_b32 and moves the result to s0/s1 via v_readfirstlane_b32.
; Autogenerated assertions: regenerate with utils/update_llc_test_checks.py.
4217 define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
4218 ; GFX6-LABEL: s_ssubsat_i48:
4220 ; GFX6-NEXT: s_sub_u32 s4, s0, s2
4221 ; GFX6-NEXT: s_subb_u32 s3, s1, s3
4222 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4223 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
4224 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
4225 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
4226 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
4227 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4228 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4229 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31
4230 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15
4231 ; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000
4232 ; GFX6-NEXT: v_mov_b32_e32 v0, s5
4233 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
4234 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
4235 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
4236 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
4237 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4238 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4239 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
4240 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
4241 ; GFX6-NEXT: ; return to shader part epilog
4243 ; GFX8-LABEL: s_ssubsat_i48:
4245 ; GFX8-NEXT: s_sub_u32 s4, s0, s2
4246 ; GFX8-NEXT: s_subb_u32 s3, s1, s3
4247 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4248 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
4249 ; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
4250 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
4251 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
4252 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4253 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4254 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31
4255 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15
4256 ; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000
4257 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
4258 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
4259 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
4260 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
4261 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
4262 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4263 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4264 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
4265 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
4266 ; GFX8-NEXT: ; return to shader part epilog
4268 ; GFX9-LABEL: s_ssubsat_i48:
4270 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4271 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
4272 ; GFX9-NEXT: s_sub_u32 s4, s0, s2
4273 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
4274 ; GFX9-NEXT: s_subb_u32 s5, s1, s3
4275 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
4276 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4277 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4278 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31
4279 ; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
4280 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4281 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4282 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
4283 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
4284 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
4285 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4286 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4287 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4288 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
4289 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
4290 ; GFX9-NEXT: ; return to shader part epilog
4292 ; GFX10-LABEL: s_ssubsat_i48:
4294 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4295 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
4296 ; GFX10-NEXT: s_sub_u32 s4, s0, s2
4297 ; GFX10-NEXT: s_subb_u32 s5, s1, s3
4298 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
4299 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4300 ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
4301 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
4302 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31
4303 ; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
4304 ; GFX10-NEXT: s_xor_b32 s0, s1, s0
4305 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4306 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4307 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4308 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
4309 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
4310 ; GFX10-NEXT: ; return to shader part epilog
4312 ; GFX11-LABEL: s_ssubsat_i48:
4314 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4315 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
4316 ; GFX11-NEXT: s_sub_u32 s4, s0, s2
4317 ; GFX11-NEXT: s_subb_u32 s5, s1, s3
4318 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4319 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4320 ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
4321 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31
4322 ; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
4323 ; GFX11-NEXT: s_xor_b32 s0, s1, s0
4324 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4325 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4326 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4327 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
4328 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
4329 ; GFX11-NEXT: ; return to shader part epilog
4330 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
; Mixed-operand i48 signed saturating subtract: %lhs is passed inreg (seen in
; s[0:1] in the checks) and %rhs is a regular argument (seen in v[0:1]).
; The i48 result is zero-extended to i64 and returned as <2 x float>, so the
; outcome stays in vector registers and no v_readfirstlane_b32 is expected.
; Autogenerated assertions: regenerate with utils/update_llc_test_checks.py.
4334 define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
4335 ; GFX6-LABEL: ssubsat_i48_sv:
4337 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4338 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0
4339 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc
4340 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
4341 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4342 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
4343 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4344 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4345 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4346 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4347 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0
4348 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4349 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4350 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4351 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
4352 ; GFX6-NEXT: ; return to shader part epilog
4354 ; GFX8-LABEL: ssubsat_i48_sv:
4356 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4357 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
4358 ; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc
4359 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
4360 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4361 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
4362 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4363 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4364 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4365 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4366 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0
4367 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4368 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4369 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4370 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
4371 ; GFX8-NEXT: ; return to shader part epilog
4373 ; GFX9-LABEL: ssubsat_i48_sv:
4375 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4376 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4377 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
4378 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0
4379 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
4380 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4381 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4382 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4383 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4384 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4385 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4386 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4387 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4388 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
4389 ; GFX9-NEXT: ; return to shader part epilog
4391 ; GFX10-LABEL: ssubsat_i48_sv:
4393 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4394 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4395 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
4396 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4397 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4398 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4399 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
4400 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4401 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4402 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4403 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4404 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4405 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
4406 ; GFX10-NEXT: ; return to shader part epilog
4408 ; GFX11-LABEL: ssubsat_i48_sv:
4410 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4411 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4412 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
4413 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4414 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4415 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4416 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
4417 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4418 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4419 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4420 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4421 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
4422 ; GFX11-NEXT: ; return to shader part epilog
4423 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
; The zext below is what every check block ends with: v_and_b32 v1, 0xffff, v1
; clears the upper 16 bits of the high dword.
4424 %ext.result = zext i48 %result to i64
4425 %cast = bitcast i64 %ext.result to <2 x float>
4426 ret <2 x float> %cast
; Mirror of the scalar-lhs/vector-rhs case: here %lhs is a regular argument
; (seen in v[0:1] in the checks) and %rhs is passed inreg (seen in s[0:1]).
; The i48 result is zero-extended to i64 and returned as <2 x float>.
; With the scalar on the right-hand side the checks use the reversed-operand
; subtract forms (v_subrev_*) on most targets.
; Autogenerated assertions: regenerate with utils/update_llc_test_checks.py.
4429 define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
4430 ; GFX6-LABEL: ssubsat_i48_vs:
4432 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4433 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0
4434 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
4435 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
4436 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
4437 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4438 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4439 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4440 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4441 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4442 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0
4443 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4444 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4445 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4446 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
4447 ; GFX6-NEXT: ; return to shader part epilog
4449 ; GFX8-LABEL: ssubsat_i48_vs:
4451 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4452 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0
4453 ; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
4454 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
4455 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
4456 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4457 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4458 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4459 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4460 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4461 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0
4462 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4463 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4464 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4465 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
4466 ; GFX8-NEXT: ; return to shader part epilog
4468 ; GFX9-LABEL: ssubsat_i48_vs:
4470 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4471 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
4472 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
4473 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0
4474 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4475 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
4476 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0
4477 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4478 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4479 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4480 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4481 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4482 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4483 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
4484 ; GFX9-NEXT: ; return to shader part epilog
4486 ; GFX10-LABEL: ssubsat_i48_vs:
4488 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4489 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4490 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
4491 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4492 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
4493 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4494 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4495 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4496 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4497 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4498 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4499 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4500 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
4501 ; GFX10-NEXT: ; return to shader part epilog
4503 ; GFX11-LABEL: ssubsat_i48_vs:
4505 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4506 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4507 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
4508 ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4509 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
4510 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4511 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4512 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4513 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4514 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4515 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4516 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
4517 ; GFX11-NEXT: ; return to shader part epilog
4518 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
; The zext below is what every check block ends with: v_and_b32 v1, 0xffff, v1
; clears the upper 16 bits of the high dword.
4519 %ext.result = zext i48 %result to i64
4520 %cast = bitcast i64 %ext.result to <2 x float>
4521 ret <2 x float> %cast
; Signed saturating subtract on i64 with the default calling convention
; (operands arrive in v[0:1] and v[2:3]).
; Every check block follows the same shape: a 64-bit subtract with borrow, two
; v_cmp_lt_i64 compares whose results are XORed into the select mask, a
; replacement value built from v_ashrrev_i32 by 31 plus an add of 0x80000000,
; and a pair of v_cndmask_b32 selects (fused into v_dual_cndmask_b32 on GFX11).
; Autogenerated assertions: regenerate with utils/update_llc_test_checks.py.
4524 define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
4525 ; GFX6-LABEL: v_ssubsat_i64:
4527 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4528 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
4529 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
4530 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4531 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4532 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4533 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
4534 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4535 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4536 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4537 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4539 ; GFX8-LABEL: v_ssubsat_i64:
4541 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4542 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
4543 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
4544 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4545 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4546 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4547 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
4548 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4549 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4550 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4551 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4553 ; GFX9-LABEL: v_ssubsat_i64:
4555 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4556 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
4557 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
4558 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4559 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
4560 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4561 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4562 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4563 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4564 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4565 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4567 ; GFX10-LABEL: v_ssubsat_i64:
4569 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4570 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
4571 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4572 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3]
4573 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4574 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4575 ; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
4576 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
4577 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4578 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4579 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4581 ; GFX11-LABEL: v_ssubsat_i64:
4583 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4584 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
4585 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4586 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3]
4587 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4588 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4589 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
4590 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4591 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4592 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4593 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
; Scalar i64 signed saturating subtract: an amdgpu_ps function whose two i64
; operands are both marked inreg and whose value comes from llvm.ssub.sat.i64.
; In the expected output below the operands are read from s[0:1] and s[2:3],
; and the selected result is copied back to s0/s1 with v_readfirstlane_b32.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
4597 define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
4598 ; GFX6-LABEL: s_ssubsat_i64:
4600 ; GFX6-NEXT: s_sub_u32 s4, s0, s2
4601 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
4602 ; GFX6-NEXT: s_subb_u32 s5, s1, s3
4603 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
4604 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4605 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4606 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31
4607 ; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000
4608 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
4609 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
4610 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
4611 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
4612 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
4613 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4614 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4615 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
4616 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
4617 ; GFX6-NEXT: ; return to shader part epilog
4619 ; GFX8-LABEL: s_ssubsat_i64:
4621 ; GFX8-NEXT: s_sub_u32 s4, s0, s2
4622 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
4623 ; GFX8-NEXT: s_subb_u32 s5, s1, s3
4624 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
4625 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4626 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4627 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31
4628 ; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000
4629 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
4630 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
4631 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
4632 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
4633 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
4634 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4635 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4636 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
4637 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
4638 ; GFX8-NEXT: ; return to shader part epilog
4640 ; GFX9-LABEL: s_ssubsat_i64:
4642 ; GFX9-NEXT: s_sub_u32 s4, s0, s2
4643 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
4644 ; GFX9-NEXT: s_subb_u32 s5, s1, s3
4645 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
4646 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4647 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4648 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31
4649 ; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
4650 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4651 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4652 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
4653 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
4654 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
4655 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4656 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4657 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
4658 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
4659 ; GFX9-NEXT: ; return to shader part epilog
4661 ; GFX10-LABEL: s_ssubsat_i64:
4663 ; GFX10-NEXT: s_sub_u32 s4, s0, s2
4664 ; GFX10-NEXT: s_subb_u32 s5, s1, s3
4665 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
4666 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4667 ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
4668 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
4669 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31
4670 ; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
4671 ; GFX10-NEXT: s_xor_b32 s0, s1, s0
4672 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4673 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4674 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
4675 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
4676 ; GFX10-NEXT: ; return to shader part epilog
4678 ; GFX11-LABEL: s_ssubsat_i64:
4680 ; GFX11-NEXT: s_sub_u32 s4, s0, s2
4681 ; GFX11-NEXT: s_subb_u32 s5, s1, s3
4682 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4683 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4684 ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
4685 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31
4686 ; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
4687 ; GFX11-NEXT: s_xor_b32 s0, s1, s0
4688 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4689 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4690 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
4691 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
4692 ; GFX11-NEXT: ; return to shader part epilog
4693 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
; Mixed-operand i64 signed saturating subtract ("sv"): %lhs is marked inreg
; and %rhs is not. In the expected output below %lhs is read from s[0:1] and
; %rhs from v[0:1]. The i64 result of llvm.ssub.sat.i64 is bitcast to
; <2 x float> for the return value, and the expected code leaves it in v0/v1.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
4697 define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
4698 ; GFX6-LABEL: ssubsat_i64_sv:
4700 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4701 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0
4702 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
4703 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4704 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4705 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4706 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
4707 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4708 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4709 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4710 ; GFX6-NEXT: ; return to shader part epilog
4712 ; GFX8-LABEL: ssubsat_i64_sv:
4714 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4715 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
4716 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
4717 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4718 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4719 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4720 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
4721 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4722 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4723 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4724 ; GFX8-NEXT: ; return to shader part epilog
4726 ; GFX9-LABEL: ssubsat_i64_sv:
4728 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
4729 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0
4730 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
4731 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4732 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
4733 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4734 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4735 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4736 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4737 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4738 ; GFX9-NEXT: ; return to shader part epilog
4740 ; GFX10-LABEL: ssubsat_i64_sv:
4742 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
4743 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4744 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4745 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4746 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
4747 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4748 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4749 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4750 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4751 ; GFX10-NEXT: ; return to shader part epilog
4753 ; GFX11-LABEL: ssubsat_i64_sv:
4755 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
4756 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4757 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4758 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4759 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
4760 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4761 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4762 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4763 ; GFX11-NEXT: ; return to shader part epilog
4764 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4765 %cast = bitcast i64 %result to <2 x float>
4766 ret <2 x float> %cast
; Mixed-operand i64 signed saturating subtract ("vs"): the mirror image of
; ssubsat_i64_sv. Here %rhs is marked inreg and %lhs is not. In the expected
; output below %lhs is read from v[0:1] and %rhs from s[0:1]. The i64 result
; of llvm.ssub.sat.i64 is bitcast to <2 x float> for the return value, and the
; expected code leaves it in v0/v1.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
4769 define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
4770 ; GFX6-LABEL: ssubsat_i64_vs:
4772 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4773 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0
4774 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
4775 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4776 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4777 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4778 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
4779 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4780 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4781 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4782 ; GFX6-NEXT: ; return to shader part epilog
4784 ; GFX8-LABEL: ssubsat_i64_vs:
4786 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4787 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0
4788 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
4789 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4790 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4791 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4792 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
4793 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4794 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4795 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4796 ; GFX8-NEXT: ; return to shader part epilog
4798 ; GFX9-LABEL: ssubsat_i64_vs:
4800 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
4801 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0
4802 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4803 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4804 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
4805 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4806 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4807 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4808 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4809 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4810 ; GFX9-NEXT: ; return to shader part epilog
4812 ; GFX10-LABEL: ssubsat_i64_vs:
4814 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
4815 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4816 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
4817 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4818 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4819 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4820 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4821 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4822 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4823 ; GFX10-NEXT: ; return to shader part epilog
4825 ; GFX11-LABEL: ssubsat_i64_vs:
4827 ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
4828 ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4829 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
4830 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4831 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4832 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4833 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4834 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4835 ; GFX11-NEXT: ; return to shader part epilog
4836 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4837 %cast = bitcast i64 %result to <2 x float>
4838 ret <2 x float> %cast
; <2 x i64> signed saturating subtract with no inreg operands: the function
; returns the result of llvm.ssub.sat.v2i64 directly. In the expected output
; below %lhs is read from v[0:3] and %rhs from v[4:7], each 64-bit element
; gets its own subtract/compare/select sequence, the result is left in v[0:3],
; and the function returns via s_setpc_b64 s[30:31].
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
4841 define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
4842 ; GFX6-LABEL: v_ssubsat_v2i64:
4844 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4845 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
4846 ; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc
4847 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4848 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
4849 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9
4850 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
4851 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1
4852 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4853 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
4854 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
4855 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6
4856 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
4857 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4858 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
4859 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5
4860 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
4861 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4862 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
4863 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
4864 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4866 ; GFX8-LABEL: v_ssubsat_v2i64:
4868 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4869 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v4
4870 ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc
4871 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4872 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
4873 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9
4874 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
4875 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1
4876 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4877 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
4878 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
4879 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6
4880 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
4881 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4882 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
4883 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5
4884 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2
4885 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4886 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
4887 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
4888 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4890 ; GFX9-LABEL: v_ssubsat_v2i64:
4892 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4893 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4
4894 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
4895 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4896 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
4897 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9
4898 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
4899 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
4900 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4901 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
4902 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
4903 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6
4904 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
4905 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4906 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
4907 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
4908 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2
4909 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4910 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
4911 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
4912 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4914 ; GFX10-LABEL: v_ssubsat_v2i64:
4916 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4917 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
4918 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4919 ; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6
4920 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4921 ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
4922 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4923 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5]
4924 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11
4925 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
4926 ; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12
4927 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
4928 ; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4
4929 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
4930 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo
4931 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
4932 ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5
4933 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo
4934 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
4935 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4937 ; GFX11-LABEL: v_ssubsat_v2i64:
4939 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4940 ; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
4941 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4942 ; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6
4943 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4944 ; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9
4945 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4946 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[4:5]
4947 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11
4948 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
4949 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7]
4950 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12
4951 ; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4
4952 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4953 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
4954 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1
4955 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3
4956 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4957 %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4958 ret <2 x i64> %result
; <2 x i64> signed saturating subtract with both operands marked inreg in an
; amdgpu_ps function; the return value is the result of llvm.ssub.sat.v2i64.
; In the expected output below %lhs is read from s[0:3] and %rhs from s[4:7],
; each 64-bit element is handled by its own s_sub_u32/s_subb_u32 pair, and the
; four result dwords are copied back to s0..s3 with v_readfirstlane_b32.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
4961 define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
4962 ; GFX6-LABEL: s_ssubsat_v2i64:
4964 ; GFX6-NEXT: s_sub_u32 s8, s0, s4
4965 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
4966 ; GFX6-NEXT: s_subb_u32 s9, s1, s5
4967 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
4968 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4969 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
4970 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31
4971 ; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
4972 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
4973 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
4974 ; GFX6-NEXT: v_mov_b32_e32 v2, s8
4975 ; GFX6-NEXT: v_mov_b32_e32 v3, s9
4976 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
4977 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
4978 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
4979 ; GFX6-NEXT: s_sub_u32 s0, s2, s6
4980 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
4981 ; GFX6-NEXT: s_subb_u32 s1, s3, s7
4982 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
4983 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4984 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
4985 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31
4986 ; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
4987 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
4988 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
4989 ; GFX6-NEXT: v_mov_b32_e32 v4, s0
4990 ; GFX6-NEXT: v_mov_b32_e32 v5, s1
4991 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc
4992 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4993 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4994 ; GFX6-NEXT: v_readfirstlane_b32 s0, v2
4995 ; GFX6-NEXT: v_readfirstlane_b32 s1, v3
4996 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
4997 ; GFX6-NEXT: v_readfirstlane_b32 s3, v1
4998 ; GFX6-NEXT: ; return to shader part epilog
5000 ; GFX8-LABEL: s_ssubsat_v2i64:
5002 ; GFX8-NEXT: s_sub_u32 s8, s0, s4
5003 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
5004 ; GFX8-NEXT: s_subb_u32 s9, s1, s5
5005 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
5006 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5007 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
5008 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31
5009 ; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
5010 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
5011 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
5012 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
5013 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
5014 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
5015 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
5016 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
5017 ; GFX8-NEXT: s_sub_u32 s0, s2, s6
5018 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
5019 ; GFX8-NEXT: s_subb_u32 s1, s3, s7
5020 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
5021 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
5022 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
5023 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31
5024 ; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
5025 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
5026 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
5027 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
5028 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
5029 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc
5030 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
5031 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
5032 ; GFX8-NEXT: v_readfirstlane_b32 s0, v2
5033 ; GFX8-NEXT: v_readfirstlane_b32 s1, v3
5034 ; GFX8-NEXT: v_readfirstlane_b32 s2, v0
5035 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1
5036 ; GFX8-NEXT: ; return to shader part epilog
5038 ; GFX9-LABEL: s_ssubsat_v2i64:
5040 ; GFX9-NEXT: s_sub_u32 s8, s0, s4
5041 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5042 ; GFX9-NEXT: s_subb_u32 s9, s1, s5
5043 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5044 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5045 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
5046 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31
5047 ; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
5048 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5049 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5050 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
5051 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
5052 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
5053 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
5054 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
5055 ; GFX9-NEXT: s_sub_u32 s0, s2, s6
5056 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5057 ; GFX9-NEXT: s_subb_u32 s1, s3, s7
5058 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5059 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
5060 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
5061 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31
5062 ; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
5063 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5064 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5065 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
5066 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
5067 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc
5068 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
5069 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
5070 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
5071 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
5072 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0
5073 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1
5074 ; GFX9-NEXT: ; return to shader part epilog
5076 ; GFX10-LABEL: s_ssubsat_v2i64:
5078 ; GFX10-NEXT: s_sub_u32 s8, s0, s4
5079 ; GFX10-NEXT: s_subb_u32 s9, s1, s5
5080 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
5081 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5082 ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
5083 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31
5084 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
5085 ; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000
5086 ; GFX10-NEXT: s_xor_b32 s8, s1, s0
5087 ; GFX10-NEXT: s_sub_u32 s0, s2, s6
5088 ; GFX10-NEXT: s_subb_u32 s1, s3, s7
5089 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
5090 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5091 ; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0
5092 ; GFX10-NEXT: v_mov_b32_e32 v3, s1
5093 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
5094 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31
5095 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
5096 ; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
5097 ; GFX10-NEXT: s_xor_b32 s1, s3, s2
5098 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
5099 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
5100 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
5101 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
5102 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
5103 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
5104 ; GFX10-NEXT: ; return to shader part epilog
5106 ; GFX11-LABEL: s_ssubsat_v2i64:
5108 ; GFX11-NEXT: s_sub_u32 s8, s0, s4
5109 ; GFX11-NEXT: s_subb_u32 s9, s1, s5
5110 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
5111 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5112 ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
5113 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31
5114 ; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000
5115 ; GFX11-NEXT: s_xor_b32 s8, s1, s0
5116 ; GFX11-NEXT: s_sub_u32 s0, s2, s6
5117 ; GFX11-NEXT: s_subb_u32 s1, s3, s7
5118 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
5119 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5120 ; GFX11-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0
5121 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
5122 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31
5123 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
5124 ; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
5125 ; GFX11-NEXT: s_xor_b32 s1, s3, s2
5126 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
5127 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
5128 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
5129 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
5130 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2
5131 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
5132 ; GFX11-NEXT: ; return to shader part epilog
5133 %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
5134 ret <2 x i64> %result
; Scalar i128 signed saturating subtract: an amdgpu_ps function whose two i128
; operands are both marked inreg and whose value comes from
; llvm.ssub.sat.i128. In the expected output below %lhs is read from s[0:3]
; and %rhs from s[4:7]; the 128-bit difference is formed by one s_sub_u32
; followed by three s_subb_u32 into s[8:11]. The low and high 64-bit halves
; are compared separately and the two results merged with v_cndmask_b32, and
; the final four dwords are copied back to s0..s3 with v_readfirstlane_b32.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
5137 define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
5138 ; GFX6-LABEL: s_ssubsat_i128:
5140 ; GFX6-NEXT: s_sub_u32 s8, s0, s4
5141 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
5142 ; GFX6-NEXT: s_subb_u32 s9, s1, s5
5143 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
5144 ; GFX6-NEXT: s_subb_u32 s10, s2, s6
5145 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
5146 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
5147 ; GFX6-NEXT: s_subb_u32 s11, s3, s7
5148 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
5149 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5150 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
5151 ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
5152 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
5153 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
5154 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5155 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
5156 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
5157 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0
5158 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5159 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5160 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5161 ; GFX6-NEXT: s_ashr_i32 s0, s11, 31
5162 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5163 ; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
5164 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
5165 ; GFX6-NEXT: v_mov_b32_e32 v2, s8
5166 ; GFX6-NEXT: v_mov_b32_e32 v3, s9
5167 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5168 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
5169 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
5170 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
5171 ; GFX6-NEXT: v_mov_b32_e32 v4, s10
5172 ; GFX6-NEXT: v_mov_b32_e32 v5, s11
5173 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
5174 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
5175 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
5176 ; GFX6-NEXT: v_readfirstlane_b32 s1, v2
5177 ; GFX6-NEXT: v_readfirstlane_b32 s2, v1
5178 ; GFX6-NEXT: v_readfirstlane_b32 s3, v3
5179 ; GFX6-NEXT: ; return to shader part epilog
5181 ; GFX8-LABEL: s_ssubsat_i128:
5183 ; GFX8-NEXT: s_sub_u32 s8, s0, s4
5184 ; GFX8-NEXT: s_subb_u32 s9, s1, s5
5185 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
5186 ; GFX8-NEXT: s_subb_u32 s10, s2, s6
5187 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
5188 ; GFX8-NEXT: s_subb_u32 s11, s3, s7
5189 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
5190 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
5191 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
5192 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
5193 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
5194 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5195 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
5196 ; GFX8-NEXT: s_and_b32 s0, 1, s2
5197 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5198 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5199 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
5200 ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0
5201 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5202 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
5203 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
5204 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5205 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5206 ; GFX8-NEXT: s_and_b32 s0, 1, s2
5207 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5208 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5209 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
5210 ; GFX8-NEXT: s_ashr_i32 s0, s11, 31
5211 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
5212 ; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
5213 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
5214 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
5215 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
5216 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5217 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
5218 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
5219 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
5220 ; GFX8-NEXT: v_mov_b32_e32 v4, s10
5221 ; GFX8-NEXT: v_mov_b32_e32 v5, s11
5222 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
5223 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
5224 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
5225 ; GFX8-NEXT: v_readfirstlane_b32 s1, v2
5226 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1
5227 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
5228 ; GFX8-NEXT: ; return to shader part epilog
5230 ; GFX9-LABEL: s_ssubsat_i128:
5232 ; GFX9-NEXT: s_sub_u32 s8, s0, s4
5233 ; GFX9-NEXT: s_subb_u32 s9, s1, s5
5234 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5235 ; GFX9-NEXT: s_subb_u32 s10, s2, s6
5236 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
5237 ; GFX9-NEXT: s_subb_u32 s11, s3, s7
5238 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5239 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
5240 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5241 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
5242 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
5243 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5244 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
5245 ; GFX9-NEXT: s_and_b32 s0, 1, s2
5246 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5247 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5248 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
5249 ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0
5250 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5251 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
5252 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
5253 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5254 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5255 ; GFX9-NEXT: s_and_b32 s0, 1, s2
5256 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5257 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5258 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
5259 ; GFX9-NEXT: s_ashr_i32 s0, s11, 31
5260 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
5261 ; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
5262 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
5263 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
5264 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
5265 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5266 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
5267 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
5268 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5269 ; GFX9-NEXT: v_mov_b32_e32 v4, s10
5270 ; GFX9-NEXT: v_mov_b32_e32 v5, s11
5271 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
5272 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
5273 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
5274 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
5275 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
5276 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
5277 ; GFX9-NEXT: ; return to shader part epilog
5279 ; GFX10-LABEL: s_ssubsat_i128:
5281 ; GFX10-NEXT: s_sub_u32 s8, s0, s4
5282 ; GFX10-NEXT: s_subb_u32 s9, s1, s5
5283 ; GFX10-NEXT: s_subb_u32 s10, s2, s6
5284 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
5285 ; GFX10-NEXT: s_subb_u32 s11, s3, s7
5286 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
5287 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0
5288 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5289 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
5290 ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0
5291 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5292 ; GFX10-NEXT: s_and_b32 s0, 1, s12
5293 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
5294 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
5295 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
5296 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5297 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
5298 ; GFX10-NEXT: s_ashr_i32 s0, s11, 31
5299 ; GFX10-NEXT: s_and_b32 s1, 1, s1
5300 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
5301 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5302 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
5303 ; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
5304 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5305 ; GFX10-NEXT: v_mov_b32_e32 v2, s9
5306 ; GFX10-NEXT: v_mov_b32_e32 v3, s11
5307 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
5308 ; GFX10-NEXT: v_mov_b32_e32 v1, s8
5309 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
5310 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5311 ; GFX10-NEXT: v_mov_b32_e32 v0, s10
5312 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5313 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5314 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5315 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5316 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
5317 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
5318 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0
5319 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
5320 ; GFX10-NEXT: ; return to shader part epilog
5322 ; GFX11-LABEL: s_ssubsat_i128:
5324 ; GFX11-NEXT: s_sub_u32 s8, s0, s4
5325 ; GFX11-NEXT: s_subb_u32 s9, s1, s5
5326 ; GFX11-NEXT: s_subb_u32 s10, s2, s6
5327 ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
5328 ; GFX11-NEXT: s_subb_u32 s11, s3, s7
5329 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
5330 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0
5331 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5332 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
5333 ; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0
5334 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5335 ; GFX11-NEXT: s_and_b32 s0, 1, s12
5336 ; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
5337 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
5338 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
5339 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5340 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5341 ; GFX11-NEXT: s_ashr_i32 s0, s11, 31
5342 ; GFX11-NEXT: s_and_b32 s1, 1, s1
5343 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
5344 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5345 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
5346 ; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
5347 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
5348 ; GFX11-NEXT: v_mov_b32_e32 v3, s11
5349 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5350 ; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0
5351 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5352 ; GFX11-NEXT: v_mov_b32_e32 v0, s10
5353 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5354 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5355 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5356 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5357 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
5358 ; GFX11-NEXT: v_readfirstlane_b32 s1, v2
5359 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0
5360 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
5361 ; GFX11-NEXT: ; return to shader part epilog
5362 %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
; i128 signed saturating subtract (llvm.ssub.sat.i128) in an amdgpu_ps function
; with mixed operands: %lhs is marked inreg and %rhs is not. The expected code
; below reads %lhs from s0-s3 and %rhs from v0-v3, and every target's sequence
; finishes by writing the four result dwords to v0-v3.
;
; Shape of the expected sequence on all targets:
;   * a 4-dword subtract-with-borrow chain produces the raw difference in v4-v7;
;   * "difference < lhs" and "rhs > 0" are each built from a u64 compare of the
;     low halves and an i64 compare of the high halves, merged on high-half
;     equality, and the two flags are xor'ed to form the overflow bit;
;   * on overflow the low three dwords become (v7 arithmetic-shifted right by
;     31) and the top dword becomes that value plus 0x80000000.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
5366 define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
5367 ; GFX6-LABEL: ssubsat_i128_sv:
5369 ; GFX6-NEXT: v_mov_b32_e32 v5, s1
5370 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s0, v0
5371 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
5372 ; GFX6-NEXT: v_mov_b32_e32 v6, s2
5373 ; GFX6-NEXT: v_mov_b32_e32 v7, s3
5374 ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
5375 ; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
5376 ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5377 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
5378 ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5379 ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
5380 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5381 ; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
5382 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5383 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5384 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5385 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5386 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5387 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5388 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5389 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8
5390 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
5391 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
5392 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5393 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5394 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5395 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5396 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5397 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5398 ; GFX6-NEXT: ; return to shader part epilog
5400 ; GFX8-LABEL: ssubsat_i128_sv:
5402 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
5403 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s0, v0
5404 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
5405 ; GFX8-NEXT: v_mov_b32_e32 v6, s2
5406 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
5407 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
5408 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
5409 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5410 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
5411 ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5412 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
5413 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5414 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
5415 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5416 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5417 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5418 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5419 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5420 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5421 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5422 ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8
5423 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
5424 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
5425 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
5426 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5427 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5428 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5429 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5430 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5431 ; GFX8-NEXT: ; return to shader part epilog
5433 ; GFX9-LABEL: ssubsat_i128_sv:
5435 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
5436 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s0, v0
5437 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
5438 ; GFX9-NEXT: v_mov_b32_e32 v6, s2
5439 ; GFX9-NEXT: v_mov_b32_e32 v7, s3
5440 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v2, vcc
5441 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
5442 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5443 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
5444 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5445 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
5446 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5447 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
5448 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5449 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5450 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5451 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5452 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5453 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5454 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5455 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
5456 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
5457 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
5458 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
5459 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5460 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5461 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5462 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5463 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5464 ; GFX9-NEXT: ; return to shader part epilog
5466 ; GFX10-LABEL: ssubsat_i128_sv:
5468 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0
5469 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5470 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5471 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5472 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
5473 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5474 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
5475 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
5476 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
5477 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5478 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
5479 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5480 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7]
5481 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
5482 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5483 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5484 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5485 ; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
5486 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8
5487 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
5488 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5489 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5490 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5491 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
5492 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
5493 ; GFX10-NEXT: ; return to shader part epilog
5495 ; GFX11-LABEL: ssubsat_i128_sv:
5497 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0
5498 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5499 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5500 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5501 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
5502 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5503 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
5504 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
5505 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
5506 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5507 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
5508 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5509 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7]
5510 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
5511 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5512 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5513 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5514 ; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
5515 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8
5516 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
5517 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5518 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5519 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5520 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
5521 ; GFX11-NEXT: ; return to shader part epilog
5522 %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
; NOTE(review): returning the value as <4 x float> presumably keeps it in
; VGPRs (the expected code above contains no v_readfirstlane copies) - confirm.
5523 %cast = bitcast i128 %result to <4 x float>
5524 ret <4 x float> %cast
; i128 signed saturating subtract (llvm.ssub.sat.i128) in an amdgpu_ps function
; with mixed operands, mirroring the operand placement the other way round:
; %rhs is marked inreg and %lhs is not. The expected code below reads %lhs from
; v0-v3 and %rhs from s0-s3, and every target's sequence finishes by writing
; the four result dwords to v0-v3.
;
; Shape of the expected sequence on all targets:
;   * a 4-dword subtract-with-borrow chain produces the raw difference in v4-v7;
;   * "difference < lhs" is computed with vector compares, while the "rhs > 0"
;     test operates on the scalar register pairs s[0:1] and s[2:3] (the
;     high-half equality test is a vector compare on the first target and an
;     s_cmp_eq_u64 / s_cselect_b32 pair on the others);
;   * the two flags are xor'ed to form the overflow bit;
;   * on overflow the low three dwords become (v7 arithmetic-shifted right by
;     31) and the top dword becomes that value plus 0x80000000.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
5527 define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
5528 ; GFX6-LABEL: ssubsat_i128_vs:
5530 ; GFX6-NEXT: v_mov_b32_e32 v5, s1
5531 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v0
5532 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc
5533 ; GFX6-NEXT: v_mov_b32_e32 v6, s2
5534 ; GFX6-NEXT: v_mov_b32_e32 v7, s3
5535 ; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
5536 ; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
5537 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5538 ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5539 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5540 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5541 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5542 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5543 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5544 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5545 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5546 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0
5547 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5548 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5549 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5550 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5551 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
5552 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
5553 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5554 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5555 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5556 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5557 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5558 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5559 ; GFX6-NEXT: ; return to shader part epilog
5561 ; GFX8-LABEL: ssubsat_i128_vs:
5563 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
5564 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s0, v0
5565 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc
5566 ; GFX8-NEXT: v_mov_b32_e32 v6, s2
5567 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
5568 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
5569 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
5570 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5571 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5572 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5573 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5574 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0
5575 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5576 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5577 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0
5578 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5579 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5580 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5581 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5582 ; GFX8-NEXT: s_and_b32 s0, 1, s4
5583 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5584 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5585 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
5586 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5587 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
5588 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
5589 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
5590 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5591 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5592 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5593 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5594 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5595 ; GFX8-NEXT: ; return to shader part epilog
5597 ; GFX9-LABEL: ssubsat_i128_vs:
5599 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
5600 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s0, v0
5601 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
5602 ; GFX9-NEXT: v_mov_b32_e32 v6, s2
5603 ; GFX9-NEXT: v_mov_b32_e32 v7, s3
5604 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v6, vcc
5605 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
5606 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5607 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5608 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5609 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5610 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0
5611 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5612 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5613 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0
5614 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5615 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5616 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5617 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5618 ; GFX9-NEXT: s_and_b32 s0, 1, s4
5619 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5620 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5621 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
5622 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5623 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
5624 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
5625 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
5626 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5627 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5628 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5629 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5630 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5631 ; GFX9-NEXT: ; return to shader part epilog
5633 ; GFX10-LABEL: ssubsat_i128_vs:
5635 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0
5636 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5637 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5638 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5639 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5640 ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
5641 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
5642 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0
5643 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5644 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5645 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
5646 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
5647 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5648 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5649 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
5650 ; GFX10-NEXT: s_and_b32 s0, 1, s4
5651 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5652 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5653 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5654 ; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
5655 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
5656 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
5657 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
5658 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5659 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5660 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5661 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
5662 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
5663 ; GFX10-NEXT: ; return to shader part epilog
5665 ; GFX11-LABEL: ssubsat_i128_vs:
5667 ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0
5668 ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5669 ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5670 ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5671 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5672 ; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
5673 ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
5674 ; GFX11-NEXT: s_cselect_b32 s4, 1, 0
5675 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5676 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5677 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
5678 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
5679 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5680 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5681 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5682 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
5683 ; GFX11-NEXT: s_and_b32 s0, 1, s4
5684 ; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
5685 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5686 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5687 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
5688 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5689 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
5690 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5691 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5692 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5693 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
5694 ; GFX11-NEXT: ; return to shader part epilog
5695 %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
; NOTE(review): returning the value as <4 x float> presumably keeps it in
; VGPRs (the expected code above contains no v_readfirstlane copies) - confirm.
5696 %cast = bitcast i128 %result to <4 x float>
5697 ret <4 x float> %cast
; <2 x i128> signed saturating subtract (llvm.ssub.sat.v2i128) in a function
; with no explicit calling convention and no inreg arguments. The expected
; code below takes %lhs element 0 from v0-v3, %lhs element 1 from v4-v7,
; %rhs element 0 from v8-v11 and %rhs element 1 from v12-v15, writes the
; result to v0-v7, and returns with s_setpc_b64 s[30:31].
;
; Each element is handled with the same pattern:
;   * a 4-dword subtract-with-borrow chain produces the raw difference;
;   * "difference < lhs" and "rhs > 0" are each built from a u64 compare of the
;     low halves and an i64 compare of the high halves, merged on high-half
;     equality, and the two flags are xor'ed to form the overflow bit;
;   * on overflow the low three dwords become (top difference dword
;     arithmetic-shifted right by 31) and the top dword becomes that value
;     plus 0x80000000.
; On the first three targets the two elements are processed back to back; on
; the last two the instruction streams of both elements are interleaved and
; the second element's final selects use a scalar register as the condition.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
5700 define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
5701 ; GFX6-LABEL: v_ssubsat_v2i128:
5703 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5704 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v0, v8
5705 ; GFX6-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc
5706 ; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc
5707 ; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc
5708 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5709 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5710 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5711 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5712 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5713 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5714 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5715 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5716 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5717 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5718 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5719 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5720 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5721 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v19
5722 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
5723 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
5724 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5725 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5726 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
5727 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
5728 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
5729 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
5730 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12
5731 ; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
5732 ; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
5733 ; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
5734 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5735 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
5736 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5737 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5738 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5739 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
5740 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5741 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5742 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5743 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
5744 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5745 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
5746 ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4
5747 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11
5748 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6
5749 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
5750 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
5751 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
5752 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
5753 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
5754 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc
5755 ; GFX6-NEXT: s_setpc_b64 s[30:31]
5757 ; GFX8-LABEL: v_ssubsat_v2i128:
5759 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5760 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v0, v8
5761 ; GFX8-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc
5762 ; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc
5763 ; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc
5764 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5765 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5766 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5767 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5768 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5769 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5770 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5771 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5772 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5773 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5774 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5775 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5776 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
5777 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v19
5778 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
5779 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
5780 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
5781 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5782 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
5783 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
5784 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
5785 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
5786 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12
5787 ; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
5788 ; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
5789 ; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
5790 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5791 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
5792 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5793 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5794 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5795 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
5796 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5797 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5798 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5799 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
5800 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5801 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
5802 ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4
5803 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11
5804 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6
5805 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
5806 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
5807 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
5808 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
5809 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
5810 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc
5811 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5813 ; GFX9-LABEL: v_ssubsat_v2i128:
5815 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5816 ; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v0, v8
5817 ; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v1, v9, vcc
5818 ; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc
5819 ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc
5820 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5821 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5822 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5823 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5824 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5825 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5826 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5827 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5828 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5829 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5830 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5831 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
5832 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
5833 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19
5834 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
5835 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
5836 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
5837 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5838 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
5839 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
5840 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
5841 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
5842 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12
5843 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc
5844 ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc
5845 ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc
5846 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5847 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
5848 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5849 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5850 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5851 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
5852 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5853 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5854 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5855 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
5856 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5857 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
5858 ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
5859 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11
5860 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6
5861 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
5862 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
5863 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
5864 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
5865 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
5866 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc
5867 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5869 ; GFX10-LABEL: v_ssubsat_v2i128:
5871 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5872 ; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8
5873 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
5874 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
5875 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
5876 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
5877 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5878 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
5879 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5880 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
5881 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5882 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
5883 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5884 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11]
5885 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5886 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12
5887 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
5888 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo
5889 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo
5890 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5891 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
5892 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5]
5893 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
5894 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5895 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7]
5896 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
5897 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5898 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13]
5899 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5900 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15]
5901 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
5902 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7]
5903 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21
5904 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5905 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5906 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19
5907 ; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6
5908 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
5909 ; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3
5910 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5911 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
5912 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
5913 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo
5914 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v1
5915 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
5916 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo
5917 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5
5918 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4
5919 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4
5920 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4
5921 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v21, v7, s4
5922 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5924 ; GFX11-LABEL: v_ssubsat_v2i128:
5926 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5927 ; GFX11-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8
5928 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
5929 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
5930 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
5931 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
5932 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5933 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
5934 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5935 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
5936 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5937 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
5938 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5939 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11]
5940 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5941 ; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12
5942 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
5943 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo
5944 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo
5945 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5946 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
5947 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5]
5948 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5949 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5950 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7]
5951 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5952 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13]
5953 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5954 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15]
5955 ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
5956 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7]
5957 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21
5958 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5959 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5960 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19
5961 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
5962 ; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6
5963 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
5964 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5965 ; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3
5966 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
5967 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
5968 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1
5969 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
5970 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo
5971 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
5972 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0
5973 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0
5974 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0
5975 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, v7, s0
5976 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5977 %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
5978 ret <2 x i128> %result
; Signed saturating subtract of <2 x i128> in an amdgpu_ps function whose
; operands are both marked inreg. The IR body is a single call to
; llvm.ssub.sat.v2i128 followed by a ret of its result.
; The per-target assertion lines below are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5981 define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
5982 ; GFX6-LABEL: s_ssubsat_v2i128:
5984 ; GFX6-NEXT: s_sub_u32 s16, s0, s8
5985 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
5986 ; GFX6-NEXT: s_subb_u32 s17, s1, s9
5987 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
5988 ; GFX6-NEXT: s_subb_u32 s18, s2, s10
5989 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
5990 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
5991 ; GFX6-NEXT: s_subb_u32 s19, s3, s11
5992 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
5993 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5994 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
5995 ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
5996 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
5997 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1]
5998 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5999 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
6000 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
6001 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0
6002 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
6003 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
6004 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
6005 ; GFX6-NEXT: s_ashr_i32 s0, s19, 31
6006 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
6007 ; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
6008 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
6009 ; GFX6-NEXT: v_mov_b32_e32 v2, s16
6010 ; GFX6-NEXT: v_mov_b32_e32 v3, s17
6011 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6012 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
6013 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
6014 ; GFX6-NEXT: v_mov_b32_e32 v0, s1
6015 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
6016 ; GFX6-NEXT: v_mov_b32_e32 v3, s19
6017 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
6018 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
6019 ; GFX6-NEXT: s_sub_u32 s0, s4, s12
6020 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
6021 ; GFX6-NEXT: s_subb_u32 s1, s5, s13
6022 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
6023 ; GFX6-NEXT: s_subb_u32 s2, s6, s14
6024 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
6025 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
6026 ; GFX6-NEXT: s_subb_u32 s3, s7, s15
6027 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
6028 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6029 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
6030 ; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
6031 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
6032 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
6033 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6034 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
6035 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
6036 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0
6037 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
6038 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
6039 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
6040 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31
6041 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
6042 ; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
6043 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
6044 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
6045 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
6046 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6047 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
6048 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
6049 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
6050 ; GFX6-NEXT: v_mov_b32_e32 v8, s2
6051 ; GFX6-NEXT: v_mov_b32_e32 v9, s3
6052 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
6053 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
6054 ; GFX6-NEXT: v_readfirstlane_b32 s0, v4
6055 ; GFX6-NEXT: v_readfirstlane_b32 s1, v5
6056 ; GFX6-NEXT: v_readfirstlane_b32 s2, v6
6057 ; GFX6-NEXT: v_readfirstlane_b32 s3, v7
6058 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
6059 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2
6060 ; GFX6-NEXT: v_readfirstlane_b32 s6, v1
6061 ; GFX6-NEXT: v_readfirstlane_b32 s7, v3
6062 ; GFX6-NEXT: ; return to shader part epilog
6064 ; GFX8-LABEL: s_ssubsat_v2i128:
6066 ; GFX8-NEXT: s_sub_u32 s16, s0, s8
6067 ; GFX8-NEXT: s_subb_u32 s17, s1, s9
6068 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
6069 ; GFX8-NEXT: s_subb_u32 s18, s2, s10
6070 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
6071 ; GFX8-NEXT: s_subb_u32 s19, s3, s11
6072 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
6073 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
6074 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
6075 ; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
6076 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
6077 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6078 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
6079 ; GFX8-NEXT: s_and_b32 s0, 1, s2
6080 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6081 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
6082 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
6083 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0
6084 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6085 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
6086 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
6087 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6088 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
6089 ; GFX8-NEXT: s_and_b32 s0, 1, s2
6090 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
6091 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
6092 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
6093 ; GFX8-NEXT: s_ashr_i32 s0, s19, 31
6094 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
6095 ; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
6096 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
6097 ; GFX8-NEXT: v_mov_b32_e32 v2, s16
6098 ; GFX8-NEXT: v_mov_b32_e32 v3, s17
6099 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6100 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
6101 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
6102 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
6103 ; GFX8-NEXT: v_mov_b32_e32 v2, s18
6104 ; GFX8-NEXT: v_mov_b32_e32 v3, s19
6105 ; GFX8-NEXT: s_sub_u32 s0, s4, s12
6106 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
6107 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
6108 ; GFX8-NEXT: s_subb_u32 s1, s5, s13
6109 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
6110 ; GFX8-NEXT: s_subb_u32 s2, s6, s14
6111 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
6112 ; GFX8-NEXT: s_subb_u32 s3, s7, s15
6113 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
6114 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
6115 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
6116 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6117 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0
6118 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6119 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
6120 ; GFX8-NEXT: s_and_b32 s4, 1, s6
6121 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6122 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
6123 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
6124 ; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0
6125 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6126 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
6127 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0
6128 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6129 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
6130 ; GFX8-NEXT: s_and_b32 s4, 1, s6
6131 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
6132 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
6133 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
6134 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31
6135 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
6136 ; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
6137 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
6138 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
6139 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
6140 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6141 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
6142 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
6143 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
6144 ; GFX8-NEXT: v_mov_b32_e32 v8, s2
6145 ; GFX8-NEXT: v_mov_b32_e32 v9, s3
6146 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
6147 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
6148 ; GFX8-NEXT: v_readfirstlane_b32 s0, v4
6149 ; GFX8-NEXT: v_readfirstlane_b32 s1, v5
6150 ; GFX8-NEXT: v_readfirstlane_b32 s2, v6
6151 ; GFX8-NEXT: v_readfirstlane_b32 s3, v7
6152 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0
6153 ; GFX8-NEXT: v_readfirstlane_b32 s5, v2
6154 ; GFX8-NEXT: v_readfirstlane_b32 s6, v1
6155 ; GFX8-NEXT: v_readfirstlane_b32 s7, v3
6156 ; GFX8-NEXT: ; return to shader part epilog
6158 ; GFX9-LABEL: s_ssubsat_v2i128:
6160 ; GFX9-NEXT: s_sub_u32 s16, s0, s8
6161 ; GFX9-NEXT: s_subb_u32 s17, s1, s9
6162 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
6163 ; GFX9-NEXT: s_subb_u32 s18, s2, s10
6164 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
6165 ; GFX9-NEXT: s_subb_u32 s19, s3, s11
6166 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6167 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
6168 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
6169 ; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
6170 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
6171 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6172 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
6173 ; GFX9-NEXT: s_and_b32 s0, 1, s2
6174 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6175 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
6176 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
6177 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
6178 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6179 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
6180 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
6181 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6182 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
6183 ; GFX9-NEXT: s_and_b32 s0, 1, s2
6184 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
6185 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
6186 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
6187 ; GFX9-NEXT: s_ashr_i32 s0, s19, 31
6188 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
6189 ; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
6190 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
6191 ; GFX9-NEXT: v_mov_b32_e32 v2, s16
6192 ; GFX9-NEXT: v_mov_b32_e32 v3, s17
6193 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6194 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
6195 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
6196 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
6197 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
6198 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
6199 ; GFX9-NEXT: s_sub_u32 s0, s4, s12
6200 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
6201 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
6202 ; GFX9-NEXT: s_subb_u32 s1, s5, s13
6203 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
6204 ; GFX9-NEXT: s_subb_u32 s2, s6, s14
6205 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
6206 ; GFX9-NEXT: s_subb_u32 s3, s7, s15
6207 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
6208 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
6209 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
6210 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6211 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0
6212 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6213 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
6214 ; GFX9-NEXT: s_and_b32 s4, 1, s6
6215 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6216 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
6217 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
6218 ; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0
6219 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6220 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
6221 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0
6222 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6223 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
6224 ; GFX9-NEXT: s_and_b32 s4, 1, s6
6225 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
6226 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
6227 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
6228 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
6229 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
6230 ; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
6231 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6232 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
6233 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
6234 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6235 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
6236 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
6237 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
6238 ; GFX9-NEXT: v_mov_b32_e32 v8, s2
6239 ; GFX9-NEXT: v_mov_b32_e32 v9, s3
6240 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
6241 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
6242 ; GFX9-NEXT: v_readfirstlane_b32 s0, v4
6243 ; GFX9-NEXT: v_readfirstlane_b32 s1, v5
6244 ; GFX9-NEXT: v_readfirstlane_b32 s2, v6
6245 ; GFX9-NEXT: v_readfirstlane_b32 s3, v7
6246 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0
6247 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2
6248 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1
6249 ; GFX9-NEXT: v_readfirstlane_b32 s7, v3
6250 ; GFX9-NEXT: ; return to shader part epilog
6252 ; GFX10-LABEL: s_ssubsat_v2i128:
6254 ; GFX10-NEXT: s_sub_u32 s18, s0, s8
6255 ; GFX10-NEXT: s_subb_u32 s19, s1, s9
6256 ; GFX10-NEXT: s_subb_u32 s16, s2, s10
6257 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
6258 ; GFX10-NEXT: s_subb_u32 s17, s3, s11
6259 ; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
6260 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0
6261 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
6262 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
6263 ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
6264 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
6265 ; GFX10-NEXT: s_and_b32 s0, 1, s20
6266 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
6267 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
6268 ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
6269 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
6270 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
6271 ; GFX10-NEXT: s_ashr_i32 s8, s17, 31
6272 ; GFX10-NEXT: s_and_b32 s1, 1, s1
6273 ; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000
6274 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
6275 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6276 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
6277 ; GFX10-NEXT: s_sub_u32 s0, s4, s12
6278 ; GFX10-NEXT: s_subb_u32 s1, s5, s13
6279 ; GFX10-NEXT: s_subb_u32 s2, s6, s14
6280 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6281 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
6282 ; GFX10-NEXT: s_subb_u32 s3, s7, s15
6283 ; GFX10-NEXT: v_mov_b32_e32 v5, s0
6284 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6285 ; GFX10-NEXT: v_mov_b32_e32 v6, s1
6286 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
6287 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
6288 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6289 ; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0
6290 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0
6291 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
6292 ; GFX10-NEXT: v_mov_b32_e32 v7, s3
6293 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
6294 ; GFX10-NEXT: s_and_b32 s4, 1, s10
6295 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
6296 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
6297 ; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0
6298 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
6299 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0
6300 ; GFX10-NEXT: s_ashr_i32 s4, s3, 31
6301 ; GFX10-NEXT: s_and_b32 s5, 1, s5
6302 ; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
6303 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
6304 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6305 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
6306 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo
6307 ; GFX10-NEXT: v_mov_b32_e32 v3, s18
6308 ; GFX10-NEXT: v_mov_b32_e32 v4, s19
6309 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
6310 ; GFX10-NEXT: v_mov_b32_e32 v0, s16
6311 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
6312 ; GFX10-NEXT: v_mov_b32_e32 v2, s17
6313 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo
6314 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
6315 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
6316 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo
6317 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
6318 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4
6319 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
6320 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
6321 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0
6322 ; GFX10-NEXT: v_readfirstlane_b32 s3, v2
6323 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6324 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6325 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6326 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6327 ; GFX10-NEXT: v_readfirstlane_b32 s0, v3
6328 ; GFX10-NEXT: v_readfirstlane_b32 s4, v5
6329 ; GFX10-NEXT: v_readfirstlane_b32 s5, v6
6330 ; GFX10-NEXT: v_readfirstlane_b32 s6, v1
6331 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7
6332 ; GFX10-NEXT: ; return to shader part epilog
6334 ; GFX11-LABEL: s_ssubsat_v2i128:
6336 ; GFX11-NEXT: s_sub_u32 s16, s0, s8
6337 ; GFX11-NEXT: s_subb_u32 s17, s1, s9
6338 ; GFX11-NEXT: s_subb_u32 s18, s2, s10
6339 ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
6340 ; GFX11-NEXT: s_subb_u32 s19, s3, s11
6341 ; GFX11-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
6342 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0
6343 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
6344 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
6345 ; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
6346 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
6347 ; GFX11-NEXT: s_and_b32 s0, 1, s20
6348 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
6349 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
6350 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
6351 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
6352 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0
6353 ; GFX11-NEXT: s_ashr_i32 s8, s19, 31
6354 ; GFX11-NEXT: s_and_b32 s1, 1, s1
6355 ; GFX11-NEXT: s_add_u32 s9, s8, 0x80000000
6356 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
6357 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6358 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
6359 ; GFX11-NEXT: s_sub_u32 s0, s4, s12
6360 ; GFX11-NEXT: s_subb_u32 s1, s5, s13
6361 ; GFX11-NEXT: s_subb_u32 s2, s6, s14
6362 ; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6363 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
6364 ; GFX11-NEXT: s_subb_u32 s3, s7, s15
6365 ; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
6366 ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6367 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
6368 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
6369 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6370 ; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0
6371 ; GFX11-NEXT: s_cselect_b32 s10, 1, 0
6372 ; GFX11-NEXT: v_mov_b32_e32 v5, s0
6373 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
6374 ; GFX11-NEXT: s_and_b32 s4, 1, s10
6375 ; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0
6376 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
6377 ; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0
6378 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
6379 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0
6380 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
6381 ; GFX11-NEXT: s_and_b32 s5, 1, s5
6382 ; GFX11-NEXT: s_ashr_i32 s4, s3, 31
6383 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6384 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
6385 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
6386 ; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
6387 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16
6388 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
6389 ; GFX11-NEXT: v_mov_b32_e32 v0, s18
6390 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
6391 ; GFX11-NEXT: v_mov_b32_e32 v4, s17
6392 ; GFX11-NEXT: v_mov_b32_e32 v2, s19
6393 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo
6394 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo
6395 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
6396 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
6397 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
6398 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
6399 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
6400 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
6401 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0
6402 ; GFX11-NEXT: v_readfirstlane_b32 s3, v2
6403 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6404 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6405 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6406 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6407 ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
6408 ; GFX11-NEXT: v_readfirstlane_b32 s4, v5
6409 ; GFX11-NEXT: v_readfirstlane_b32 s5, v6
6410 ; GFX11-NEXT: v_readfirstlane_b32 s6, v1
6411 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7
6412 ; GFX11-NEXT: ; return to shader part epilog
6413 %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
6414 ret <2 x i128> %result
; Declarations of the llvm.ssub.sat intrinsic overloads, grouped by element
; width (i7/i8, i16, i24, i32, i48, i64, i128). Each one carries attribute
; group #0.
6417 declare i7 @llvm.ssub.sat.i7(i7, i7) #0
6418 declare i8 @llvm.ssub.sat.i8(i8, i8) #0
6419 declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) #0
6420 declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) #0
6422 declare i16 @llvm.ssub.sat.i16(i16, i16) #0
6423 declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) #0
6424 declare <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16>, <3 x i16>) #0
6425 declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) #0
6426 declare <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16>, <5 x i16>) #0
6427 declare <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16>, <6 x i16>) #0
6428 declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) #0
6430 declare i24 @llvm.ssub.sat.i24(i24, i24) #0
6432 declare i32 @llvm.ssub.sat.i32(i32, i32) #0
6433 declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) #0
6434 declare <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32>, <3 x i32>) #0
6435 declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) #0
6436 declare <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32>, <5 x i32>) #0
6437 declare <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32>, <16 x i32>) #0
6439 declare i48 @llvm.ssub.sat.i48(i48, i48) #0
6441 declare i64 @llvm.ssub.sat.i64(i64, i64) #0
6442 declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) #0
6444 declare i128 @llvm.ssub.sat.i128(i128, i128) #0
6445 declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>) #0
; Attribute group referenced by every intrinsic declaration above.
6447 attributes #0 = { nounwind readnone speculatable willreturn }