1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; v_saddsat_i7 - signed saturating add of two i7 values through
; @llvm.sadd.sat.i7. The recorded checks use v0/v1 as operands and return
; through s_setpc_b64.
;
; What the recorded output looks like for each check prefix:
;   * GFX6 prefix - both operands are shifted left by 25 so the i7 value sits
;     in the top bits of a 32-bit register; the right-hand operand is clamped
;     with max/min against bounds derived from 0x80000000 and 0x7fffffff,
;     added, and the sum is shifted back with an arithmetic right shift.
;   * GFX8 prefix - the same sequence in 16-bit instructions, shift amount 9,
;     bounds derived from 0x8000 and 0x7fff.
;   * GFX9 and GFX10PLUS prefixes - one 16-bit add carrying the clamp
;     modifier, between the left shift and the arithmetic right shift.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
8 define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
9 ; GFX6-LABEL: v_saddsat_i7:
11 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0
13 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
14 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1
15 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3
17 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
18 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
19 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2
20 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
21 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 25, v0
22 ; GFX6-NEXT: s_setpc_b64 s[30:31]
24 ; GFX8-LABEL: v_saddsat_i7:
26 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0
28 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0
29 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1
30 ; GFX8-NEXT: v_max_i16_e32 v2, 0, v0
31 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3
32 ; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2
33 ; GFX8-NEXT: v_max_i16_e32 v1, v3, v1
34 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2
35 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
36 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 9, v0
37 ; GFX8-NEXT: s_setpc_b64 s[30:31]
39 ; GFX9-LABEL: v_saddsat_i7:
41 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
43 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
44 ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
45 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
46 ; GFX9-NEXT: s_setpc_b64 s[30:31]
48 ; GFX10PLUS-LABEL: v_saddsat_i7:
50 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
52 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
53 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
54 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
55 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
56 %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
; s_saddsat_i7 - the same @llvm.sadd.sat.i7 operation as the v_ variant, but
; declared amdgpu_ps with both arguments marked inreg. The recorded checks use
; s0/s1 as operands and leave the result in s0.
;
; What the recorded output looks like for each check prefix:
;   * GFX6 prefix - an all-scalar sequence: shift left by 25, s_min/s_max
;     clamp, s_add, arithmetic shift right by 25.
;   * GFX8 prefix - shift left by 9, with s_sext_i32_i16 applied around the
;     32-bit scalar min/max/add steps.
;   * GFX9 and GFX10PLUS prefixes - the add is a vector instruction with the
;     clamp modifier, and v_readfirstlane_b32 copies the result back to s0.
;
; The check lines are autogenerated; regenerate rather than hand-edit.
60 define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
61 ; GFX6-LABEL: s_saddsat_i7:
63 ; GFX6-NEXT: s_lshl_b32 s0, s0, 25
64 ; GFX6-NEXT: s_min_i32 s3, s0, 0
65 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25
66 ; GFX6-NEXT: s_max_i32 s2, s0, 0
67 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3
68 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2
69 ; GFX6-NEXT: s_max_i32 s1, s3, s1
70 ; GFX6-NEXT: s_min_i32 s1, s1, s2
71 ; GFX6-NEXT: s_add_i32 s0, s0, s1
72 ; GFX6-NEXT: s_ashr_i32 s0, s0, 25
73 ; GFX6-NEXT: ; return to shader part epilog
75 ; GFX8-LABEL: s_saddsat_i7:
77 ; GFX8-NEXT: s_lshl_b32 s0, s0, 9
78 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
79 ; GFX8-NEXT: s_sext_i32_i16 s3, 0
80 ; GFX8-NEXT: s_max_i32 s4, s2, s3
81 ; GFX8-NEXT: s_min_i32 s2, s2, s3
82 ; GFX8-NEXT: s_lshl_b32 s1, s1, 9
83 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
84 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
85 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
86 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
87 ; GFX8-NEXT: s_max_i32 s1, s2, s1
88 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
89 ; GFX8-NEXT: s_sext_i32_i16 s2, s4
90 ; GFX8-NEXT: s_min_i32 s1, s1, s2
91 ; GFX8-NEXT: s_add_i32 s0, s0, s1
92 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
93 ; GFX8-NEXT: s_ashr_i32 s0, s0, 9
94 ; GFX8-NEXT: ; return to shader part epilog
96 ; GFX9-LABEL: s_saddsat_i7:
98 ; GFX9-NEXT: s_lshl_b32 s1, s1, 9
99 ; GFX9-NEXT: s_lshl_b32 s0, s0, 9
100 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
101 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
102 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
103 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
104 ; GFX9-NEXT: ; return to shader part epilog
106 ; GFX10PLUS-LABEL: s_saddsat_i7:
107 ; GFX10PLUS: ; %bb.0:
108 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
109 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
110 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
111 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
112 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
113 ; GFX10PLUS-NEXT: ; return to shader part epilog
114 %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
; v_saddsat_i8 - signed saturating add of two i8 values through
; @llvm.sadd.sat.i8. The recorded checks use v0/v1 as operands and return
; through s_setpc_b64.
;
; The recorded sequences have the same shape as the i7 test; only the shift
; amounts differ: 24 for the 32-bit GFX6 sequence and 8 for the 16-bit
; GFX8, GFX9 and GFX10PLUS sequences. GFX9 and GFX10PLUS use a single add
; with the clamp modifier, while GFX6 and GFX8 clamp with max/min first.
;
; The check lines are autogenerated; regenerate rather than hand-edit.
118 define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
119 ; GFX6-LABEL: v_saddsat_i8:
121 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
123 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
124 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
125 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
126 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3
127 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
128 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
129 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2
130 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
131 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
132 ; GFX6-NEXT: s_setpc_b64 s[30:31]
134 ; GFX8-LABEL: v_saddsat_i8:
136 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
138 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0
139 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
140 ; GFX8-NEXT: v_max_i16_e32 v2, 0, v0
141 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3
142 ; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2
143 ; GFX8-NEXT: v_max_i16_e32 v1, v3, v1
144 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2
145 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
146 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0
147 ; GFX8-NEXT: s_setpc_b64 s[30:31]
149 ; GFX9-LABEL: v_saddsat_i8:
151 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
153 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
154 ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
155 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
156 ; GFX9-NEXT: s_setpc_b64 s[30:31]
158 ; GFX10PLUS-LABEL: v_saddsat_i8:
159 ; GFX10PLUS: ; %bb.0:
160 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
162 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
163 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
164 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
165 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
166 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
; s_saddsat_i8 - @llvm.sadd.sat.i8 in an amdgpu_ps function whose arguments
; are marked inreg. The recorded checks use s0/s1 as operands and leave the
; result in s0.
;
; The recorded sequences have the same shape as the scalar i7 test with shift
; amounts of 24 (GFX6) and 8 (GFX8, GFX9, GFX10PLUS). GFX6 and GFX8 stay in
; scalar instructions; GFX9 and GFX10PLUS perform the clamped add as a vector
; instruction and copy the result to s0 with v_readfirstlane_b32.
;
; The check lines are autogenerated; regenerate rather than hand-edit.
170 define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
171 ; GFX6-LABEL: s_saddsat_i8:
173 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
174 ; GFX6-NEXT: s_min_i32 s3, s0, 0
175 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
176 ; GFX6-NEXT: s_max_i32 s2, s0, 0
177 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3
178 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2
179 ; GFX6-NEXT: s_max_i32 s1, s3, s1
180 ; GFX6-NEXT: s_min_i32 s1, s1, s2
181 ; GFX6-NEXT: s_add_i32 s0, s0, s1
182 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24
183 ; GFX6-NEXT: ; return to shader part epilog
185 ; GFX8-LABEL: s_saddsat_i8:
187 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
188 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
189 ; GFX8-NEXT: s_sext_i32_i16 s3, 0
190 ; GFX8-NEXT: s_max_i32 s4, s2, s3
191 ; GFX8-NEXT: s_min_i32 s2, s2, s3
192 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
193 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
194 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
195 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
196 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
197 ; GFX8-NEXT: s_max_i32 s1, s2, s1
198 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
199 ; GFX8-NEXT: s_sext_i32_i16 s2, s4
200 ; GFX8-NEXT: s_min_i32 s1, s1, s2
201 ; GFX8-NEXT: s_add_i32 s0, s0, s1
202 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
203 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8
204 ; GFX8-NEXT: ; return to shader part epilog
206 ; GFX9-LABEL: s_saddsat_i8:
208 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
209 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
210 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
211 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
212 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
213 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
214 ; GFX9-NEXT: ; return to shader part epilog
216 ; GFX10PLUS-LABEL: s_saddsat_i8:
217 ; GFX10PLUS: ; %bb.0:
218 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
219 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
220 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
221 ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
222 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
223 ; GFX10PLUS-NEXT: ; return to shader part epilog
224 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
; v_saddsat_v2i8 - saturating add on <2 x i8>. The vectors arrive and leave
; packed in i16 values: both arguments are bitcast to <2 x i8>, passed to
; @llvm.sadd.sat.v2i8, and the result is bitcast back to i16.
;
; What the recorded output looks like for each check prefix:
;   * GFX6 prefix - each byte is handled separately with the 32-bit
;     shift-by-24 / clamp / add sequence; the two results are masked with
;     0xff and recombined with a shift by 8 and an OR.
;   * GFX8 prefix - per-byte 16-bit sequences, with SDWA instructions used to
;     extract the high byte and to repack the result.
;   * GFX9, GFX10 and GFX11 prefixes - the two bytes are spread into the two
;     16-bit halves of a register and handled with packed (v_pk_*)
;     instructions, the add carrying the clamp modifier.
;
; This function has separate GFX10 and GFX11 check blocks instead of a shared
; GFX10PLUS block: the recorded repacking uses SDWA instructions for GFX10
; and plain shift/and/or instructions for GFX11.
;
; The check lines are autogenerated; regenerate rather than hand-edit.
228 define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
229 ; GFX6-LABEL: v_saddsat_v2i8:
231 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
233 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
234 ; GFX6-NEXT: s_brev_b32 s5, 1
235 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
236 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
237 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
238 ; GFX6-NEXT: s_brev_b32 s4, -2
239 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
240 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
241 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
242 ; GFX6-NEXT: v_max_i32_e32 v1, v5, v1
243 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v4
244 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
245 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
246 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
247 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
248 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
249 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
250 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3
251 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
252 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
253 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
254 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
255 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
256 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
257 ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
258 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
259 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
260 ; GFX6-NEXT: s_setpc_b64 s[30:31]
262 ; GFX8-LABEL: v_saddsat_v2i8:
264 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
266 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
267 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
268 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v0
269 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
270 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
271 ; GFX8-NEXT: v_max_i16_e32 v4, 0, v0
272 ; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5
273 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4
274 ; GFX8-NEXT: v_max_i16_e32 v1, v5, v1
275 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
276 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v3
277 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
278 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3
279 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4
280 ; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1
281 ; GFX8-NEXT: v_max_i16_e32 v2, v4, v2
282 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
283 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
284 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
285 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
286 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
287 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
288 ; GFX8-NEXT: s_setpc_b64 s[30:31]
290 ; GFX9-LABEL: v_saddsat_v2i8:
292 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
294 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
295 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
296 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
297 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
298 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
299 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
300 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
301 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
302 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
303 ; GFX9-NEXT: s_movk_i32 s4, 0xff
304 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
305 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
306 ; GFX9-NEXT: s_setpc_b64 s[30:31]
308 ; GFX10-LABEL: v_saddsat_v2i8:
310 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
312 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
313 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
314 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
315 ; GFX10-NEXT: s_movk_i32 s4, 0xff
316 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
317 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
318 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
319 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
320 ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp
321 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
322 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
323 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
324 ; GFX10-NEXT: s_setpc_b64 s[30:31]
326 ; GFX11-LABEL: v_saddsat_v2i8:
328 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
329 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
330 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
331 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
332 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
333 ; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
334 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
335 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
336 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
337 ; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp
338 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
339 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
340 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
341 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
342 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
343 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
344 ; GFX11-NEXT: s_setpc_b64 s[30:31]
345 %lhs = bitcast i16 %lhs.arg to <2 x i8>
346 %rhs = bitcast i16 %rhs.arg to <2 x i8>
347 %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
348 %cast.result = bitcast <2 x i8> %result to i16
; s_saddsat_v2i8 - saturating add on <2 x i8> in an amdgpu_ps function whose
; i16 arguments are marked inreg. Both arguments are bitcast to <2 x i8>,
; passed to @llvm.sadd.sat.v2i8, and the result is bitcast back to i16.
;
; What the recorded output looks like for each check prefix:
;   * GFX6 and GFX8 prefixes - all-scalar code; each byte is shifted up,
;     clamped with s_min/s_max, added and shifted back, then the two bytes
;     are masked with 0xff and recombined with a shift by 8 and s_or_b32.
;     The GFX8 sequence additionally uses s_sext_i32_i16 around each step.
;   * GFX9, GFX10 and GFX11 prefixes - the bytes are packed into 16-bit
;     halves with s_pack_ll_b32_b16, added with v_pk_add_i16 carrying the
;     clamp modifier, repacked, and copied to s0 with v_readfirstlane_b32.
;
; GFX10 and GFX11 have separate check blocks here: the recorded repacking
; uses SDWA instructions for GFX10 and shift/and/or instructions for GFX11.
;
; The check lines are autogenerated; regenerate rather than hand-edit.
352 define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
353 ; GFX6-LABEL: s_saddsat_v2i8:
355 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
356 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
357 ; GFX6-NEXT: s_min_i32 s5, s0, 0
358 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8
359 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
360 ; GFX6-NEXT: s_max_i32 s4, s0, 0
361 ; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5
362 ; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4
363 ; GFX6-NEXT: s_max_i32 s1, s5, s1
364 ; GFX6-NEXT: s_min_i32 s1, s1, s4
365 ; GFX6-NEXT: s_add_i32 s0, s0, s1
366 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
367 ; GFX6-NEXT: s_min_i32 s4, s1, 0
368 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
369 ; GFX6-NEXT: s_max_i32 s3, s1, 0
370 ; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4
371 ; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3
372 ; GFX6-NEXT: s_max_i32 s2, s4, s2
373 ; GFX6-NEXT: s_min_i32 s2, s2, s3
374 ; GFX6-NEXT: s_add_i32 s1, s1, s2
375 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24
376 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24
377 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff
378 ; GFX6-NEXT: s_and_b32 s0, s0, 0xff
379 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
380 ; GFX6-NEXT: s_or_b32 s0, s0, s1
381 ; GFX6-NEXT: ; return to shader part epilog
383 ; GFX8-LABEL: s_saddsat_v2i8:
385 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
386 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
387 ; GFX8-NEXT: s_sext_i32_i16 s4, s0
388 ; GFX8-NEXT: s_sext_i32_i16 s5, 0
389 ; GFX8-NEXT: s_max_i32 s6, s4, s5
390 ; GFX8-NEXT: s_min_i32 s4, s4, s5
391 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8
392 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
393 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
394 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
395 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
396 ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
397 ; GFX8-NEXT: s_max_i32 s1, s4, s1
398 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
399 ; GFX8-NEXT: s_sext_i32_i16 s4, s6
400 ; GFX8-NEXT: s_min_i32 s1, s1, s4
401 ; GFX8-NEXT: s_add_i32 s0, s0, s1
402 ; GFX8-NEXT: s_lshl_b32 s1, s2, 8
403 ; GFX8-NEXT: s_lshl_b32 s2, s3, 8
404 ; GFX8-NEXT: s_sext_i32_i16 s3, s1
405 ; GFX8-NEXT: s_max_i32 s4, s3, s5
406 ; GFX8-NEXT: s_min_i32 s3, s3, s5
407 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
408 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
409 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
410 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
411 ; GFX8-NEXT: s_max_i32 s2, s3, s2
412 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
413 ; GFX8-NEXT: s_sext_i32_i16 s3, s4
414 ; GFX8-NEXT: s_min_i32 s2, s2, s3
415 ; GFX8-NEXT: s_add_i32 s1, s1, s2
416 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
417 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
418 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8
419 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8
420 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff
421 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff
422 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
423 ; GFX8-NEXT: s_or_b32 s0, s0, s1
424 ; GFX8-NEXT: ; return to shader part epilog
426 ; GFX9-LABEL: s_saddsat_v2i8:
428 ; GFX9-NEXT: s_lshr_b32 s2, s0, 8
429 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
430 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
431 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
432 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
433 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
434 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
435 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
436 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16
437 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
438 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
439 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
440 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
441 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
442 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
443 ; GFX9-NEXT: s_movk_i32 s0, 0xff
444 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
446 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
447 ; GFX9-NEXT: ; return to shader part epilog
449 ; GFX10-LABEL: s_saddsat_v2i8:
451 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
452 ; GFX10-NEXT: s_lshr_b32 s3, s1, 8
453 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
454 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
455 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
456 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
457 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
458 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8
459 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
460 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
461 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
462 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
463 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
464 ; GFX10-NEXT: s_movk_i32 s0, 0xff
465 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
466 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
467 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
468 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
469 ; GFX10-NEXT: ; return to shader part epilog
471 ; GFX11-LABEL: s_saddsat_v2i8:
473 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
474 ; GFX11-NEXT: s_lshr_b32 s3, s1, 8
475 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
476 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
477 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
478 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
479 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
480 ; GFX11-NEXT: s_lshl_b32 s2, s2, 8
481 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
482 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8
483 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
484 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
485 ; GFX11-NEXT: v_pk_add_i16 v0, s0, s1 clamp
486 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
487 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
488 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
489 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
490 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
491 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
492 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
493 ; GFX11-NEXT: ; return to shader part epilog
494 %lhs = bitcast i16 %lhs.arg to <2 x i8>
495 %rhs = bitcast i16 %rhs.arg to <2 x i8>
496 %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
497 %cast.result = bitcast <2 x i8> %result to i16
; v_saddsat_v4i8 - saturating add on <4 x i8>. The vectors arrive and leave
; packed in i32 values: both arguments are bitcast to <4 x i8>, passed to
; @llvm.sadd.sat.v4i8, and the result is bitcast back to i32.
;
; What the recorded output looks like for each check prefix:
;   * GFX6 prefix - the four bytes are extracted with right shifts by 8, 16
;     and 24, each one goes through the 32-bit shift-by-24 / clamp / add
;     sequence, and the results are masked with 0xff and ORed back together
;     at bit offsets 0, 8, 16 and 24.
;   * GFX8 prefix - four per-byte 16-bit sequences, with SDWA instructions
;     used for part of the extraction and for the repacking.
;   * GFX9, GFX10 and GFX11 prefixes - the bytes are arranged as two
;     registers of two 16-bit lanes each, processed with two v_pk_add_i16
;     instructions carrying the clamp modifier, then merged with v_and_or_b32
;     and v_or3_b32.
;
; GFX10 and GFX11 have separate check blocks here; the recorded GFX11
; repacking uses v_bfe_u32 where GFX9 and GFX10 use SDWA shifts.
;
; The check lines are autogenerated; regenerate rather than hand-edit.
501 define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
502 ; GFX6-LABEL: v_saddsat_v4i8:
504 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
505 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
506 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
507 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
508 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
509 ; GFX6-NEXT: s_brev_b32 s5, 1
510 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0
511 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
512 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
513 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1
514 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
515 ; GFX6-NEXT: s_brev_b32 s4, -2
516 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0
517 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10
518 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8
519 ; GFX6-NEXT: v_max_i32_e32 v1, v10, v1
520 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v8
521 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
522 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
523 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1
524 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5
525 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1
526 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
527 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
528 ; GFX6-NEXT: v_max_i32_e32 v2, v8, v2
529 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
530 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
531 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
532 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6
533 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2
534 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2
535 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2
536 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6
537 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
538 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3
539 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v5
540 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
541 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
542 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
543 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3
544 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
545 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7
546 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3
547 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6
548 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0
549 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
550 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
551 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
552 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2
553 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
554 ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
555 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
556 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
557 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
558 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2
559 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3
560 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
561 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
562 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3
563 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
564 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
565 ; GFX6-NEXT: s_setpc_b64 s[30:31]
567 ; GFX8-LABEL: v_saddsat_v4i8:
569 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
571 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
572 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
573 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
574 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
575 ; GFX8-NEXT: v_min_i16_e32 v9, 0, v0
576 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
577 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
578 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
579 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
580 ; GFX8-NEXT: v_max_i16_e32 v8, 0, v0
581 ; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9
582 ; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8
583 ; GFX8-NEXT: v_max_i16_e32 v1, v9, v1
584 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8
585 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v3
586 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
587 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3
588 ; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8
589 ; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1
590 ; GFX8-NEXT: v_max_i16_e32 v2, v8, v2
591 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
592 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4
593 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
594 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6
595 ; GFX8-NEXT: v_min_i16_e32 v6, 0, v2
596 ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2
597 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6
598 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4
599 ; GFX8-NEXT: v_max_i16_e32 v3, v6, v3
600 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4
601 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v3
602 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5
603 ; GFX8-NEXT: v_min_i16_e32 v6, 0, v3
604 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7
605 ; GFX8-NEXT: v_max_i16_e32 v5, 0, v3
606 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6
607 ; GFX8-NEXT: v_sub_u16_e32 v5, 0x7fff, v5
608 ; GFX8-NEXT: v_max_i16_e32 v4, v6, v4
609 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v5
610 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4
611 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
612 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
614 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
615 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
618 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
620 ; GFX8-NEXT: s_setpc_b64 s[30:31]
622 ; GFX9-LABEL: v_saddsat_v4i8:
624 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
626 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
627 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
628 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
629 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
630 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
631 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
632 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
633 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
634 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
635 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
636 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
637 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
638 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
639 ; GFX9-NEXT: v_pk_add_i16 v2, v2, v3 clamp
640 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
641 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
642 ; GFX9-NEXT: v_mov_b32_e32 v2, 8
643 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
644 ; GFX9-NEXT: s_movk_i32 s4, 0xff
645 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
646 ; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
647 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0
648 ; GFX9-NEXT: v_mov_b32_e32 v3, 24
649 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
650 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
651 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
652 ; GFX9-NEXT: s_setpc_b64 s[30:31]
654 ; GFX10-LABEL: v_saddsat_v4i8:
656 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
658 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
659 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
660 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
661 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
662 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
663 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
664 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
665 ; GFX10-NEXT: v_mov_b32_e32 v4, 24
666 ; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
667 ; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
668 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
669 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
670 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
671 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
672 ; GFX10-NEXT: v_pk_add_i16 v2, v2, v3 clamp
673 ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp
674 ; GFX10-NEXT: v_mov_b32_e32 v1, 8
675 ; GFX10-NEXT: v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1]
676 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
677 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
678 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0
679 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
680 ; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1
681 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
682 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
683 ; GFX10-NEXT: s_setpc_b64 s[30:31]
685 ; GFX11-LABEL: v_saddsat_v4i8:
687 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
689 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
690 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
691 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
692 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
693 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
694 ; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
695 ; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
696 ; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
697 ; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
698 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
699 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
700 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
701 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
702 ; GFX11-NEXT: v_pk_add_i16 v2, v2, v3 clamp
703 ; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp
704 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
705 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
706 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
707 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
708 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
709 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
710 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
711 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
712 ; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2
713 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
714 ; GFX11-NEXT: s_setpc_b64 s[30:31]
715 %lhs = bitcast i32 %lhs.arg to <4 x i8>
716 %rhs = bitcast i32 %rhs.arg to <4 x i8>
717 %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
718 %cast.result = bitcast <4 x i8> %result to i32
; Signed saturating add of <4 x i8> in an amdgpu_ps function whose two
; operands are both marked inreg. The vector travels packed inside an i32 and
; is bitcast to <4 x i8> and back around the llvm.sadd.sat.v4i8 call. The
; expected output for the GFX6 and GFX8 prefixes stays entirely in s-registers;
; for the GFX9, GFX10 and GFX11 prefixes it uses v_pk_add_i16 with clamp and
; then moves the packed result back to s0 with v_readfirstlane_b32.
722 define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
723 ; GFX6-LABEL: s_saddsat_v4i8:
725 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
726 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16
727 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24
728 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
729 ; GFX6-NEXT: s_min_i32 s9, s0, 0
730 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8
731 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16
732 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24
733 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
734 ; GFX6-NEXT: s_max_i32 s8, s0, 0
735 ; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9
736 ; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8
737 ; GFX6-NEXT: s_max_i32 s1, s9, s1
738 ; GFX6-NEXT: s_min_i32 s1, s1, s8
739 ; GFX6-NEXT: s_add_i32 s0, s0, s1
740 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
741 ; GFX6-NEXT: s_min_i32 s8, s1, 0
742 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24
743 ; GFX6-NEXT: s_max_i32 s5, s1, 0
744 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8
745 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
746 ; GFX6-NEXT: s_max_i32 s2, s8, s2
747 ; GFX6-NEXT: s_min_i32 s2, s2, s5
748 ; GFX6-NEXT: s_add_i32 s1, s1, s2
749 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
750 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24
751 ; GFX6-NEXT: s_min_i32 s6, s2, 0
752 ; GFX6-NEXT: s_max_i32 s5, s2, 0
753 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
754 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
755 ; GFX6-NEXT: s_max_i32 s3, s6, s3
756 ; GFX6-NEXT: s_min_i32 s3, s3, s5
757 ; GFX6-NEXT: s_add_i32 s2, s2, s3
758 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24
759 ; GFX6-NEXT: s_min_i32 s6, s3, 0
760 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24
761 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24
762 ; GFX6-NEXT: s_max_i32 s5, s3, 0
763 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
764 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24
765 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
766 ; GFX6-NEXT: s_max_i32 s4, s6, s4
767 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff
768 ; GFX6-NEXT: s_ashr_i32 s2, s2, 24
769 ; GFX6-NEXT: s_min_i32 s4, s4, s5
770 ; GFX6-NEXT: s_and_b32 s0, s0, 0xff
771 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
772 ; GFX6-NEXT: s_add_i32 s3, s3, s4
773 ; GFX6-NEXT: s_or_b32 s0, s0, s1
774 ; GFX6-NEXT: s_and_b32 s1, s2, 0xff
775 ; GFX6-NEXT: s_ashr_i32 s3, s3, 24
776 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
777 ; GFX6-NEXT: s_or_b32 s0, s0, s1
778 ; GFX6-NEXT: s_and_b32 s1, s3, 0xff
779 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
780 ; GFX6-NEXT: s_or_b32 s0, s0, s1
781 ; GFX6-NEXT: ; return to shader part epilog
783 ; GFX8-LABEL: s_saddsat_v4i8:
785 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
786 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16
787 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24
788 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
789 ; GFX8-NEXT: s_sext_i32_i16 s8, s0
790 ; GFX8-NEXT: s_sext_i32_i16 s9, 0
791 ; GFX8-NEXT: s_max_i32 s10, s8, s9
792 ; GFX8-NEXT: s_min_i32 s8, s8, s9
793 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8
794 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16
795 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24
796 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
797 ; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8
798 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
799 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
800 ; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
801 ; GFX8-NEXT: s_max_i32 s1, s8, s1
802 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
803 ; GFX8-NEXT: s_sext_i32_i16 s8, s10
804 ; GFX8-NEXT: s_min_i32 s1, s1, s8
805 ; GFX8-NEXT: s_add_i32 s0, s0, s1
806 ; GFX8-NEXT: s_lshl_b32 s1, s2, 8
807 ; GFX8-NEXT: s_lshl_b32 s2, s5, 8
808 ; GFX8-NEXT: s_sext_i32_i16 s5, s1
809 ; GFX8-NEXT: s_max_i32 s8, s5, s9
810 ; GFX8-NEXT: s_min_i32 s5, s5, s9
811 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
812 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
813 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
814 ; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8
815 ; GFX8-NEXT: s_max_i32 s2, s5, s2
816 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
817 ; GFX8-NEXT: s_sext_i32_i16 s5, s8
818 ; GFX8-NEXT: s_min_i32 s2, s2, s5
819 ; GFX8-NEXT: s_add_i32 s1, s1, s2
820 ; GFX8-NEXT: s_lshl_b32 s2, s3, 8
821 ; GFX8-NEXT: s_sext_i32_i16 s5, s2
822 ; GFX8-NEXT: s_lshl_b32 s3, s6, 8
823 ; GFX8-NEXT: s_max_i32 s6, s5, s9
824 ; GFX8-NEXT: s_min_i32 s5, s5, s9
825 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
826 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
827 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
828 ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
829 ; GFX8-NEXT: s_max_i32 s3, s5, s3
830 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
831 ; GFX8-NEXT: s_sext_i32_i16 s5, s6
832 ; GFX8-NEXT: s_min_i32 s3, s3, s5
833 ; GFX8-NEXT: s_add_i32 s2, s2, s3
834 ; GFX8-NEXT: s_lshl_b32 s3, s4, 8
835 ; GFX8-NEXT: s_sext_i32_i16 s5, s3
836 ; GFX8-NEXT: s_max_i32 s6, s5, s9
837 ; GFX8-NEXT: s_min_i32 s5, s5, s9
838 ; GFX8-NEXT: s_lshl_b32 s4, s7, 8
839 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
840 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
841 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
842 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
843 ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
844 ; GFX8-NEXT: s_max_i32 s4, s5, s4
845 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
846 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8
847 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
848 ; GFX8-NEXT: s_sext_i32_i16 s5, s6
849 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8
850 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
851 ; GFX8-NEXT: s_min_i32 s4, s4, s5
852 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff
853 ; GFX8-NEXT: s_ashr_i32 s2, s2, 8
854 ; GFX8-NEXT: s_add_i32 s3, s3, s4
855 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff
856 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
857 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
858 ; GFX8-NEXT: s_or_b32 s0, s0, s1
859 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff
860 ; GFX8-NEXT: s_ashr_i32 s3, s3, 8
861 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
862 ; GFX8-NEXT: s_or_b32 s0, s0, s1
863 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff
864 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24
865 ; GFX8-NEXT: s_or_b32 s0, s0, s1
866 ; GFX8-NEXT: ; return to shader part epilog
868 ; GFX9-LABEL: s_saddsat_v4i8:
870 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8
871 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
872 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24
873 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
874 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
875 ; GFX9-NEXT: s_lshr_b32 s6, s0, 16
876 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
877 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
878 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8
879 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
880 ; GFX9-NEXT: s_lshr_b32 s6, s3, 16
881 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16
882 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24
883 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
884 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
885 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
886 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
887 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16
888 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
889 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
890 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
891 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
892 ; GFX9-NEXT: s_lshr_b32 s6, s4, 16
893 ; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
894 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
895 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
896 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
897 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
898 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
899 ; GFX9-NEXT: s_mov_b32 s2, 8
900 ; GFX9-NEXT: v_pk_add_i16 v1, s3, v1 clamp
901 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
902 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
903 ; GFX9-NEXT: s_movk_i32 s0, 0xff
904 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
905 ; GFX9-NEXT: s_mov_b32 s5, 24
906 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
907 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
908 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
909 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
910 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
911 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
912 ; GFX9-NEXT: ; return to shader part epilog
914 ; GFX10-LABEL: s_saddsat_v4i8:
916 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
917 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
918 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24
919 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8
920 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
921 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
922 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16
923 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24
924 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
925 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16
926 ; GFX10-NEXT: s_lshr_b32 s5, s2, 16
927 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
928 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
929 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
930 ; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
931 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
932 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
933 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
934 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16
935 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16
936 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
937 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
938 ; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
939 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
940 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
941 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
942 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
943 ; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp
944 ; GFX10-NEXT: s_mov_b32 s0, 8
945 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
946 ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
947 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
948 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
949 ; GFX10-NEXT: s_mov_b32 s0, 24
950 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
951 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
952 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
953 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
954 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
955 ; GFX10-NEXT: ; return to shader part epilog
957 ; GFX11-LABEL: s_saddsat_v4i8:
959 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
960 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24
961 ; GFX11-NEXT: s_lshr_b32 s4, s1, 8
962 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24
963 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
964 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
965 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
966 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
967 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
968 ; GFX11-NEXT: s_lshr_b32 s5, s3, 16
969 ; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
970 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
971 ; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
972 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8
973 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
974 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
975 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16
976 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16
977 ; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp
978 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
979 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
980 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
981 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8
982 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
983 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
984 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
985 ; GFX11-NEXT: v_pk_add_i16 v1, s0, s1 clamp
986 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
987 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
988 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
989 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
990 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
991 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2
992 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
993 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
994 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
995 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
996 ; GFX11-NEXT: ; return to shader part epilog
997 %lhs = bitcast i32 %lhs.arg to <4 x i8>
998 %rhs = bitcast i32 %rhs.arg to <4 x i8>
999 %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
1000 %cast.result = bitcast <4 x i8> %result to i32
1001 ret i32 %cast.result
; Signed saturating add of i24 with plain (non-inreg) operands, which the
; expected output reads from v0 and v1. For the GFX6, GFX9 and GFX10PLUS
; prefixes both operands are shifted left by 8, the add saturates at 32 bits
; (min/max clamp sequence on GFX6, v_add_i32 / v_add_nc_i32 with clamp on the
; newer targets), and the sum is arithmetic-shifted right by 8. The GFX8
; prefix instead expects an add followed by a compare-and-select sequence.
1004 define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
1005 ; GFX6-LABEL: v_saddsat_i24:
1007 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1008 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1009 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
1010 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1011 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
1012 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3
1013 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
1014 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
1015 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2
1016 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1017 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1018 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1020 ; GFX8-LABEL: v_saddsat_i24:
1022 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
1024 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 24
1025 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 24
1026 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0
1027 ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24
1028 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v0
1029 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3
1030 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0
1031 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
1032 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1033 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1035 ; GFX9-LABEL: v_saddsat_i24:
1037 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1038 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1039 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1040 ; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp
1041 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1042 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1044 ; GFX10PLUS-LABEL: v_saddsat_i24:
1045 ; GFX10PLUS: ; %bb.0:
1046 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1048 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1049 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v1 clamp
1050 ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1051 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1052 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
; Signed saturating add of i24 in an amdgpu_ps function with both operands
; marked inreg (read from s0 and s1 in the expected output). The GFX6 prefix
; expects the shift-by-8 / min-max clamp / shift-back sequence in scalar
; instructions, and the GFX8 prefix a scalar add with compare-and-select. The
; GFX9 and GFX10PLUS prefixes shift in scalar registers, do the clamped add
; with a vector instruction, and return the result to s0 through
; v_readfirstlane_b32.
1056 define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
1057 ; GFX6-LABEL: s_saddsat_i24:
1059 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8
1060 ; GFX6-NEXT: s_min_i32 s3, s0, 0
1061 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
1062 ; GFX6-NEXT: s_max_i32 s2, s0, 0
1063 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3
1064 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2
1065 ; GFX6-NEXT: s_max_i32 s1, s3, s1
1066 ; GFX6-NEXT: s_min_i32 s1, s1, s2
1067 ; GFX6-NEXT: s_add_i32 s0, s0, s1
1068 ; GFX6-NEXT: s_ashr_i32 s0, s0, 8
1069 ; GFX6-NEXT: ; return to shader part epilog
1071 ; GFX8-LABEL: s_saddsat_i24:
1073 ; GFX8-NEXT: s_add_i32 s2, s0, s1
1074 ; GFX8-NEXT: s_bfe_i32 s3, s2, 0x180000
1075 ; GFX8-NEXT: s_bfe_i32 s0, s0, 0x180000
1076 ; GFX8-NEXT: s_cmp_lt_i32 s3, s0
1077 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0
1078 ; GFX8-NEXT: s_bfe_i32 s1, s1, 0x180000
1079 ; GFX8-NEXT: s_cmp_lt_i32 s1, 0
1080 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0
1081 ; GFX8-NEXT: s_xor_b32 s0, s1, s0
1082 ; GFX8-NEXT: s_ashr_i32 s1, s3, 23
1083 ; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000
1084 ; GFX8-NEXT: s_and_b32 s0, s0, 1
1085 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0
1086 ; GFX8-NEXT: s_cselect_b32 s0, s1, s2
1087 ; GFX8-NEXT: ; return to shader part epilog
1089 ; GFX9-LABEL: s_saddsat_i24:
1091 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
1092 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
1093 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1094 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
1095 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1096 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1097 ; GFX9-NEXT: ; return to shader part epilog
1099 ; GFX10PLUS-LABEL: s_saddsat_i24:
1100 ; GFX10PLUS: ; %bb.0:
1101 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
1102 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
1103 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s1 clamp
1104 ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0
1105 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1106 ; GFX10PLUS-NEXT: ; return to shader part epilog
1107 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
; Signed saturating add of i32 with plain (non-inreg) operands in v0 and v1.
; The GFX6 and GFX8 prefixes expect the addend to be clamped with a
; min/max/sub sequence against 0x80000000 and 0x7fffffff before an ordinary
; add; the GFX9 and GFX10PLUS prefixes expect a single clamped add
; (v_add_i32 / v_add_nc_i32 with the clamp modifier).
1111 define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
1112 ; GFX6-LABEL: v_saddsat_i32:
1114 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1115 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
1116 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
1117 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3
1118 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
1119 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
1120 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2
1121 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1122 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1124 ; GFX8-LABEL: v_saddsat_i32:
1126 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1127 ; GFX8-NEXT: v_min_i32_e32 v3, 0, v0
1128 ; GFX8-NEXT: v_max_i32_e32 v2, 0, v0
1129 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 0x80000000, v3
1130 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7fffffff, v2
1131 ; GFX8-NEXT: v_max_i32_e32 v1, v3, v1
1132 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v2
1133 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
1134 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1136 ; GFX9-LABEL: v_saddsat_i32:
1138 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1139 ; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp
1140 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1142 ; GFX10PLUS-LABEL: v_saddsat_i32:
1143 ; GFX10PLUS: ; %bb.0:
1144 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1145 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v1 clamp
1146 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1147 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
; Signed saturating add of i32 in an amdgpu_ps function with both operands
; marked inreg (s0 and s1 in the expected output). The GFX6 and GFX8 prefixes
; expect a purely scalar min/max/sub clamp followed by s_add_i32. The GFX9
; prefix copies s1 into v0 before the clamped v_add_i32, while the GFX10PLUS
; prefix feeds both s-registers straight into v_add_nc_i32; both then read the
; result back into s0 with v_readfirstlane_b32.
1151 define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
1152 ; GFX6-LABEL: s_saddsat_i32:
1154 ; GFX6-NEXT: s_min_i32 s3, s0, 0
1155 ; GFX6-NEXT: s_max_i32 s2, s0, 0
1156 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3
1157 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2
1158 ; GFX6-NEXT: s_max_i32 s1, s3, s1
1159 ; GFX6-NEXT: s_min_i32 s1, s1, s2
1160 ; GFX6-NEXT: s_add_i32 s0, s0, s1
1161 ; GFX6-NEXT: ; return to shader part epilog
1163 ; GFX8-LABEL: s_saddsat_i32:
1165 ; GFX8-NEXT: s_min_i32 s3, s0, 0
1166 ; GFX8-NEXT: s_max_i32 s2, s0, 0
1167 ; GFX8-NEXT: s_sub_i32 s3, 0x80000000, s3
1168 ; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2
1169 ; GFX8-NEXT: s_max_i32 s1, s3, s1
1170 ; GFX8-NEXT: s_min_i32 s1, s1, s2
1171 ; GFX8-NEXT: s_add_i32 s0, s0, s1
1172 ; GFX8-NEXT: ; return to shader part epilog
1174 ; GFX9-LABEL: s_saddsat_i32:
1176 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1177 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
1178 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1179 ; GFX9-NEXT: ; return to shader part epilog
1181 ; GFX10PLUS-LABEL: s_saddsat_i32:
1182 ; GFX10PLUS: ; %bb.0:
1183 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s1 clamp
1184 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1185 ; GFX10PLUS-NEXT: ; return to shader part epilog
1186 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
; Mixed-operand case: %lhs is inreg (s0 in the expected output) and %rhs is
; not (v0). The i32 saturating-add result is bitcast to float for the return
; value, and the expected output leaves it in v0 with no readfirstlane. For
; the GFX6 and GFX8 prefixes the clamp bounds are derived from s0 with scalar
; instructions and applied to v0 with v_max/v_min before the add; the GFX9 and
; GFX10PLUS prefixes expect one clamped vector add.
1190 define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
1191 ; GFX6-LABEL: saddsat_i32_sv:
1193 ; GFX6-NEXT: s_min_i32 s2, s0, 0
1194 ; GFX6-NEXT: s_max_i32 s1, s0, 0
1195 ; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2
1196 ; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1
1197 ; GFX6-NEXT: v_max_i32_e32 v0, s2, v0
1198 ; GFX6-NEXT: v_min_i32_e32 v0, s1, v0
1199 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
1200 ; GFX6-NEXT: ; return to shader part epilog
1202 ; GFX8-LABEL: saddsat_i32_sv:
1204 ; GFX8-NEXT: s_min_i32 s2, s0, 0
1205 ; GFX8-NEXT: s_max_i32 s1, s0, 0
1206 ; GFX8-NEXT: s_sub_i32 s2, 0x80000000, s2
1207 ; GFX8-NEXT: s_sub_i32 s1, 0x7fffffff, s1
1208 ; GFX8-NEXT: v_max_i32_e32 v0, s2, v0
1209 ; GFX8-NEXT: v_min_i32_e32 v0, s1, v0
1210 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1211 ; GFX8-NEXT: ; return to shader part epilog
1213 ; GFX9-LABEL: saddsat_i32_sv:
1215 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
1216 ; GFX9-NEXT: ; return to shader part epilog
1218 ; GFX10PLUS-LABEL: saddsat_i32_sv:
1219 ; GFX10PLUS: ; %bb.0:
1220 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, v0 clamp
1221 ; GFX10PLUS-NEXT: ; return to shader part epilog
1222 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1223 %cast = bitcast i32 %result to float
; Mixed-operand case with the roles swapped relative to saddsat_i32_sv: %lhs
; is not inreg (v0 in the expected output) and %rhs is inreg (s0). The i32
; saturating-add result is bitcast to float for the return value. For the GFX6
; and GFX8 prefixes the clamp bounds are computed from v0 with vector
; instructions and s0 is only consumed by v_max_i32; the GFX9 and GFX10PLUS
; prefixes expect one clamped vector add taking s0 as its second source.
1227 define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
1228 ; GFX6-LABEL: saddsat_i32_vs:
1230 ; GFX6-NEXT: v_min_i32_e32 v2, 0, v0
1231 ; GFX6-NEXT: v_max_i32_e32 v1, 0, v0
1232 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2
1233 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1
1234 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2
1235 ; GFX6-NEXT: v_min_i32_e32 v1, v2, v1
1236 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1237 ; GFX6-NEXT: ; return to shader part epilog
1239 ; GFX8-LABEL: saddsat_i32_vs:
1241 ; GFX8-NEXT: v_min_i32_e32 v2, 0, v0
1242 ; GFX8-NEXT: v_max_i32_e32 v1, 0, v0
1243 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x80000000, v2
1244 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0x7fffffff, v1
1245 ; GFX8-NEXT: v_max_i32_e32 v2, s0, v2
1246 ; GFX8-NEXT: v_min_i32_e32 v1, v2, v1
1247 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
1248 ; GFX8-NEXT: ; return to shader part epilog
1250 ; GFX9-LABEL: saddsat_i32_vs:
1252 ; GFX9-NEXT: v_add_i32 v0, v0, s0 clamp
1253 ; GFX9-NEXT: ; return to shader part epilog
1255 ; GFX10PLUS-LABEL: saddsat_i32_vs:
1256 ; GFX10PLUS: ; %bb.0:
1257 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, s0 clamp
1258 ; GFX10PLUS-NEXT: ; return to shader part epilog
1259 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1260 %cast = bitcast i32 %result to float
; Signed saturating add of <2 x i32> with plain (non-inreg) operands: %lhs in
; v0-v1 and %rhs in v2-v3 in the expected output. The GFX6 and GFX8 prefixes
; load the two saturation bounds once with s_brev_b32 (into s5 and s4) and
; reuse them for the min/max/sub clamp of each element; the GFX9 and GFX10PLUS
; prefixes expect one clamped add per element.
1264 define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
1265 ; GFX6-LABEL: v_saddsat_v2i32:
1267 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1268 ; GFX6-NEXT: s_brev_b32 s5, 1
1269 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
1270 ; GFX6-NEXT: s_brev_b32 s4, -2
1271 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
1272 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
1273 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
1274 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
1275 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
1276 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
1277 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1278 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1
1279 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
1280 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2
1281 ; GFX6-NEXT: v_max_i32_e32 v3, v4, v3
1282 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2
1283 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
1284 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1286 ; GFX8-LABEL: v_saddsat_v2i32:
1288 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1289 ; GFX8-NEXT: s_brev_b32 s5, 1
1290 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v0
1291 ; GFX8-NEXT: s_brev_b32 s4, -2
1292 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v0
1293 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5
1294 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4
1295 ; GFX8-NEXT: v_max_i32_e32 v2, v5, v2
1296 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4
1297 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v1
1298 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1299 ; GFX8-NEXT: v_max_i32_e32 v2, 0, v1
1300 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4
1301 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2
1302 ; GFX8-NEXT: v_max_i32_e32 v3, v4, v3
1303 ; GFX8-NEXT: v_min_i32_e32 v2, v3, v2
1304 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
1305 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1307 ; GFX9-LABEL: v_saddsat_v2i32:
1309 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1310 ; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
1311 ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
1312 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1314 ; GFX10PLUS-LABEL: v_saddsat_v2i32:
1315 ; GFX10PLUS: ; %bb.0:
1316 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1317 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v2 clamp
1318 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v3 clamp
1319 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1320 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1321 ret <2 x i32> %result
; Signed saturating add of <2 x i32> in an amdgpu_ps function with both
; operands marked inreg: %lhs in s0-s1 and %rhs in s2-s3 in the expected
; output. The GFX6 and GFX8 prefixes expect a scalar min/max/sub clamp and
; s_add_i32 per element. The GFX9 prefix copies the %rhs elements into
; v-registers first, the GFX10PLUS prefix uses the s-registers directly; both
; do clamped vector adds and read each element back with v_readfirstlane_b32.
1324 define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
1325 ; GFX6-LABEL: s_saddsat_v2i32:
1327 ; GFX6-NEXT: s_min_i32 s5, s0, 0
1328 ; GFX6-NEXT: s_max_i32 s4, s0, 0
1329 ; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5
1330 ; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1331 ; GFX6-NEXT: s_max_i32 s2, s5, s2
1332 ; GFX6-NEXT: s_min_i32 s2, s2, s4
1333 ; GFX6-NEXT: s_min_i32 s4, s1, 0
1334 ; GFX6-NEXT: s_add_i32 s0, s0, s2
1335 ; GFX6-NEXT: s_max_i32 s2, s1, 0
1336 ; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4
1337 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2
1338 ; GFX6-NEXT: s_max_i32 s3, s4, s3
1339 ; GFX6-NEXT: s_min_i32 s2, s3, s2
1340 ; GFX6-NEXT: s_add_i32 s1, s1, s2
1341 ; GFX6-NEXT: ; return to shader part epilog
1343 ; GFX8-LABEL: s_saddsat_v2i32:
1345 ; GFX8-NEXT: s_min_i32 s5, s0, 0
1346 ; GFX8-NEXT: s_max_i32 s4, s0, 0
1347 ; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5
1348 ; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1349 ; GFX8-NEXT: s_max_i32 s2, s5, s2
1350 ; GFX8-NEXT: s_min_i32 s2, s2, s4
1351 ; GFX8-NEXT: s_min_i32 s4, s1, 0
1352 ; GFX8-NEXT: s_add_i32 s0, s0, s2
1353 ; GFX8-NEXT: s_max_i32 s2, s1, 0
1354 ; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4
1355 ; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2
1356 ; GFX8-NEXT: s_max_i32 s3, s4, s3
1357 ; GFX8-NEXT: s_min_i32 s2, s3, s2
1358 ; GFX8-NEXT: s_add_i32 s1, s1, s2
1359 ; GFX8-NEXT: ; return to shader part epilog
1361 ; GFX9-LABEL: s_saddsat_v2i32:
1363 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1364 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1365 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
1366 ; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
1367 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1368 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1369 ; GFX9-NEXT: ; return to shader part epilog
1371 ; GFX10PLUS-LABEL: s_saddsat_v2i32:
1372 ; GFX10PLUS: ; %bb.0:
1373 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s2 clamp
1374 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s3 clamp
1375 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1376 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1377 ; GFX10PLUS-NEXT: ; return to shader part epilog
1378 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1379 ret <2 x i32> %result
; Signed saturating add of <3 x i32> with plain (non-inreg) operands: %lhs in
; v0-v2 and %rhs in v3-v5 in the expected output. The GFX6 and GFX8 prefixes
; load the saturation bounds once with s_brev_b32 (s5 and s4) and repeat the
; min/max/sub clamp plus add for each of the three elements; the GFX9 and
; GFX10PLUS prefixes expect one clamped add per element.
1382 define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1383 ; GFX6-LABEL: v_saddsat_v3i32:
1385 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1386 ; GFX6-NEXT: s_brev_b32 s5, 1
1387 ; GFX6-NEXT: v_min_i32_e32 v7, 0, v0
1388 ; GFX6-NEXT: s_brev_b32 s4, -2
1389 ; GFX6-NEXT: v_max_i32_e32 v6, 0, v0
1390 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7
1391 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s4, v6
1392 ; GFX6-NEXT: v_max_i32_e32 v3, v7, v3
1393 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6
1394 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v1
1395 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
1396 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
1397 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6
1398 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3
1399 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
1400 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3
1401 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v2
1402 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
1403 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v2
1404 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
1405 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3
1406 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5
1407 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3
1408 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
1409 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1411 ; GFX8-LABEL: v_saddsat_v3i32:
1413 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1414 ; GFX8-NEXT: s_brev_b32 s5, 1
1415 ; GFX8-NEXT: v_min_i32_e32 v7, 0, v0
1416 ; GFX8-NEXT: s_brev_b32 s4, -2
1417 ; GFX8-NEXT: v_max_i32_e32 v6, 0, v0
1418 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
1419 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s4, v6
1420 ; GFX8-NEXT: v_max_i32_e32 v3, v7, v3
1421 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6
1422 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v1
1423 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
1424 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v1
1425 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6
1426 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
1427 ; GFX8-NEXT: v_max_i32_e32 v4, v6, v4
1428 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3
1429 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v2
1430 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
1431 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v2
1432 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4
1433 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
1434 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5
1435 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3
1436 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
1437 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1439 ; GFX9-LABEL: v_saddsat_v3i32:
1441 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1442 ; GFX9-NEXT: v_add_i32 v0, v0, v3 clamp
1443 ; GFX9-NEXT: v_add_i32 v1, v1, v4 clamp
1444 ; GFX9-NEXT: v_add_i32 v2, v2, v5 clamp
1445 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1447 ; GFX10PLUS-LABEL: v_saddsat_v3i32:
1448 ; GFX10PLUS: ; %bb.0:
1449 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1450 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v3 clamp
1451 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v4 clamp
1452 ; GFX10PLUS-NEXT: v_add_nc_i32 v2, v2, v5 clamp
1453 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1454 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1455 ret <3 x i32> %result
; Signed saturating add of <3 x i32> in an amdgpu_ps function with both
; operands marked inreg: %lhs in s0-s2 and %rhs in s3-s5 in the expected
; output. The GFX6 and GFX8 prefixes expect a scalar min/max/sub clamp and
; s_add_i32 per element. The GFX9 prefix copies the %rhs elements into
; v-registers first, the GFX10PLUS prefix uses the s-registers directly; both
; do clamped vector adds and read each element back with v_readfirstlane_b32.
1458 define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1459 ; GFX6-LABEL: s_saddsat_v3i32:
1461 ; GFX6-NEXT: s_min_i32 s7, s0, 0
1462 ; GFX6-NEXT: s_max_i32 s6, s0, 0
1463 ; GFX6-NEXT: s_sub_i32 s7, 0x80000000, s7
1464 ; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6
1465 ; GFX6-NEXT: s_max_i32 s3, s7, s3
1466 ; GFX6-NEXT: s_min_i32 s3, s3, s6
1467 ; GFX6-NEXT: s_min_i32 s6, s1, 0
1468 ; GFX6-NEXT: s_add_i32 s0, s0, s3
1469 ; GFX6-NEXT: s_max_i32 s3, s1, 0
1470 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
1471 ; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3
1472 ; GFX6-NEXT: s_max_i32 s4, s6, s4
1473 ; GFX6-NEXT: s_min_i32 s3, s4, s3
1474 ; GFX6-NEXT: s_min_i32 s4, s2, 0
1475 ; GFX6-NEXT: s_add_i32 s1, s1, s3
1476 ; GFX6-NEXT: s_max_i32 s3, s2, 0
1477 ; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4
1478 ; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3
1479 ; GFX6-NEXT: s_max_i32 s4, s4, s5
1480 ; GFX6-NEXT: s_min_i32 s3, s4, s3
1481 ; GFX6-NEXT: s_add_i32 s2, s2, s3
1482 ; GFX6-NEXT: ; return to shader part epilog
1484 ; GFX8-LABEL: s_saddsat_v3i32:
1486 ; GFX8-NEXT: s_min_i32 s7, s0, 0
1487 ; GFX8-NEXT: s_max_i32 s6, s0, 0
1488 ; GFX8-NEXT: s_sub_i32 s7, 0x80000000, s7
1489 ; GFX8-NEXT: s_sub_i32 s6, 0x7fffffff, s6
1490 ; GFX8-NEXT: s_max_i32 s3, s7, s3
1491 ; GFX8-NEXT: s_min_i32 s3, s3, s6
1492 ; GFX8-NEXT: s_min_i32 s6, s1, 0
1493 ; GFX8-NEXT: s_add_i32 s0, s0, s3
1494 ; GFX8-NEXT: s_max_i32 s3, s1, 0
1495 ; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6
1496 ; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3
1497 ; GFX8-NEXT: s_max_i32 s4, s6, s4
1498 ; GFX8-NEXT: s_min_i32 s3, s4, s3
1499 ; GFX8-NEXT: s_min_i32 s4, s2, 0
1500 ; GFX8-NEXT: s_add_i32 s1, s1, s3
1501 ; GFX8-NEXT: s_max_i32 s3, s2, 0
1502 ; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4
1503 ; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3
1504 ; GFX8-NEXT: s_max_i32 s4, s4, s5
1505 ; GFX8-NEXT: s_min_i32 s3, s4, s3
1506 ; GFX8-NEXT: s_add_i32 s2, s2, s3
1507 ; GFX8-NEXT: ; return to shader part epilog
1509 ; GFX9-LABEL: s_saddsat_v3i32:
1511 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
1512 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1513 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
1514 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
1515 ; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
1516 ; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
1517 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1518 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1519 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1520 ; GFX9-NEXT: ; return to shader part epilog
1522 ; GFX10PLUS-LABEL: s_saddsat_v3i32:
1523 ; GFX10PLUS: ; %bb.0:
1524 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s3 clamp
1525 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s4 clamp
1526 ; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s5 clamp
1527 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1528 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1529 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1530 ; GFX10PLUS-NEXT: ; return to shader part epilog
1531 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1532 ret <3 x i32> %result
; Signed saturating add of <4 x i32> with plain (non-inreg) operands: %lhs in
; v0-v3 and %rhs in v4-v7 in the expected output. For the GFX6 and GFX8
; prefixes the first three elements take their saturation bounds from s5/s4
; (loaded with s_brev_b32), whereas the fourth element's subtractions use the
; literals 0x80000000 and 0x7fffffff directly. The GFX9 and GFX10PLUS prefixes
; expect one clamped add per element.
1535 define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1536 ; GFX6-LABEL: v_saddsat_v4i32:
1538 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1539 ; GFX6-NEXT: s_brev_b32 s5, 1
1540 ; GFX6-NEXT: v_min_i32_e32 v9, 0, v0
1541 ; GFX6-NEXT: s_brev_b32 s4, -2
1542 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0
1543 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9
1544 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8
1545 ; GFX6-NEXT: v_max_i32_e32 v4, v9, v4
1546 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8
1547 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1
1548 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
1549 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v1
1550 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
1551 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
1552 ; GFX6-NEXT: v_max_i32_e32 v5, v8, v5
1553 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4
1554 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v2
1555 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
1556 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v2
1557 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
1558 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
1559 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6
1560 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4
1561 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v3
1562 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
1563 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v3
1564 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5
1565 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
1566 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
1567 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4
1568 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
1569 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1571 ; GFX8-LABEL: v_saddsat_v4i32:
1573 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574 ; GFX8-NEXT: s_brev_b32 s5, 1
1575 ; GFX8-NEXT: v_min_i32_e32 v9, 0, v0
1576 ; GFX8-NEXT: s_brev_b32 s4, -2
1577 ; GFX8-NEXT: v_max_i32_e32 v8, 0, v0
1578 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9
1579 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s4, v8
1580 ; GFX8-NEXT: v_max_i32_e32 v4, v9, v4
1581 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8
1582 ; GFX8-NEXT: v_min_i32_e32 v8, 0, v1
1583 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
1584 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v1
1585 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s5, v8
1586 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4
1587 ; GFX8-NEXT: v_max_i32_e32 v5, v8, v5
1588 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4
1589 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v2
1590 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4
1591 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v2
1592 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5
1593 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4
1594 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6
1595 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4
1596 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v3
1597 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
1598 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v3
1599 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5
1600 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
1601 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7
1602 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4
1603 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
1604 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1606 ; GFX9-LABEL: v_saddsat_v4i32:
1608 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1609 ; GFX9-NEXT: v_add_i32 v0, v0, v4 clamp
1610 ; GFX9-NEXT: v_add_i32 v1, v1, v5 clamp
1611 ; GFX9-NEXT: v_add_i32 v2, v2, v6 clamp
1612 ; GFX9-NEXT: v_add_i32 v3, v3, v7 clamp
1613 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1615 ; GFX10PLUS-LABEL: v_saddsat_v4i32:
1616 ; GFX10PLUS: ; %bb.0:
1617 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1618 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v4 clamp
1619 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v5 clamp
1620 ; GFX10PLUS-NEXT: v_add_nc_i32 v2, v2, v6 clamp
1621 ; GFX10PLUS-NEXT: v_add_nc_i32 v3, v3, v7 clamp
1622 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1623 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1624 ret <4 x i32> %result
; s_saddsat_v4i32: the same <4 x i32> @llvm.sadd.sat.v4i32 call as the v_
; variant, but as an amdgpu_ps function whose operands are both marked inreg.
; The checks show lhs arriving in s0-s3, rhs in s4-s7, and results leaving in
; s0-s3.
;
; Expected code per check prefix:
;   GFX6 / GFX8: fully scalar expansion per element -- s_min_i32/s_max_i32 of
;                the lhs element against 0, s_sub_i32 from the 0x80000000 /
;                0x7fffffff literals, an s_max_i32/s_min_i32 clamp of the rhs
;                element, then s_add_i32.
;   GFX9:        rhs is copied into v0-v3 with v_mov_b32, each element is
;                added with `v_add_i32 ... clamp` (lhs read straight from the
;                SGPR), and v_readfirstlane_b32 moves the results back to
;                s0-s3.
;   GFX10PLUS:   `v_add_nc_i32 ... clamp` takes both SGPR operands directly,
;                followed by the same v_readfirstlane_b32 copies.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
1627 define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1628 ; GFX6-LABEL: s_saddsat_v4i32:
1630 ; GFX6-NEXT: s_min_i32 s9, s0, 0
1631 ; GFX6-NEXT: s_max_i32 s8, s0, 0
1632 ; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9
1633 ; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8
1634 ; GFX6-NEXT: s_max_i32 s4, s9, s4
1635 ; GFX6-NEXT: s_min_i32 s4, s4, s8
1636 ; GFX6-NEXT: s_min_i32 s8, s1, 0
1637 ; GFX6-NEXT: s_add_i32 s0, s0, s4
1638 ; GFX6-NEXT: s_max_i32 s4, s1, 0
1639 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8
1640 ; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1641 ; GFX6-NEXT: s_max_i32 s5, s8, s5
1642 ; GFX6-NEXT: s_min_i32 s4, s5, s4
1643 ; GFX6-NEXT: s_min_i32 s5, s2, 0
1644 ; GFX6-NEXT: s_add_i32 s1, s1, s4
1645 ; GFX6-NEXT: s_max_i32 s4, s2, 0
1646 ; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5
1647 ; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1648 ; GFX6-NEXT: s_max_i32 s5, s5, s6
1649 ; GFX6-NEXT: s_min_i32 s4, s5, s4
1650 ; GFX6-NEXT: s_min_i32 s5, s3, 0
1651 ; GFX6-NEXT: s_add_i32 s2, s2, s4
1652 ; GFX6-NEXT: s_max_i32 s4, s3, 0
1653 ; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5
1654 ; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1655 ; GFX6-NEXT: s_max_i32 s5, s5, s7
1656 ; GFX6-NEXT: s_min_i32 s4, s5, s4
1657 ; GFX6-NEXT: s_add_i32 s3, s3, s4
1658 ; GFX6-NEXT: ; return to shader part epilog
1660 ; GFX8-LABEL: s_saddsat_v4i32:
1662 ; GFX8-NEXT: s_min_i32 s9, s0, 0
1663 ; GFX8-NEXT: s_max_i32 s8, s0, 0
1664 ; GFX8-NEXT: s_sub_i32 s9, 0x80000000, s9
1665 ; GFX8-NEXT: s_sub_i32 s8, 0x7fffffff, s8
1666 ; GFX8-NEXT: s_max_i32 s4, s9, s4
1667 ; GFX8-NEXT: s_min_i32 s4, s4, s8
1668 ; GFX8-NEXT: s_min_i32 s8, s1, 0
1669 ; GFX8-NEXT: s_add_i32 s0, s0, s4
1670 ; GFX8-NEXT: s_max_i32 s4, s1, 0
1671 ; GFX8-NEXT: s_sub_i32 s8, 0x80000000, s8
1672 ; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1673 ; GFX8-NEXT: s_max_i32 s5, s8, s5
1674 ; GFX8-NEXT: s_min_i32 s4, s5, s4
1675 ; GFX8-NEXT: s_min_i32 s5, s2, 0
1676 ; GFX8-NEXT: s_add_i32 s1, s1, s4
1677 ; GFX8-NEXT: s_max_i32 s4, s2, 0
1678 ; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5
1679 ; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1680 ; GFX8-NEXT: s_max_i32 s5, s5, s6
1681 ; GFX8-NEXT: s_min_i32 s4, s5, s4
1682 ; GFX8-NEXT: s_min_i32 s5, s3, 0
1683 ; GFX8-NEXT: s_add_i32 s2, s2, s4
1684 ; GFX8-NEXT: s_max_i32 s4, s3, 0
1685 ; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5
1686 ; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4
1687 ; GFX8-NEXT: s_max_i32 s5, s5, s7
1688 ; GFX8-NEXT: s_min_i32 s4, s5, s4
1689 ; GFX8-NEXT: s_add_i32 s3, s3, s4
1690 ; GFX8-NEXT: ; return to shader part epilog
1692 ; GFX9-LABEL: s_saddsat_v4i32:
1694 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1695 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1696 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1697 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1698 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
1699 ; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
1700 ; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
1701 ; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp
1702 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1703 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1704 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1705 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1706 ; GFX9-NEXT: ; return to shader part epilog
1708 ; GFX10PLUS-LABEL: s_saddsat_v4i32:
1709 ; GFX10PLUS: ; %bb.0:
1710 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s4 clamp
1711 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s5 clamp
1712 ; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s6 clamp
1713 ; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s7 clamp
1714 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1715 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1716 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1717 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1718 ; GFX10PLUS-NEXT: ; return to shader part epilog
1719 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1720 ret <4 x i32> %result
; v_saddsat_v5i32: signed saturating add of two <5 x i32> vectors via
; @llvm.sadd.sat.v5i32, using the default calling convention. The checks show
; lhs in v0-v4 and rhs in v5-v9.
;
; Expected code per check prefix:
;   GFX6 / GFX8: per-element min/max/sub/clamp/add expansion. The bound
;                operands differ by element: elements 0-2 read s5/s4 (set by
;                s_brev_b32), element 3 uses the 0x80000000 literal plus v11
;                (set by `v_bfrev_b32_e32 v11, -2`), and element 4 uses both
;                literals. NOTE(review): the brev/bfrev forms presumably
;                encode the same 0x80000000 / 0x7fffffff constants; confirm
;                against the ISA docs.
;   GFX9:        one `v_add_i32 ... clamp` per element.
;   GFX10PLUS:   one `v_add_nc_i32 ... clamp` per element.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
1723 define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1724 ; GFX6-LABEL: v_saddsat_v5i32:
1726 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1727 ; GFX6-NEXT: s_brev_b32 s5, 1
1728 ; GFX6-NEXT: v_min_i32_e32 v12, 0, v0
1729 ; GFX6-NEXT: s_brev_b32 s4, -2
1730 ; GFX6-NEXT: v_max_i32_e32 v10, 0, v0
1731 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12
1732 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s4, v10
1733 ; GFX6-NEXT: v_max_i32_e32 v5, v12, v5
1734 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10
1735 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v1
1736 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5
1737 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1
1738 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10
1739 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
1740 ; GFX6-NEXT: v_max_i32_e32 v6, v10, v6
1741 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5
1742 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2
1743 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
1744 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2
1745 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6
1746 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
1747 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7
1748 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5
1749 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3
1750 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2
1751 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
1752 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3
1753 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6
1754 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5
1755 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8
1756 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5
1757 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4
1758 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
1759 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v4
1760 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6
1761 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
1762 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
1763 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5
1764 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
1765 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1767 ; GFX8-LABEL: v_saddsat_v5i32:
1769 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1770 ; GFX8-NEXT: s_brev_b32 s5, 1
1771 ; GFX8-NEXT: v_min_i32_e32 v12, 0, v0
1772 ; GFX8-NEXT: s_brev_b32 s4, -2
1773 ; GFX8-NEXT: v_max_i32_e32 v10, 0, v0
1774 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12
1775 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s4, v10
1776 ; GFX8-NEXT: v_max_i32_e32 v5, v12, v5
1777 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10
1778 ; GFX8-NEXT: v_min_i32_e32 v10, 0, v1
1779 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
1780 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v1
1781 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s5, v10
1782 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5
1783 ; GFX8-NEXT: v_max_i32_e32 v6, v10, v6
1784 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5
1785 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v2
1786 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5
1787 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v2
1788 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6
1789 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5
1790 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7
1791 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5
1792 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3
1793 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2
1794 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
1795 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3
1796 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6
1797 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5
1798 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8
1799 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5
1800 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4
1801 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
1802 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v4
1803 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6
1804 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v5
1805 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v9
1806 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5
1807 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
1808 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1810 ; GFX9-LABEL: v_saddsat_v5i32:
1812 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1813 ; GFX9-NEXT: v_add_i32 v0, v0, v5 clamp
1814 ; GFX9-NEXT: v_add_i32 v1, v1, v6 clamp
1815 ; GFX9-NEXT: v_add_i32 v2, v2, v7 clamp
1816 ; GFX9-NEXT: v_add_i32 v3, v3, v8 clamp
1817 ; GFX9-NEXT: v_add_i32 v4, v4, v9 clamp
1818 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1820 ; GFX10PLUS-LABEL: v_saddsat_v5i32:
1821 ; GFX10PLUS: ; %bb.0:
1822 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1823 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v5 clamp
1824 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v6 clamp
1825 ; GFX10PLUS-NEXT: v_add_nc_i32 v2, v2, v7 clamp
1826 ; GFX10PLUS-NEXT: v_add_nc_i32 v3, v3, v8 clamp
1827 ; GFX10PLUS-NEXT: v_add_nc_i32 v4, v4, v9 clamp
1828 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1829 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1830 ret <5 x i32> %result
; s_saddsat_v5i32: the same <5 x i32> @llvm.sadd.sat.v5i32 call as the v_
; variant, but as an amdgpu_ps function whose operands are both marked inreg.
; The checks show lhs in s0-s4, rhs in s5-s9, and results leaving in s0-s4.
;
; Expected code per check prefix:
;   GFX6 / GFX8: fully scalar expansion per element -- s_min_i32/s_max_i32 of
;                the lhs element against 0, s_sub_i32 from the 0x80000000 /
;                0x7fffffff literals, an s_max_i32/s_min_i32 clamp of the rhs
;                element, then s_add_i32.
;   GFX9:        rhs is copied into v0-v4 with v_mov_b32, each element is
;                added with `v_add_i32 ... clamp`, and v_readfirstlane_b32
;                moves the results back to s0-s4.
;   GFX10PLUS:   `v_add_nc_i32 ... clamp` takes both SGPR operands directly,
;                followed by the same v_readfirstlane_b32 copies.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
1833 define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1834 ; GFX6-LABEL: s_saddsat_v5i32:
1836 ; GFX6-NEXT: s_min_i32 s11, s0, 0
1837 ; GFX6-NEXT: s_max_i32 s10, s0, 0
1838 ; GFX6-NEXT: s_sub_i32 s11, 0x80000000, s11
1839 ; GFX6-NEXT: s_sub_i32 s10, 0x7fffffff, s10
1840 ; GFX6-NEXT: s_max_i32 s5, s11, s5
1841 ; GFX6-NEXT: s_min_i32 s5, s5, s10
1842 ; GFX6-NEXT: s_min_i32 s10, s1, 0
1843 ; GFX6-NEXT: s_add_i32 s0, s0, s5
1844 ; GFX6-NEXT: s_max_i32 s5, s1, 0
1845 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10
1846 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1847 ; GFX6-NEXT: s_max_i32 s6, s10, s6
1848 ; GFX6-NEXT: s_min_i32 s5, s6, s5
1849 ; GFX6-NEXT: s_min_i32 s6, s2, 0
1850 ; GFX6-NEXT: s_add_i32 s1, s1, s5
1851 ; GFX6-NEXT: s_max_i32 s5, s2, 0
1852 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
1853 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1854 ; GFX6-NEXT: s_max_i32 s6, s6, s7
1855 ; GFX6-NEXT: s_min_i32 s5, s6, s5
1856 ; GFX6-NEXT: s_min_i32 s6, s3, 0
1857 ; GFX6-NEXT: s_add_i32 s2, s2, s5
1858 ; GFX6-NEXT: s_max_i32 s5, s3, 0
1859 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
1860 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1861 ; GFX6-NEXT: s_max_i32 s6, s6, s8
1862 ; GFX6-NEXT: s_min_i32 s5, s6, s5
1863 ; GFX6-NEXT: s_min_i32 s6, s4, 0
1864 ; GFX6-NEXT: s_add_i32 s3, s3, s5
1865 ; GFX6-NEXT: s_max_i32 s5, s4, 0
1866 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
1867 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1868 ; GFX6-NEXT: s_max_i32 s6, s6, s9
1869 ; GFX6-NEXT: s_min_i32 s5, s6, s5
1870 ; GFX6-NEXT: s_add_i32 s4, s4, s5
1871 ; GFX6-NEXT: ; return to shader part epilog
1873 ; GFX8-LABEL: s_saddsat_v5i32:
1875 ; GFX8-NEXT: s_min_i32 s11, s0, 0
1876 ; GFX8-NEXT: s_max_i32 s10, s0, 0
1877 ; GFX8-NEXT: s_sub_i32 s11, 0x80000000, s11
1878 ; GFX8-NEXT: s_sub_i32 s10, 0x7fffffff, s10
1879 ; GFX8-NEXT: s_max_i32 s5, s11, s5
1880 ; GFX8-NEXT: s_min_i32 s5, s5, s10
1881 ; GFX8-NEXT: s_min_i32 s10, s1, 0
1882 ; GFX8-NEXT: s_add_i32 s0, s0, s5
1883 ; GFX8-NEXT: s_max_i32 s5, s1, 0
1884 ; GFX8-NEXT: s_sub_i32 s10, 0x80000000, s10
1885 ; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1886 ; GFX8-NEXT: s_max_i32 s6, s10, s6
1887 ; GFX8-NEXT: s_min_i32 s5, s6, s5
1888 ; GFX8-NEXT: s_min_i32 s6, s2, 0
1889 ; GFX8-NEXT: s_add_i32 s1, s1, s5
1890 ; GFX8-NEXT: s_max_i32 s5, s2, 0
1891 ; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6
1892 ; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1893 ; GFX8-NEXT: s_max_i32 s6, s6, s7
1894 ; GFX8-NEXT: s_min_i32 s5, s6, s5
1895 ; GFX8-NEXT: s_min_i32 s6, s3, 0
1896 ; GFX8-NEXT: s_add_i32 s2, s2, s5
1897 ; GFX8-NEXT: s_max_i32 s5, s3, 0
1898 ; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6
1899 ; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1900 ; GFX8-NEXT: s_max_i32 s6, s6, s8
1901 ; GFX8-NEXT: s_min_i32 s5, s6, s5
1902 ; GFX8-NEXT: s_min_i32 s6, s4, 0
1903 ; GFX8-NEXT: s_add_i32 s3, s3, s5
1904 ; GFX8-NEXT: s_max_i32 s5, s4, 0
1905 ; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6
1906 ; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5
1907 ; GFX8-NEXT: s_max_i32 s6, s6, s9
1908 ; GFX8-NEXT: s_min_i32 s5, s6, s5
1909 ; GFX8-NEXT: s_add_i32 s4, s4, s5
1910 ; GFX8-NEXT: ; return to shader part epilog
1912 ; GFX9-LABEL: s_saddsat_v5i32:
1914 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
1915 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1916 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
1917 ; GFX9-NEXT: v_mov_b32_e32 v3, s8
1918 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
1919 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
1920 ; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
1921 ; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
1922 ; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp
1923 ; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp
1924 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1925 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1926 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1927 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1928 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1929 ; GFX9-NEXT: ; return to shader part epilog
1931 ; GFX10PLUS-LABEL: s_saddsat_v5i32:
1932 ; GFX10PLUS: ; %bb.0:
1933 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s5 clamp
1934 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s6 clamp
1935 ; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s7 clamp
1936 ; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s8 clamp
1937 ; GFX10PLUS-NEXT: v_add_nc_i32 v4, s4, s9 clamp
1938 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1939 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1940 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1941 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1942 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
1943 ; GFX10PLUS-NEXT: ; return to shader part epilog
1944 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1945 ret <5 x i32> %result
; v_saddsat_v16i32: signed saturating add of two <16 x i32> vectors via
; @llvm.sadd.sat.v16i32, using the default calling convention. The checks show
; lhs in v0-v15 and the first fifteen rhs elements in v16-v30. The operand for
; the final add (into v15) is instead loaded from the address in s32 --
; presumably the sixteenth rhs element is passed on the stack; confirm against
; the calling convention.
;
; Expected code per check prefix:
;   GFX6 / GFX8: per-element min/max/sub/clamp/add expansion. Elements 0-2
;                take the bounds from s4/s5 (s_brev_b32); from element 3 on
;                they come from v16/v18 (v_bfrev_b32). The stack operand is
;                fetched with buffer_load_dword into v19 part-way through and
;                only consumed after `s_waitcnt vmcnt(0)`.
;   GFX9:        one `v_add_i32 ... clamp` per element; the stack operand is
;                loaded into v16 after element 0 has consumed that register.
;   GFX10:       buffer_load_dword into v31, then `v_add_nc_i32 ... clamp`
;                per element.
;   GFX11:       same shape as GFX10 but the load is `scratch_load_b32`, which
;                is why this function uses separate GFX10/GFX11 prefixes
;                instead of the shared GFX10PLUS one.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
1948 define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1949 ; GFX6-LABEL: v_saddsat_v16i32:
1951 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1952 ; GFX6-NEXT: s_brev_b32 s4, 1
1953 ; GFX6-NEXT: v_min_i32_e32 v31, 0, v0
1954 ; GFX6-NEXT: v_sub_i32_e32 v31, vcc, s4, v31
1955 ; GFX6-NEXT: v_max_i32_e32 v16, v31, v16
1956 ; GFX6-NEXT: s_brev_b32 s5, -2
1957 ; GFX6-NEXT: v_max_i32_e32 v31, 0, v0
1958 ; GFX6-NEXT: v_sub_i32_e32 v31, vcc, s5, v31
1959 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v31
1960 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16
1961 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v1
1962 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16
1963 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v17
1964 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v1
1965 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17
1966 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
1967 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16
1968 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v2
1969 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16
1970 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v2
1971 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v18
1972 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17
1973 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
1974 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16
1975 ; GFX6-NEXT: v_bfrev_b32_e32 v16, 1
1976 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v3
1977 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
1978 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
1979 ; GFX6-NEXT: v_bfrev_b32_e32 v18, -2
1980 ; GFX6-NEXT: v_max_i32_e32 v19, 0, v3
1981 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
1982 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
1983 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17
1984 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v4
1985 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
1986 ; GFX6-NEXT: v_max_i32_e32 v19, 0, v4
1987 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v20
1988 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
1989 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
1990 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v17
1991 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v5
1992 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
1993 ; GFX6-NEXT: v_max_i32_e32 v19, 0, v5
1994 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v21
1995 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
1996 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
1997 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v17
1998 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v6
1999 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
2000 ; GFX6-NEXT: v_max_i32_e32 v19, 0, v6
2001 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22
2002 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19
2003 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19
2004 ; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32
2005 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17
2006 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v7
2007 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17
2008 ; GFX6-NEXT: v_max_i32_e32 v20, 0, v7
2009 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23
2010 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v18, v20
2011 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v20
2012 ; GFX6-NEXT: v_min_i32_e32 v20, 0, v8
2013 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17
2014 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v8
2015 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20
2016 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2017 ; GFX6-NEXT: v_max_i32_e32 v20, v20, v24
2018 ; GFX6-NEXT: v_min_i32_e32 v17, v20, v17
2019 ; GFX6-NEXT: v_min_i32_e32 v20, 0, v9
2020 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17
2021 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v9
2022 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20
2023 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2024 ; GFX6-NEXT: v_max_i32_e32 v20, v20, v25
2025 ; GFX6-NEXT: v_min_i32_e32 v17, v20, v17
2026 ; GFX6-NEXT: v_min_i32_e32 v20, 0, v10
2027 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17
2028 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v10
2029 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20
2030 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2031 ; GFX6-NEXT: v_max_i32_e32 v20, v20, v26
2032 ; GFX6-NEXT: v_min_i32_e32 v17, v20, v17
2033 ; GFX6-NEXT: v_min_i32_e32 v20, 0, v11
2034 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17
2035 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v11
2036 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20
2037 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2038 ; GFX6-NEXT: v_max_i32_e32 v20, v20, v27
2039 ; GFX6-NEXT: v_min_i32_e32 v17, v20, v17
2040 ; GFX6-NEXT: v_min_i32_e32 v20, 0, v12
2041 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17
2042 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v12
2043 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20
2044 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2045 ; GFX6-NEXT: v_max_i32_e32 v20, v20, v28
2046 ; GFX6-NEXT: v_min_i32_e32 v17, v20, v17
2047 ; GFX6-NEXT: v_min_i32_e32 v20, 0, v13
2048 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17
2049 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v13
2050 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20
2051 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2052 ; GFX6-NEXT: v_max_i32_e32 v20, v20, v29
2053 ; GFX6-NEXT: v_min_i32_e32 v17, v20, v17
2054 ; GFX6-NEXT: v_min_i32_e32 v20, 0, v14
2055 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17
2056 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v14
2057 ; GFX6-NEXT: v_sub_i32_e32 v20, vcc, v16, v20
2058 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2059 ; GFX6-NEXT: v_max_i32_e32 v20, v20, v30
2060 ; GFX6-NEXT: v_min_i32_e32 v17, v20, v17
2061 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17
2062 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v15
2063 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17
2064 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v15
2065 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
2066 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2067 ; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
2068 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17
2069 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16
2070 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2072 ; GFX8-LABEL: v_saddsat_v16i32:
2074 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2075 ; GFX8-NEXT: s_brev_b32 s4, 1
2076 ; GFX8-NEXT: v_min_i32_e32 v31, 0, v0
2077 ; GFX8-NEXT: v_sub_u32_e32 v31, vcc, s4, v31
2078 ; GFX8-NEXT: v_max_i32_e32 v16, v31, v16
2079 ; GFX8-NEXT: s_brev_b32 s5, -2
2080 ; GFX8-NEXT: v_max_i32_e32 v31, 0, v0
2081 ; GFX8-NEXT: v_sub_u32_e32 v31, vcc, s5, v31
2082 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v31
2083 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16
2084 ; GFX8-NEXT: v_min_i32_e32 v16, 0, v1
2085 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16
2086 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v17
2087 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v1
2088 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17
2089 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
2090 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v16
2091 ; GFX8-NEXT: v_min_i32_e32 v16, 0, v2
2092 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16
2093 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v2
2094 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v18
2095 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17
2096 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
2097 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16
2098 ; GFX8-NEXT: v_bfrev_b32_e32 v16, 1
2099 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v3
2100 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
2101 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19
2102 ; GFX8-NEXT: v_bfrev_b32_e32 v18, -2
2103 ; GFX8-NEXT: v_max_i32_e32 v19, 0, v3
2104 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
2105 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2106 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17
2107 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v4
2108 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
2109 ; GFX8-NEXT: v_max_i32_e32 v19, 0, v4
2110 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v20
2111 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
2112 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2113 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17
2114 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v5
2115 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
2116 ; GFX8-NEXT: v_max_i32_e32 v19, 0, v5
2117 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v21
2118 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
2119 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2120 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17
2121 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v6
2122 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
2123 ; GFX8-NEXT: v_max_i32_e32 v19, 0, v6
2124 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22
2125 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19
2126 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19
2127 ; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32
2128 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17
2129 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v7
2130 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17
2131 ; GFX8-NEXT: v_max_i32_e32 v20, 0, v7
2132 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23
2133 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v18, v20
2134 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v20
2135 ; GFX8-NEXT: v_min_i32_e32 v20, 0, v8
2136 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17
2137 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v8
2138 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20
2139 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2140 ; GFX8-NEXT: v_max_i32_e32 v20, v20, v24
2141 ; GFX8-NEXT: v_min_i32_e32 v17, v20, v17
2142 ; GFX8-NEXT: v_min_i32_e32 v20, 0, v9
2143 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17
2144 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v9
2145 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20
2146 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2147 ; GFX8-NEXT: v_max_i32_e32 v20, v20, v25
2148 ; GFX8-NEXT: v_min_i32_e32 v17, v20, v17
2149 ; GFX8-NEXT: v_min_i32_e32 v20, 0, v10
2150 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17
2151 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v10
2152 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20
2153 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2154 ; GFX8-NEXT: v_max_i32_e32 v20, v20, v26
2155 ; GFX8-NEXT: v_min_i32_e32 v17, v20, v17
2156 ; GFX8-NEXT: v_min_i32_e32 v20, 0, v11
2157 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17
2158 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v11
2159 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20
2160 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2161 ; GFX8-NEXT: v_max_i32_e32 v20, v20, v27
2162 ; GFX8-NEXT: v_min_i32_e32 v17, v20, v17
2163 ; GFX8-NEXT: v_min_i32_e32 v20, 0, v12
2164 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17
2165 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v12
2166 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20
2167 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2168 ; GFX8-NEXT: v_max_i32_e32 v20, v20, v28
2169 ; GFX8-NEXT: v_min_i32_e32 v17, v20, v17
2170 ; GFX8-NEXT: v_min_i32_e32 v20, 0, v13
2171 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17
2172 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v13
2173 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20
2174 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2175 ; GFX8-NEXT: v_max_i32_e32 v20, v20, v29
2176 ; GFX8-NEXT: v_min_i32_e32 v17, v20, v17
2177 ; GFX8-NEXT: v_min_i32_e32 v20, 0, v14
2178 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17
2179 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v14
2180 ; GFX8-NEXT: v_sub_u32_e32 v20, vcc, v16, v20
2181 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2182 ; GFX8-NEXT: v_max_i32_e32 v20, v20, v30
2183 ; GFX8-NEXT: v_min_i32_e32 v17, v20, v17
2184 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17
2185 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v15
2186 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17
2187 ; GFX8-NEXT: v_min_i32_e32 v18, 0, v15
2188 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v18
2189 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2190 ; GFX8-NEXT: v_max_i32_e32 v16, v16, v19
2191 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17
2192 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16
2193 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2195 ; GFX9-LABEL: v_saddsat_v16i32:
2197 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2198 ; GFX9-NEXT: v_add_i32 v0, v0, v16 clamp
2199 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
2200 ; GFX9-NEXT: v_add_i32 v1, v1, v17 clamp
2201 ; GFX9-NEXT: v_add_i32 v2, v2, v18 clamp
2202 ; GFX9-NEXT: v_add_i32 v3, v3, v19 clamp
2203 ; GFX9-NEXT: v_add_i32 v4, v4, v20 clamp
2204 ; GFX9-NEXT: v_add_i32 v5, v5, v21 clamp
2205 ; GFX9-NEXT: v_add_i32 v6, v6, v22 clamp
2206 ; GFX9-NEXT: v_add_i32 v7, v7, v23 clamp
2207 ; GFX9-NEXT: v_add_i32 v8, v8, v24 clamp
2208 ; GFX9-NEXT: v_add_i32 v9, v9, v25 clamp
2209 ; GFX9-NEXT: v_add_i32 v10, v10, v26 clamp
2210 ; GFX9-NEXT: v_add_i32 v11, v11, v27 clamp
2211 ; GFX9-NEXT: v_add_i32 v12, v12, v28 clamp
2212 ; GFX9-NEXT: v_add_i32 v13, v13, v29 clamp
2213 ; GFX9-NEXT: v_add_i32 v14, v14, v30 clamp
2214 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2215 ; GFX9-NEXT: v_add_i32 v15, v15, v16 clamp
2216 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2218 ; GFX10-LABEL: v_saddsat_v16i32:
2220 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2221 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
2222 ; GFX10-NEXT: v_add_nc_i32 v0, v0, v16 clamp
2223 ; GFX10-NEXT: v_add_nc_i32 v1, v1, v17 clamp
2224 ; GFX10-NEXT: v_add_nc_i32 v2, v2, v18 clamp
2225 ; GFX10-NEXT: v_add_nc_i32 v3, v3, v19 clamp
2226 ; GFX10-NEXT: v_add_nc_i32 v4, v4, v20 clamp
2227 ; GFX10-NEXT: v_add_nc_i32 v5, v5, v21 clamp
2228 ; GFX10-NEXT: v_add_nc_i32 v6, v6, v22 clamp
2229 ; GFX10-NEXT: v_add_nc_i32 v7, v7, v23 clamp
2230 ; GFX10-NEXT: v_add_nc_i32 v8, v8, v24 clamp
2231 ; GFX10-NEXT: v_add_nc_i32 v9, v9, v25 clamp
2232 ; GFX10-NEXT: v_add_nc_i32 v10, v10, v26 clamp
2233 ; GFX10-NEXT: v_add_nc_i32 v11, v11, v27 clamp
2234 ; GFX10-NEXT: v_add_nc_i32 v12, v12, v28 clamp
2235 ; GFX10-NEXT: v_add_nc_i32 v13, v13, v29 clamp
2236 ; GFX10-NEXT: v_add_nc_i32 v14, v14, v30 clamp
2237 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2238 ; GFX10-NEXT: v_add_nc_i32 v15, v15, v31 clamp
2239 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2241 ; GFX11-LABEL: v_saddsat_v16i32:
2243 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2244 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
2245 ; GFX11-NEXT: v_add_nc_i32 v0, v0, v16 clamp
2246 ; GFX11-NEXT: v_add_nc_i32 v1, v1, v17 clamp
2247 ; GFX11-NEXT: v_add_nc_i32 v2, v2, v18 clamp
2248 ; GFX11-NEXT: v_add_nc_i32 v3, v3, v19 clamp
2249 ; GFX11-NEXT: v_add_nc_i32 v4, v4, v20 clamp
2250 ; GFX11-NEXT: v_add_nc_i32 v5, v5, v21 clamp
2251 ; GFX11-NEXT: v_add_nc_i32 v6, v6, v22 clamp
2252 ; GFX11-NEXT: v_add_nc_i32 v7, v7, v23 clamp
2253 ; GFX11-NEXT: v_add_nc_i32 v8, v8, v24 clamp
2254 ; GFX11-NEXT: v_add_nc_i32 v9, v9, v25 clamp
2255 ; GFX11-NEXT: v_add_nc_i32 v10, v10, v26 clamp
2256 ; GFX11-NEXT: v_add_nc_i32 v11, v11, v27 clamp
2257 ; GFX11-NEXT: v_add_nc_i32 v12, v12, v28 clamp
2258 ; GFX11-NEXT: v_add_nc_i32 v13, v13, v29 clamp
2259 ; GFX11-NEXT: v_add_nc_i32 v14, v14, v30 clamp
2260 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2261 ; GFX11-NEXT: v_add_nc_i32 v15, v15, v31 clamp
2262 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2263 %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2264 ret <16 x i32> %result
2267 define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
2268 ; GFX6-LABEL: s_saddsat_v16i32:
2270 ; GFX6-NEXT: s_min_i32 s33, s0, 0
2271 ; GFX6-NEXT: s_max_i32 s32, s0, 0
2272 ; GFX6-NEXT: s_sub_i32 s33, 0x80000000, s33
2273 ; GFX6-NEXT: s_sub_i32 s32, 0x7fffffff, s32
2274 ; GFX6-NEXT: s_max_i32 s16, s33, s16
2275 ; GFX6-NEXT: s_min_i32 s16, s16, s32
2276 ; GFX6-NEXT: s_min_i32 s32, s1, 0
2277 ; GFX6-NEXT: s_add_i32 s0, s0, s16
2278 ; GFX6-NEXT: s_max_i32 s16, s1, 0
2279 ; GFX6-NEXT: s_sub_i32 s32, 0x80000000, s32
2280 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2281 ; GFX6-NEXT: s_max_i32 s17, s32, s17
2282 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2283 ; GFX6-NEXT: s_min_i32 s17, s2, 0
2284 ; GFX6-NEXT: s_add_i32 s1, s1, s16
2285 ; GFX6-NEXT: s_max_i32 s16, s2, 0
2286 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2287 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2288 ; GFX6-NEXT: s_max_i32 s17, s17, s18
2289 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2290 ; GFX6-NEXT: s_min_i32 s17, s3, 0
2291 ; GFX6-NEXT: s_add_i32 s2, s2, s16
2292 ; GFX6-NEXT: s_max_i32 s16, s3, 0
2293 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2294 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2295 ; GFX6-NEXT: s_max_i32 s17, s17, s19
2296 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2297 ; GFX6-NEXT: s_min_i32 s17, s4, 0
2298 ; GFX6-NEXT: s_add_i32 s3, s3, s16
2299 ; GFX6-NEXT: s_max_i32 s16, s4, 0
2300 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2301 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2302 ; GFX6-NEXT: s_max_i32 s17, s17, s20
2303 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2304 ; GFX6-NEXT: s_min_i32 s17, s5, 0
2305 ; GFX6-NEXT: s_add_i32 s4, s4, s16
2306 ; GFX6-NEXT: s_max_i32 s16, s5, 0
2307 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2308 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2309 ; GFX6-NEXT: s_max_i32 s17, s17, s21
2310 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2311 ; GFX6-NEXT: s_min_i32 s17, s6, 0
2312 ; GFX6-NEXT: s_add_i32 s5, s5, s16
2313 ; GFX6-NEXT: s_max_i32 s16, s6, 0
2314 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2315 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2316 ; GFX6-NEXT: s_max_i32 s17, s17, s22
2317 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2318 ; GFX6-NEXT: s_min_i32 s17, s7, 0
2319 ; GFX6-NEXT: s_add_i32 s6, s6, s16
2320 ; GFX6-NEXT: s_max_i32 s16, s7, 0
2321 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2322 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2323 ; GFX6-NEXT: s_max_i32 s17, s17, s23
2324 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2325 ; GFX6-NEXT: s_min_i32 s17, s8, 0
2326 ; GFX6-NEXT: s_add_i32 s7, s7, s16
2327 ; GFX6-NEXT: s_max_i32 s16, s8, 0
2328 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2329 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2330 ; GFX6-NEXT: s_max_i32 s17, s17, s24
2331 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2332 ; GFX6-NEXT: s_min_i32 s17, s9, 0
2333 ; GFX6-NEXT: s_add_i32 s8, s8, s16
2334 ; GFX6-NEXT: s_max_i32 s16, s9, 0
2335 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2336 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2337 ; GFX6-NEXT: s_max_i32 s17, s17, s25
2338 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2339 ; GFX6-NEXT: s_min_i32 s17, s10, 0
2340 ; GFX6-NEXT: s_add_i32 s9, s9, s16
2341 ; GFX6-NEXT: s_max_i32 s16, s10, 0
2342 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2343 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2344 ; GFX6-NEXT: s_max_i32 s17, s17, s26
2345 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2346 ; GFX6-NEXT: s_min_i32 s17, s11, 0
2347 ; GFX6-NEXT: s_add_i32 s10, s10, s16
2348 ; GFX6-NEXT: s_max_i32 s16, s11, 0
2349 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2350 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2351 ; GFX6-NEXT: s_max_i32 s17, s17, s27
2352 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2353 ; GFX6-NEXT: s_min_i32 s17, s12, 0
2354 ; GFX6-NEXT: s_add_i32 s11, s11, s16
2355 ; GFX6-NEXT: s_max_i32 s16, s12, 0
2356 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2357 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2358 ; GFX6-NEXT: s_max_i32 s17, s17, s28
2359 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2360 ; GFX6-NEXT: s_min_i32 s17, s13, 0
2361 ; GFX6-NEXT: s_add_i32 s12, s12, s16
2362 ; GFX6-NEXT: s_max_i32 s16, s13, 0
2363 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2364 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2365 ; GFX6-NEXT: s_max_i32 s17, s17, s29
2366 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2367 ; GFX6-NEXT: s_min_i32 s17, s14, 0
2368 ; GFX6-NEXT: s_add_i32 s13, s13, s16
2369 ; GFX6-NEXT: s_max_i32 s16, s14, 0
2370 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2371 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2372 ; GFX6-NEXT: s_max_i32 s17, s17, s30
2373 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2374 ; GFX6-NEXT: s_min_i32 s17, s15, 0
2375 ; GFX6-NEXT: s_add_i32 s14, s14, s16
2376 ; GFX6-NEXT: s_max_i32 s16, s15, 0
2377 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
2378 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2379 ; GFX6-NEXT: s_max_i32 s17, s17, s31
2380 ; GFX6-NEXT: s_min_i32 s16, s17, s16
2381 ; GFX6-NEXT: s_add_i32 s15, s15, s16
2382 ; GFX6-NEXT: ; return to shader part epilog
2384 ; GFX8-LABEL: s_saddsat_v16i32:
2386 ; GFX8-NEXT: s_min_i32 s33, s0, 0
2387 ; GFX8-NEXT: s_max_i32 s32, s0, 0
2388 ; GFX8-NEXT: s_sub_i32 s33, 0x80000000, s33
2389 ; GFX8-NEXT: s_sub_i32 s32, 0x7fffffff, s32
2390 ; GFX8-NEXT: s_max_i32 s16, s33, s16
2391 ; GFX8-NEXT: s_min_i32 s16, s16, s32
2392 ; GFX8-NEXT: s_min_i32 s32, s1, 0
2393 ; GFX8-NEXT: s_add_i32 s0, s0, s16
2394 ; GFX8-NEXT: s_max_i32 s16, s1, 0
2395 ; GFX8-NEXT: s_sub_i32 s32, 0x80000000, s32
2396 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2397 ; GFX8-NEXT: s_max_i32 s17, s32, s17
2398 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2399 ; GFX8-NEXT: s_min_i32 s17, s2, 0
2400 ; GFX8-NEXT: s_add_i32 s1, s1, s16
2401 ; GFX8-NEXT: s_max_i32 s16, s2, 0
2402 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2403 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2404 ; GFX8-NEXT: s_max_i32 s17, s17, s18
2405 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2406 ; GFX8-NEXT: s_min_i32 s17, s3, 0
2407 ; GFX8-NEXT: s_add_i32 s2, s2, s16
2408 ; GFX8-NEXT: s_max_i32 s16, s3, 0
2409 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2410 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2411 ; GFX8-NEXT: s_max_i32 s17, s17, s19
2412 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2413 ; GFX8-NEXT: s_min_i32 s17, s4, 0
2414 ; GFX8-NEXT: s_add_i32 s3, s3, s16
2415 ; GFX8-NEXT: s_max_i32 s16, s4, 0
2416 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2417 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2418 ; GFX8-NEXT: s_max_i32 s17, s17, s20
2419 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2420 ; GFX8-NEXT: s_min_i32 s17, s5, 0
2421 ; GFX8-NEXT: s_add_i32 s4, s4, s16
2422 ; GFX8-NEXT: s_max_i32 s16, s5, 0
2423 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2424 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2425 ; GFX8-NEXT: s_max_i32 s17, s17, s21
2426 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2427 ; GFX8-NEXT: s_min_i32 s17, s6, 0
2428 ; GFX8-NEXT: s_add_i32 s5, s5, s16
2429 ; GFX8-NEXT: s_max_i32 s16, s6, 0
2430 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2431 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2432 ; GFX8-NEXT: s_max_i32 s17, s17, s22
2433 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2434 ; GFX8-NEXT: s_min_i32 s17, s7, 0
2435 ; GFX8-NEXT: s_add_i32 s6, s6, s16
2436 ; GFX8-NEXT: s_max_i32 s16, s7, 0
2437 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2438 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2439 ; GFX8-NEXT: s_max_i32 s17, s17, s23
2440 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2441 ; GFX8-NEXT: s_min_i32 s17, s8, 0
2442 ; GFX8-NEXT: s_add_i32 s7, s7, s16
2443 ; GFX8-NEXT: s_max_i32 s16, s8, 0
2444 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2445 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2446 ; GFX8-NEXT: s_max_i32 s17, s17, s24
2447 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2448 ; GFX8-NEXT: s_min_i32 s17, s9, 0
2449 ; GFX8-NEXT: s_add_i32 s8, s8, s16
2450 ; GFX8-NEXT: s_max_i32 s16, s9, 0
2451 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2452 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2453 ; GFX8-NEXT: s_max_i32 s17, s17, s25
2454 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2455 ; GFX8-NEXT: s_min_i32 s17, s10, 0
2456 ; GFX8-NEXT: s_add_i32 s9, s9, s16
2457 ; GFX8-NEXT: s_max_i32 s16, s10, 0
2458 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2459 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2460 ; GFX8-NEXT: s_max_i32 s17, s17, s26
2461 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2462 ; GFX8-NEXT: s_min_i32 s17, s11, 0
2463 ; GFX8-NEXT: s_add_i32 s10, s10, s16
2464 ; GFX8-NEXT: s_max_i32 s16, s11, 0
2465 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2466 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2467 ; GFX8-NEXT: s_max_i32 s17, s17, s27
2468 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2469 ; GFX8-NEXT: s_min_i32 s17, s12, 0
2470 ; GFX8-NEXT: s_add_i32 s11, s11, s16
2471 ; GFX8-NEXT: s_max_i32 s16, s12, 0
2472 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2473 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2474 ; GFX8-NEXT: s_max_i32 s17, s17, s28
2475 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2476 ; GFX8-NEXT: s_min_i32 s17, s13, 0
2477 ; GFX8-NEXT: s_add_i32 s12, s12, s16
2478 ; GFX8-NEXT: s_max_i32 s16, s13, 0
2479 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2480 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2481 ; GFX8-NEXT: s_max_i32 s17, s17, s29
2482 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2483 ; GFX8-NEXT: s_min_i32 s17, s14, 0
2484 ; GFX8-NEXT: s_add_i32 s13, s13, s16
2485 ; GFX8-NEXT: s_max_i32 s16, s14, 0
2486 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2487 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2488 ; GFX8-NEXT: s_max_i32 s17, s17, s30
2489 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2490 ; GFX8-NEXT: s_min_i32 s17, s15, 0
2491 ; GFX8-NEXT: s_add_i32 s14, s14, s16
2492 ; GFX8-NEXT: s_max_i32 s16, s15, 0
2493 ; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17
2494 ; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16
2495 ; GFX8-NEXT: s_max_i32 s17, s17, s31
2496 ; GFX8-NEXT: s_min_i32 s16, s17, s16
2497 ; GFX8-NEXT: s_add_i32 s15, s15, s16
2498 ; GFX8-NEXT: ; return to shader part epilog
2500 ; GFX9-LABEL: s_saddsat_v16i32:
2502 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
2503 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
2504 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
2505 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
2506 ; GFX9-NEXT: v_mov_b32_e32 v4, s20
2507 ; GFX9-NEXT: v_mov_b32_e32 v5, s21
2508 ; GFX9-NEXT: v_mov_b32_e32 v6, s22
2509 ; GFX9-NEXT: v_mov_b32_e32 v7, s23
2510 ; GFX9-NEXT: v_mov_b32_e32 v8, s24
2511 ; GFX9-NEXT: v_mov_b32_e32 v9, s25
2512 ; GFX9-NEXT: v_mov_b32_e32 v10, s26
2513 ; GFX9-NEXT: v_mov_b32_e32 v11, s27
2514 ; GFX9-NEXT: v_mov_b32_e32 v12, s28
2515 ; GFX9-NEXT: v_mov_b32_e32 v13, s29
2516 ; GFX9-NEXT: v_mov_b32_e32 v14, s30
2517 ; GFX9-NEXT: v_mov_b32_e32 v15, s31
2518 ; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
2519 ; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp
2520 ; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp
2521 ; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp
2522 ; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp
2523 ; GFX9-NEXT: v_add_i32 v5, s5, v5 clamp
2524 ; GFX9-NEXT: v_add_i32 v6, s6, v6 clamp
2525 ; GFX9-NEXT: v_add_i32 v7, s7, v7 clamp
2526 ; GFX9-NEXT: v_add_i32 v8, s8, v8 clamp
2527 ; GFX9-NEXT: v_add_i32 v9, s9, v9 clamp
2528 ; GFX9-NEXT: v_add_i32 v10, s10, v10 clamp
2529 ; GFX9-NEXT: v_add_i32 v11, s11, v11 clamp
2530 ; GFX9-NEXT: v_add_i32 v12, s12, v12 clamp
2531 ; GFX9-NEXT: v_add_i32 v13, s13, v13 clamp
2532 ; GFX9-NEXT: v_add_i32 v14, s14, v14 clamp
2533 ; GFX9-NEXT: v_add_i32 v15, s15, v15 clamp
2534 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2535 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2536 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2537 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
2538 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
2539 ; GFX9-NEXT: v_readfirstlane_b32 s5, v5
2540 ; GFX9-NEXT: v_readfirstlane_b32 s6, v6
2541 ; GFX9-NEXT: v_readfirstlane_b32 s7, v7
2542 ; GFX9-NEXT: v_readfirstlane_b32 s8, v8
2543 ; GFX9-NEXT: v_readfirstlane_b32 s9, v9
2544 ; GFX9-NEXT: v_readfirstlane_b32 s10, v10
2545 ; GFX9-NEXT: v_readfirstlane_b32 s11, v11
2546 ; GFX9-NEXT: v_readfirstlane_b32 s12, v12
2547 ; GFX9-NEXT: v_readfirstlane_b32 s13, v13
2548 ; GFX9-NEXT: v_readfirstlane_b32 s14, v14
2549 ; GFX9-NEXT: v_readfirstlane_b32 s15, v15
2550 ; GFX9-NEXT: ; return to shader part epilog
2552 ; GFX10PLUS-LABEL: s_saddsat_v16i32:
2553 ; GFX10PLUS: ; %bb.0:
2554 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s16 clamp
2555 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, s1, s17 clamp
2556 ; GFX10PLUS-NEXT: v_add_nc_i32 v2, s2, s18 clamp
2557 ; GFX10PLUS-NEXT: v_add_nc_i32 v3, s3, s19 clamp
2558 ; GFX10PLUS-NEXT: v_add_nc_i32 v4, s4, s20 clamp
2559 ; GFX10PLUS-NEXT: v_add_nc_i32 v5, s5, s21 clamp
2560 ; GFX10PLUS-NEXT: v_add_nc_i32 v6, s6, s22 clamp
2561 ; GFX10PLUS-NEXT: v_add_nc_i32 v7, s7, s23 clamp
2562 ; GFX10PLUS-NEXT: v_add_nc_i32 v8, s8, s24 clamp
2563 ; GFX10PLUS-NEXT: v_add_nc_i32 v9, s9, s25 clamp
2564 ; GFX10PLUS-NEXT: v_add_nc_i32 v10, s10, s26 clamp
2565 ; GFX10PLUS-NEXT: v_add_nc_i32 v11, s11, s27 clamp
2566 ; GFX10PLUS-NEXT: v_add_nc_i32 v12, s12, s28 clamp
2567 ; GFX10PLUS-NEXT: v_add_nc_i32 v13, s13, s29 clamp
2568 ; GFX10PLUS-NEXT: v_add_nc_i32 v14, s14, s30 clamp
2569 ; GFX10PLUS-NEXT: v_add_nc_i32 v15, s15, s31 clamp
2570 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2571 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2572 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
2573 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
2574 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
2575 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5
2576 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6
2577 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7
2578 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8
2579 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9
2580 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10
2581 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11
2582 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12
2583 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13
2584 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14
2585 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15
2586 ; GFX10PLUS-NEXT: ; return to shader part epilog
2587 %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2588 ret <16 x i32> %result
; Signed saturating add of two scalar i16 values (llvm.sadd.sat.i16) in a
; function with no explicit calling convention; the checks show both operands
; and the result living in VGPRs (v0, v1 -> v0).
; Expected lowering, as recorded by the checks below:
;  - GFX6 checks: operands are shifted left by 16, the rhs is clamped between
;    (0x80000000 - min(lhs, 0)) and (0x7fffffff - max(lhs, 0)), added to the
;    lhs, and the sum is arithmetic-shifted right by 16.
;  - GFX8 checks: the same clamp-then-add sequence using 16-bit VALU ops and
;    the 0x8000 / 0x7fff bounds, with no shifts.
;  - GFX9 and GFX10PLUS checks: a single 16-bit add with the clamp modifier.
2591 define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
2592 ; GFX6-LABEL: v_saddsat_i16:
2594 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2595 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2596 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
2597 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2598 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
2599 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3
2600 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
2601 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
2602 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2
2603 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2604 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2605 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2607 ; GFX8-LABEL: v_saddsat_i16:
2609 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2610 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0
2611 ; GFX8-NEXT: v_max_i16_e32 v2, 0, v0
2612 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3
2613 ; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2
2614 ; GFX8-NEXT: v_max_i16_e32 v1, v3, v1
2615 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2
2616 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
2617 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2619 ; GFX9-LABEL: v_saddsat_i16:
2621 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2622 ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
2623 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2625 ; GFX10PLUS-LABEL: v_saddsat_i16:
2626 ; GFX10PLUS: ; %bb.0:
2627 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2628 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
2629 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2630 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
; Signed saturating add of two scalar i16 values in an amdgpu_ps function where
; both operands are marked inreg; the checks show them arriving in s0 and s1
; and the result being produced in s0.
; Expected lowering, as recorded by the checks below:
;  - GFX6 checks: an all-SALU sequence (shift left by 16, min/max clamp, add,
;    arithmetic shift right by 16).
;  - GFX8 checks: an all-SALU sequence that sign-extends the 16-bit values with
;    s_sext_i32_i16 before each 32-bit min/max.
;  - GFX9 checks: the rhs is copied into v0, a clamped VALU add is used, and
;    the result is moved back to s0 with v_readfirstlane_b32.
;  - GFX10PLUS checks: as on GFX9 but both SGPRs feed the clamped add directly.
2634 define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
2635 ; GFX6-LABEL: s_saddsat_i16:
2637 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2638 ; GFX6-NEXT: s_min_i32 s3, s0, 0
2639 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2640 ; GFX6-NEXT: s_max_i32 s2, s0, 0
2641 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3
2642 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2
2643 ; GFX6-NEXT: s_max_i32 s1, s3, s1
2644 ; GFX6-NEXT: s_min_i32 s1, s1, s2
2645 ; GFX6-NEXT: s_add_i32 s0, s0, s1
2646 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
2647 ; GFX6-NEXT: ; return to shader part epilog
2649 ; GFX8-LABEL: s_saddsat_i16:
2651 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
2652 ; GFX8-NEXT: s_sext_i32_i16 s3, 0
2653 ; GFX8-NEXT: s_max_i32 s4, s2, s3
2654 ; GFX8-NEXT: s_min_i32 s2, s2, s3
2655 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
2656 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
2657 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2658 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
2659 ; GFX8-NEXT: s_max_i32 s1, s2, s1
2660 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2661 ; GFX8-NEXT: s_sext_i32_i16 s2, s4
2662 ; GFX8-NEXT: s_min_i32 s1, s1, s2
2663 ; GFX8-NEXT: s_add_i32 s0, s0, s1
2664 ; GFX8-NEXT: ; return to shader part epilog
2666 ; GFX9-LABEL: s_saddsat_i16:
2668 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
2669 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
2670 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2671 ; GFX9-NEXT: ; return to shader part epilog
2673 ; GFX10PLUS-LABEL: s_saddsat_i16:
2674 ; GFX10PLUS: ; %bb.0:
2675 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
2676 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2677 ; GFX10PLUS-NEXT: ; return to shader part epilog
2678 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
; Mixed-operand i16 saturating add: lhs is inreg (s0 in the checks) and rhs is
; a regular argument (v0 in the checks). The i16 result is bitcast to half for
; the return value.
; NOTE(review): presumably the half return type is used so the result is
; returned in a VGPR rather than read back to an SGPR -- confirm.
; Per the checks, the clamp bounds derived from the lhs are computed with SALU
; ops on GFX6/GFX8 and then applied with VALU min/max/add; GFX9 and GFX10PLUS
; use a single clamped add taking s0 and v0 directly.
2682 define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
2683 ; GFX6-LABEL: saddsat_i16_sv:
2685 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2686 ; GFX6-NEXT: s_min_i32 s2, s0, 0
2687 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2688 ; GFX6-NEXT: s_max_i32 s1, s0, 0
2689 ; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2
2690 ; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1
2691 ; GFX6-NEXT: v_max_i32_e32 v0, s2, v0
2692 ; GFX6-NEXT: v_min_i32_e32 v0, s1, v0
2693 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2694 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2695 ; GFX6-NEXT: ; return to shader part epilog
2697 ; GFX8-LABEL: saddsat_i16_sv:
2699 ; GFX8-NEXT: s_sext_i32_i16 s1, s0
2700 ; GFX8-NEXT: s_sext_i32_i16 s2, 0
2701 ; GFX8-NEXT: s_max_i32 s3, s1, s2
2702 ; GFX8-NEXT: s_min_i32 s1, s1, s2
2703 ; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1
2704 ; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
2705 ; GFX8-NEXT: v_max_i16_e32 v0, s1, v0
2706 ; GFX8-NEXT: v_min_i16_e32 v0, s3, v0
2707 ; GFX8-NEXT: v_add_u16_e32 v0, s0, v0
2708 ; GFX8-NEXT: ; return to shader part epilog
2710 ; GFX9-LABEL: saddsat_i16_sv:
2712 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
2713 ; GFX9-NEXT: ; return to shader part epilog
2715 ; GFX10PLUS-LABEL: saddsat_i16_sv:
2716 ; GFX10PLUS: ; %bb.0:
2717 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, v0 clamp
2718 ; GFX10PLUS-NEXT: ; return to shader part epilog
2719 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2720 %cast = bitcast i16 %result to half
; Mixed-operand i16 saturating add, mirror of the SGPR-lhs variant: lhs is a
; regular argument (v0 in the checks) and rhs is inreg (s0 in the checks). The
; i16 result is bitcast to half for the return value.
; NOTE(review): presumably the half return type is used so the result is
; returned in a VGPR -- confirm.
; Per the checks, the clamp bounds depend on the VGPR lhs, so GFX6/GFX8 compute
; them entirely with VALU ops and only the rhs comes from an SGPR; GFX9 and
; GFX10PLUS use a single clamped add taking v0 and s0 directly.
2724 define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
2725 ; GFX6-LABEL: saddsat_i16_vs:
2727 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2728 ; GFX6-NEXT: v_min_i32_e32 v2, 0, v0
2729 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2730 ; GFX6-NEXT: v_max_i32_e32 v1, 0, v0
2731 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2
2732 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1
2733 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2
2734 ; GFX6-NEXT: v_min_i32_e32 v1, v2, v1
2735 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2736 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2737 ; GFX6-NEXT: ; return to shader part epilog
2739 ; GFX8-LABEL: saddsat_i16_vs:
2741 ; GFX8-NEXT: v_min_i16_e32 v2, 0, v0
2742 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v0
2743 ; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2
2744 ; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1
2745 ; GFX8-NEXT: v_max_i16_e32 v2, s0, v2
2746 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
2747 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
2748 ; GFX8-NEXT: ; return to shader part epilog
2750 ; GFX9-LABEL: saddsat_i16_vs:
2752 ; GFX9-NEXT: v_add_i16 v0, v0, s0 clamp
2753 ; GFX9-NEXT: ; return to shader part epilog
2755 ; GFX10PLUS-LABEL: saddsat_i16_vs:
2756 ; GFX10PLUS: ; %bb.0:
2757 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, s0 clamp
2758 ; GFX10PLUS-NEXT: ; return to shader part epilog
2759 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2760 %cast = bitcast i16 %result to half
; Signed saturating add of two <2 x i16> vectors (llvm.sadd.sat.v2i16) in a
; function with no explicit calling convention.
; Expected lowering, as recorded by the checks below:
;  - GFX6 checks: the four elements are handled as separate 32-bit VGPRs
;    (lhs in v0/v1, rhs in v2/v3); each lane repeats the shift / clamp / add /
;    arithmetic-shift sequence, with the two bounds materialized once into
;    s5 and s4 via s_brev_b32.
;  - GFX8 checks: each vector is packed in one VGPR (v0, v1); the high half is
;    reached with a 16-bit right shift and SDWA WORD_1 selects, and the two
;    results are recombined with v_or_b32.
;  - GFX9 and GFX10PLUS checks: a single packed v_pk_add_i16 with clamp.
2764 define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
2765 ; GFX6-LABEL: v_saddsat_v2i16:
2767 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2768 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2769 ; GFX6-NEXT: s_brev_b32 s5, 1
2770 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
2771 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2772 ; GFX6-NEXT: s_brev_b32 s4, -2
2773 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
2774 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
2775 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
2776 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
2777 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2778 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
2779 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
2780 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2781 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
2782 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
2783 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
2784 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3
2785 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
2786 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
2787 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
2788 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2789 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2790 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2792 ; GFX8-LABEL: v_saddsat_v2i16:
2794 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2795 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v0
2796 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2797 ; GFX8-NEXT: v_max_i16_e32 v3, 0, v0
2798 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4
2799 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3
2800 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v1
2801 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v2
2802 ; GFX8-NEXT: v_min_i16_e32 v3, v4, v3
2803 ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2
2804 ; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5
2805 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4
2806 ; GFX8-NEXT: v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2807 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
2808 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v3
2809 ; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2810 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2811 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2813 ; GFX9-LABEL: v_saddsat_v2i16:
2815 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2816 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
2817 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2819 ; GFX10PLUS-LABEL: v_saddsat_v2i16:
2820 ; GFX10PLUS: ; %bb.0:
2821 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2822 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v1 clamp
2823 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2824 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2825 ret <2 x i16> %result
; Signed saturating add of two <2 x i16> vectors in an amdgpu_ps function where
; both operands are inreg; the <2 x i16> result is bitcast to i32 and the
; checks show it being produced in s0.
; Expected lowering, as recorded by the checks below:
;  - GFX6 checks: the four elements are separate SGPRs (lhs in s0/s1, rhs in
;    s2/s3); each lane uses the SALU shift / clamp / add sequence and the two
;    16-bit results are masked with 0xffff and packed into s0.
;  - GFX8 checks: each vector is packed in one SGPR (s0, s1); halves are split
;    with s_lshr_b32 and sign-extended with s_sext_i32_i16 before each 32-bit
;    min/max, then masked and repacked into s0.
;  - GFX9 checks: rhs is copied to v0, a clamped v_pk_add_i16 is used, and the
;    result is moved back to s0 with v_readfirstlane_b32.
;  - GFX10PLUS checks: as on GFX9 but both SGPRs feed v_pk_add_i16 directly.
2828 define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
2829 ; GFX6-LABEL: s_saddsat_v2i16:
2831 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2832 ; GFX6-NEXT: s_min_i32 s5, s0, 0
2833 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2834 ; GFX6-NEXT: s_max_i32 s4, s0, 0
2835 ; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5
2836 ; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4
2837 ; GFX6-NEXT: s_max_i32 s2, s5, s2
2838 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2839 ; GFX6-NEXT: s_min_i32 s2, s2, s4
2840 ; GFX6-NEXT: s_min_i32 s4, s1, 0
2841 ; GFX6-NEXT: s_add_i32 s0, s0, s2
2842 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16
2843 ; GFX6-NEXT: s_max_i32 s3, s1, 0
2844 ; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4
2845 ; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3
2846 ; GFX6-NEXT: s_max_i32 s2, s4, s2
2847 ; GFX6-NEXT: s_min_i32 s2, s2, s3
2848 ; GFX6-NEXT: s_add_i32 s1, s1, s2
2849 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
2850 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
2851 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2852 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
2853 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2854 ; GFX6-NEXT: s_or_b32 s0, s0, s1
2855 ; GFX6-NEXT: ; return to shader part epilog
2857 ; GFX8-LABEL: s_saddsat_v2i16:
2859 ; GFX8-NEXT: s_sext_i32_i16 s4, s0
2860 ; GFX8-NEXT: s_sext_i32_i16 s5, 0
2861 ; GFX8-NEXT: s_max_i32 s6, s4, s5
2862 ; GFX8-NEXT: s_min_i32 s4, s4, s5
2863 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
2864 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
2865 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
2866 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2867 ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
2868 ; GFX8-NEXT: s_max_i32 s1, s4, s1
2869 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2870 ; GFX8-NEXT: s_sext_i32_i16 s4, s6
2871 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
2872 ; GFX8-NEXT: s_min_i32 s1, s1, s4
2873 ; GFX8-NEXT: s_add_i32 s0, s0, s1
2874 ; GFX8-NEXT: s_sext_i32_i16 s1, s2
2875 ; GFX8-NEXT: s_max_i32 s4, s1, s5
2876 ; GFX8-NEXT: s_min_i32 s1, s1, s5
2877 ; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1
2878 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2879 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
2880 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
2881 ; GFX8-NEXT: s_max_i32 s1, s1, s3
2882 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
2883 ; GFX8-NEXT: s_sext_i32_i16 s3, s4
2884 ; GFX8-NEXT: s_min_i32 s1, s1, s3
2885 ; GFX8-NEXT: s_add_i32 s2, s2, s1
2886 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s2
2887 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
2888 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
2889 ; GFX8-NEXT: s_or_b32 s0, s0, s1
2890 ; GFX8-NEXT: ; return to shader part epilog
2892 ; GFX9-LABEL: s_saddsat_v2i16:
2894 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
2895 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
2896 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2897 ; GFX9-NEXT: ; return to shader part epilog
2899 ; GFX10PLUS-LABEL: s_saddsat_v2i16:
2900 ; GFX10PLUS: ; %bb.0:
2901 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s1 clamp
2902 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2903 ; GFX10PLUS-NEXT: ; return to shader part epilog
2904 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2905 %cast = bitcast <2 x i16> %result to i32
; Mixed-operand <2 x i16> saturating add: lhs is inreg (SGPRs in the checks)
; and rhs is a regular argument (VGPRs in the checks). The <2 x i16> result is
; bitcast to float for the return value.
; NOTE(review): presumably the float return type is used so the packed result
; is returned in a VGPR -- confirm.
; Expected lowering, as recorded by the checks below:
;  - GFX6 checks: lhs lanes come from s0/s1 and rhs lanes from v0/v1; the clamp
;    bounds are computed with SALU ops, applied with VALU min/max/add, and the
;    two lanes are masked and packed into v0.
;  - GFX8 checks: bounds are computed with SALU ops per half; the high half
;    needs its SGPR bound and SGPR lhs copied into v2 because it is consumed
;    by SDWA instructions.
;  - GFX9 and GFX10PLUS checks: a single v_pk_add_i16 with clamp on s0 and v0.
2909 define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
2910 ; GFX6-LABEL: saddsat_v2i16_sv:
2912 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2913 ; GFX6-NEXT: s_min_i32 s3, s0, 0
2914 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2915 ; GFX6-NEXT: s_max_i32 s2, s0, 0
2916 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3
2917 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2
2918 ; GFX6-NEXT: v_max_i32_e32 v0, s3, v0
2919 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0
2920 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2921 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
2922 ; GFX6-NEXT: s_min_i32 s2, s0, 0
2923 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2924 ; GFX6-NEXT: s_max_i32 s1, s0, 0
2925 ; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2
2926 ; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1
2927 ; GFX6-NEXT: v_max_i32_e32 v1, s2, v1
2928 ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1
2929 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1
2930 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2931 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2932 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2933 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2934 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2935 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
2936 ; GFX6-NEXT: ; return to shader part epilog
2938 ; GFX8-LABEL: saddsat_v2i16_sv:
2940 ; GFX8-NEXT: s_sext_i32_i16 s2, s0
2941 ; GFX8-NEXT: s_sext_i32_i16 s3, 0
2942 ; GFX8-NEXT: s_max_i32 s4, s2, s3
2943 ; GFX8-NEXT: s_min_i32 s2, s2, s3
2944 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
2945 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
2946 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
2947 ; GFX8-NEXT: v_max_i16_e32 v1, s2, v0
2948 ; GFX8-NEXT: s_sext_i32_i16 s2, s1
2949 ; GFX8-NEXT: v_min_i16_e32 v1, s4, v1
2950 ; GFX8-NEXT: s_max_i32 s4, s2, s3
2951 ; GFX8-NEXT: s_min_i32 s2, s2, s3
2952 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
2953 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
2954 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
2955 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2956 ; GFX8-NEXT: v_min_i16_e32 v0, s4, v0
2957 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2958 ; GFX8-NEXT: v_add_u16_e32 v1, s0, v1
2959 ; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2960 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
2961 ; GFX8-NEXT: ; return to shader part epilog
2963 ; GFX9-LABEL: saddsat_v2i16_sv:
2965 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
2966 ; GFX9-NEXT: ; return to shader part epilog
2968 ; GFX10PLUS-LABEL: saddsat_v2i16_sv:
2969 ; GFX10PLUS: ; %bb.0:
2970 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, v0 clamp
2971 ; GFX10PLUS-NEXT: ; return to shader part epilog
2972 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2973 %cast = bitcast <2 x i16> %result to float
; Mixed-operand <2 x i16> saturating add, mirror of the SGPR-lhs variant: lhs
; is a regular argument (VGPRs in the checks) and rhs is inreg (SGPRs in the
; checks). The <2 x i16> result is bitcast to float for the return value.
; NOTE(review): presumably the float return type is used so the packed result
; is returned in a VGPR -- confirm.
; Expected lowering, as recorded by the checks below:
;  - GFX6 checks: lhs lanes come from v0/v1 and rhs lanes from s0/s1; the two
;    bounds are materialized once into s3 and s2 via s_brev_b32, the clamp is
;    done with VALU ops, and the lanes are masked and packed into v0.
;  - GFX8 checks: the high half of the SGPR rhs is extracted with s_lshr_b32
;    into s1 and used directly as a VALU operand; the halves are recombined
;    with an SDWA add and v_or_b32.
;  - GFX9 and GFX10PLUS checks: a single v_pk_add_i16 with clamp on v0 and s0.
2977 define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
2978 ; GFX6-LABEL: saddsat_v2i16_vs:
2980 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2981 ; GFX6-NEXT: s_brev_b32 s3, 1
2982 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
2983 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2984 ; GFX6-NEXT: s_brev_b32 s2, -2
2985 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
2986 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3
2987 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2
2988 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3
2989 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2990 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2
2991 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v1
2992 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2993 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
2994 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1
2995 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3
2996 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2
2997 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3
2998 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2
2999 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
3000 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3001 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3002 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3003 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3004 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3005 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3006 ; GFX6-NEXT: ; return to shader part epilog
3008 ; GFX8-LABEL: saddsat_v2i16_vs:
3010 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0
3011 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
3012 ; GFX8-NEXT: v_max_i16_e32 v2, 0, v0
3013 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3
3014 ; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2
3015 ; GFX8-NEXT: v_max_i16_e32 v3, s0, v3
3016 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v1
3017 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
3018 ; GFX8-NEXT: v_min_i16_e32 v2, v3, v2
3019 ; GFX8-NEXT: v_max_i16_e32 v3, 0, v1
3020 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4
3021 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3
3022 ; GFX8-NEXT: v_max_i16_e32 v4, s1, v4
3023 ; GFX8-NEXT: v_min_i16_e32 v3, v4, v3
3024 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v2
3025 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3026 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
3027 ; GFX8-NEXT: ; return to shader part epilog
3029 ; GFX9-LABEL: saddsat_v2i16_vs:
3031 ; GFX9-NEXT: v_pk_add_i16 v0, v0, s0 clamp
3032 ; GFX9-NEXT: ; return to shader part epilog
3034 ; GFX10PLUS-LABEL: saddsat_v2i16_vs:
3035 ; GFX10PLUS: ; %bb.0:
3036 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, s0 clamp
3037 ; GFX10PLUS-NEXT: ; return to shader part epilog
3038 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
3039 %cast = bitcast <2 x i16> %result to float
3043 ; FIXME: v3i16 insert/extract
3044 ; define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
3045 ; %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3046 ; ret <3 x i16> %result
3049 ; define amdgpu_ps <3 x i16> @s_saddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
3050 ; %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3051 ; ret <3 x i16> %result
3054 define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
3055 ; GFX6-LABEL: v_saddsat_v4i16:
3057 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3058 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3059 ; GFX6-NEXT: s_brev_b32 s5, 1
3060 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0
3061 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3062 ; GFX6-NEXT: s_brev_b32 s4, -2
3063 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0
3064 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10
3065 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8
3066 ; GFX6-NEXT: v_max_i32_e32 v4, v10, v4
3067 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3068 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8
3069 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1
3070 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
3071 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
3072 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1
3073 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
3074 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
3075 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
3076 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
3077 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3078 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
3079 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
3080 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2
3081 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2
3082 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2
3083 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6
3084 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
3085 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
3086 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3087 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
3088 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
3089 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3
3090 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
3091 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
3092 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3
3093 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6
3094 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
3095 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
3096 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3097 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5
3098 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3099 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
3100 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3101 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
3102 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
3103 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3104 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3105 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3106 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
3107 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
3108 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3109 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
3110 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3112 ; GFX8-LABEL: v_saddsat_v4i16:
3114 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3115 ; GFX8-NEXT: v_min_i16_e32 v7, 0, v0
3116 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
3117 ; GFX8-NEXT: v_max_i16_e32 v6, 0, v0
3118 ; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7
3119 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6
3120 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v2
3121 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v4
3122 ; GFX8-NEXT: v_min_i16_e32 v6, v7, v6
3123 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v4
3124 ; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8
3125 ; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7
3126 ; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3127 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v1
3128 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3129 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v7
3130 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v1
3131 ; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8
3132 ; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7
3133 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v3
3134 ; GFX8-NEXT: v_min_i16_e32 v9, 0, v5
3135 ; GFX8-NEXT: v_min_i16_e32 v7, v8, v7
3136 ; GFX8-NEXT: v_max_i16_e32 v8, 0, v5
3137 ; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9
3138 ; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8
3139 ; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3140 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v8
3141 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v6
3142 ; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3143 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
3144 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v7
3145 ; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3146 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
3147 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3149 ; GFX9-LABEL: v_saddsat_v4i16:
3151 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3152 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp
3153 ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp
3154 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3156 ; GFX10PLUS-LABEL: v_saddsat_v4i16:
3157 ; GFX10PLUS: ; %bb.0:
3158 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3159 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp
3160 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp
3161 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
3162 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3163 %cast = bitcast <4 x i16> %result to <2 x float>
3164 ret <2 x float> %cast
; s_saddsat_v4i16: signed saturating add of two <4 x i16> vectors passed as
; inreg arguments to an amdgpu_ps function. The <4 x i16> result is bitcast to
; <2 x i32>, the declared return type.
; Expected lowering, as pinned by the checks below:
;   GFX6      - each element is shifted left by 16, the rhs is clamped with
;               s_min_i32/s_max_i32 against limits derived from the lhs, added,
;               shifted back with s_ashr_i32 and repacked with and/lshl/or.
;   GFX8      - same clamp-then-add shape on s_sext_i32_i16 values using the
;               0xffff8000 / 0x7fff limits, then repacked with and/lshl/or.
;   GFX9      - rhs copied to VGPRs, v_pk_add_i16 ... clamp, v_readfirstlane_b32.
;   GFX10PLUS - v_pk_add_i16 ... clamp directly on SGPR operands, then
;               v_readfirstlane_b32.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
3167 define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
3168 ; GFX6-LABEL: s_saddsat_v4i16:
3170 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
3171 ; GFX6-NEXT: s_min_i32 s9, s0, 0
3172 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3173 ; GFX6-NEXT: s_max_i32 s8, s0, 0
3174 ; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9
3175 ; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8
3176 ; GFX6-NEXT: s_max_i32 s4, s9, s4
3177 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3178 ; GFX6-NEXT: s_min_i32 s4, s4, s8
3179 ; GFX6-NEXT: s_min_i32 s8, s1, 0
3180 ; GFX6-NEXT: s_add_i32 s0, s0, s4
3181 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16
3182 ; GFX6-NEXT: s_max_i32 s5, s1, 0
3183 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8
3184 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
3185 ; GFX6-NEXT: s_max_i32 s4, s8, s4
3186 ; GFX6-NEXT: s_min_i32 s4, s4, s5
3187 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3188 ; GFX6-NEXT: s_add_i32 s1, s1, s4
3189 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16
3190 ; GFX6-NEXT: s_min_i32 s6, s2, 0
3191 ; GFX6-NEXT: s_max_i32 s5, s2, 0
3192 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
3193 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
3194 ; GFX6-NEXT: s_max_i32 s4, s6, s4
3195 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3196 ; GFX6-NEXT: s_min_i32 s4, s4, s5
3197 ; GFX6-NEXT: s_min_i32 s6, s3, 0
3198 ; GFX6-NEXT: s_add_i32 s2, s2, s4
3199 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16
3200 ; GFX6-NEXT: s_max_i32 s5, s3, 0
3201 ; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6
3202 ; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5
3203 ; GFX6-NEXT: s_max_i32 s4, s6, s4
3204 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
3205 ; GFX6-NEXT: s_min_i32 s4, s4, s5
3206 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
3207 ; GFX6-NEXT: s_add_i32 s3, s3, s4
3208 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
3209 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16
3210 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16
3211 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
3212 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3213 ; GFX6-NEXT: s_or_b32 s0, s0, s1
3214 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
3215 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
3216 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3217 ; GFX6-NEXT: s_or_b32 s1, s1, s2
3218 ; GFX6-NEXT: ; return to shader part epilog
3220 ; GFX8-LABEL: s_saddsat_v4i16:
3222 ; GFX8-NEXT: s_sext_i32_i16 s8, s0
3223 ; GFX8-NEXT: s_sext_i32_i16 s9, 0
3224 ; GFX8-NEXT: s_max_i32 s10, s8, s9
3225 ; GFX8-NEXT: s_min_i32 s8, s8, s9
3226 ; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8
3227 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16
3228 ; GFX8-NEXT: s_sext_i32_i16 s8, s8
3229 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3230 ; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
3231 ; GFX8-NEXT: s_max_i32 s2, s8, s2
3232 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3233 ; GFX8-NEXT: s_sext_i32_i16 s8, s10
3234 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16
3235 ; GFX8-NEXT: s_min_i32 s2, s2, s8
3236 ; GFX8-NEXT: s_add_i32 s0, s0, s2
3237 ; GFX8-NEXT: s_sext_i32_i16 s2, s4
3238 ; GFX8-NEXT: s_max_i32 s8, s2, s9
3239 ; GFX8-NEXT: s_min_i32 s2, s2, s9
3240 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
3241 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3242 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
3243 ; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8
3244 ; GFX8-NEXT: s_max_i32 s2, s2, s6
3245 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3246 ; GFX8-NEXT: s_sext_i32_i16 s6, s8
3247 ; GFX8-NEXT: s_min_i32 s2, s2, s6
3248 ; GFX8-NEXT: s_add_i32 s4, s4, s2
3249 ; GFX8-NEXT: s_sext_i32_i16 s2, s1
3250 ; GFX8-NEXT: s_max_i32 s6, s2, s9
3251 ; GFX8-NEXT: s_min_i32 s2, s2, s9
3252 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
3253 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16
3254 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3255 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3256 ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
3257 ; GFX8-NEXT: s_max_i32 s2, s2, s3
3258 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3259 ; GFX8-NEXT: s_sext_i32_i16 s3, s6
3260 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
3261 ; GFX8-NEXT: s_min_i32 s2, s2, s3
3262 ; GFX8-NEXT: s_add_i32 s1, s1, s2
3263 ; GFX8-NEXT: s_sext_i32_i16 s2, s5
3264 ; GFX8-NEXT: s_max_i32 s3, s2, s9
3265 ; GFX8-NEXT: s_min_i32 s2, s2, s9
3266 ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
3267 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3268 ; GFX8-NEXT: s_sext_i32_i16 s6, s7
3269 ; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3
3270 ; GFX8-NEXT: s_max_i32 s2, s2, s6
3271 ; GFX8-NEXT: s_sext_i32_i16 s2, s2
3272 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3273 ; GFX8-NEXT: s_min_i32 s2, s2, s3
3274 ; GFX8-NEXT: s_add_i32 s5, s5, s2
3275 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s4
3276 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
3277 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
3278 ; GFX8-NEXT: s_or_b32 s0, s0, s2
3279 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s5
3280 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
3281 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16
3282 ; GFX8-NEXT: s_or_b32 s1, s1, s2
3283 ; GFX8-NEXT: ; return to shader part epilog
3285 ; GFX9-LABEL: s_saddsat_v4i16:
3287 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3288 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3289 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
3290 ; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp
3291 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3292 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3293 ; GFX9-NEXT: ; return to shader part epilog
3295 ; GFX10PLUS-LABEL: s_saddsat_v4i16:
3296 ; GFX10PLUS: ; %bb.0:
3297 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s2 clamp
3298 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, s1, s3 clamp
3299 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
3300 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
3301 ; GFX10PLUS-NEXT: ; return to shader part epilog
3302 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3303 %cast = bitcast <4 x i16> %result to <2 x i32>
3308 ; define <5 x i16> @v_saddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
3309 ; %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3310 ; ret <5 x i16> %result
3313 ; define amdgpu_ps <5 x i16> @s_saddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
3314 ; %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3315 ; ret <5 x i16> %result
; v_saddsat_v6i16: signed saturating add of two <6 x i16> vectors in a function
; with the default calling convention. The result is bitcast to <3 x float> and
; returned.
; Expected lowering, as pinned by the checks below:
;   GFX6      - every element is shifted left by 16, the rhs is clamped with
;               v_min_i32/v_max_i32 against limits derived from the lhs, added,
;               shifted back with v_ashrrev_i32 and repacked with and/lshl/or.
;   GFX8      - 16-bit VALU min/max/sub/add per element; the high halves are
;               handled through SDWA operand selects and v_lshrrev_b32.
;   GFX9 / GFX10PLUS - one v_pk_add_i16 ... clamp per 32-bit register pair.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
3318 define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
3319 ; GFX6-LABEL: v_saddsat_v6i16:
3321 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3323 ; GFX6-NEXT: s_brev_b32 s5, 1
3324 ; GFX6-NEXT: v_min_i32_e32 v14, 0, v0
3325 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
3326 ; GFX6-NEXT: s_brev_b32 s4, -2
3327 ; GFX6-NEXT: v_max_i32_e32 v12, 0, v0
3328 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14
3329 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s4, v12
3330 ; GFX6-NEXT: v_max_i32_e32 v6, v14, v6
3331 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3332 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v12
3333 ; GFX6-NEXT: v_min_i32_e32 v12, 0, v1
3334 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6
3335 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7
3336 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v1
3337 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12
3338 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s4, v7
3339 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6
3340 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7
3341 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3342 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6
3343 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8
3344 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v2
3345 ; GFX6-NEXT: v_bfrev_b32_e32 v13, -2
3346 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v2
3347 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
3348 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7
3349 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
3350 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3351 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1
3352 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7
3353 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v3
3354 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
3355 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9
3356 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v3
3357 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
3358 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7
3359 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
3360 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3361 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7
3362 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v4
3363 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
3364 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10
3365 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v4
3366 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
3367 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7
3368 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
3369 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
3370 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7
3371 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v5
3372 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
3373 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11
3374 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v5
3375 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
3376 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3377 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7
3378 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
3379 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3380 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7
3381 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3382 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
3383 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
3384 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6
3385 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3386 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3387 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5
3388 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3389 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
3390 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
3391 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4
3392 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3393 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5
3394 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
3395 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4
3396 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3397 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
3398 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3400 ; GFX8-LABEL: v_saddsat_v6i16:
3402 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3403 ; GFX8-NEXT: v_min_i16_e32 v10, 0, v0
3404 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
3405 ; GFX8-NEXT: v_max_i16_e32 v9, 0, v0
3406 ; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10
3407 ; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9
3408 ; GFX8-NEXT: v_max_i16_e32 v10, v10, v3
3409 ; GFX8-NEXT: v_min_i16_e32 v11, 0, v6
3410 ; GFX8-NEXT: v_min_i16_e32 v9, v10, v9
3411 ; GFX8-NEXT: v_max_i16_e32 v10, 0, v6
3412 ; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11
3413 ; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10
3414 ; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3415 ; GFX8-NEXT: v_min_i16_e32 v11, 0, v1
3416 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
3417 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v10
3418 ; GFX8-NEXT: v_max_i16_e32 v10, 0, v1
3419 ; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11
3420 ; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10
3421 ; GFX8-NEXT: v_max_i16_e32 v11, v11, v4
3422 ; GFX8-NEXT: v_min_i16_e32 v12, 0, v7
3423 ; GFX8-NEXT: v_min_i16_e32 v10, v11, v10
3424 ; GFX8-NEXT: v_max_i16_e32 v11, 0, v7
3425 ; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12
3426 ; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11
3427 ; GFX8-NEXT: v_max_i16_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3428 ; GFX8-NEXT: v_min_i16_e32 v12, 0, v2
3429 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
3430 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v11
3431 ; GFX8-NEXT: v_max_i16_e32 v11, 0, v2
3432 ; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12
3433 ; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11
3434 ; GFX8-NEXT: v_max_i16_e32 v12, v12, v5
3435 ; GFX8-NEXT: v_min_i16_e32 v13, 0, v8
3436 ; GFX8-NEXT: v_min_i16_e32 v11, v12, v11
3437 ; GFX8-NEXT: v_max_i16_e32 v12, 0, v8
3438 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13
3439 ; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12
3440 ; GFX8-NEXT: v_max_i16_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3441 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v9
3442 ; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3443 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v12
3444 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
3445 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v10
3446 ; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3447 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
3448 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v11
3449 ; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3450 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
3451 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3453 ; GFX9-LABEL: v_saddsat_v6i16:
3455 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3456 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v3 clamp
3457 ; GFX9-NEXT: v_pk_add_i16 v1, v1, v4 clamp
3458 ; GFX9-NEXT: v_pk_add_i16 v2, v2, v5 clamp
3459 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3461 ; GFX10PLUS-LABEL: v_saddsat_v6i16:
3462 ; GFX10PLUS: ; %bb.0:
3463 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3464 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v3 clamp
3465 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v4 clamp
3466 ; GFX10PLUS-NEXT: v_pk_add_i16 v2, v2, v5 clamp
3467 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
3468 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3469 %cast = bitcast <6 x i16> %result to <3 x float>
3470 ret <3 x float> %cast
; s_saddsat_v6i16: signed saturating add of two <6 x i16> vectors passed as
; inreg arguments to an amdgpu_ps function. The <6 x i16> result is bitcast to
; <3 x i32>, the declared return type.
; Expected lowering, as pinned by the checks below:
;   GFX6      - each element is shifted left by 16, the rhs is clamped with
;               s_min_i32/s_max_i32 against limits derived from the lhs, added,
;               shifted back with s_ashr_i32 and repacked with and/lshl/or.
;   GFX8      - same clamp-then-add shape on s_sext_i32_i16 values using the
;               0xffff8000 / 0x7fff limits, then repacked with and/lshl/or.
;   GFX9      - rhs copied to VGPRs, v_pk_add_i16 ... clamp, v_readfirstlane_b32.
;   GFX10PLUS - v_pk_add_i16 ... clamp directly on SGPR operands, then
;               v_readfirstlane_b32.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
3473 define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
3474 ; GFX6-LABEL: s_saddsat_v6i16:
3476 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
3477 ; GFX6-NEXT: s_min_i32 s13, s0, 0
3478 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
3479 ; GFX6-NEXT: s_max_i32 s12, s0, 0
3480 ; GFX6-NEXT: s_sub_i32 s13, 0x80000000, s13
3481 ; GFX6-NEXT: s_sub_i32 s12, 0x7fffffff, s12
3482 ; GFX6-NEXT: s_max_i32 s6, s13, s6
3483 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3484 ; GFX6-NEXT: s_min_i32 s6, s6, s12
3485 ; GFX6-NEXT: s_min_i32 s12, s1, 0
3486 ; GFX6-NEXT: s_add_i32 s0, s0, s6
3487 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16
3488 ; GFX6-NEXT: s_max_i32 s7, s1, 0
3489 ; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12
3490 ; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7
3491 ; GFX6-NEXT: s_max_i32 s6, s12, s6
3492 ; GFX6-NEXT: s_min_i32 s6, s6, s7
3493 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3494 ; GFX6-NEXT: s_add_i32 s1, s1, s6
3495 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16
3496 ; GFX6-NEXT: s_min_i32 s8, s2, 0
3497 ; GFX6-NEXT: s_max_i32 s7, s2, 0
3498 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8
3499 ; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7
3500 ; GFX6-NEXT: s_max_i32 s6, s8, s6
3501 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3502 ; GFX6-NEXT: s_min_i32 s6, s6, s7
3503 ; GFX6-NEXT: s_min_i32 s8, s3, 0
3504 ; GFX6-NEXT: s_add_i32 s2, s2, s6
3505 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16
3506 ; GFX6-NEXT: s_max_i32 s7, s3, 0
3507 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8
3508 ; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7
3509 ; GFX6-NEXT: s_max_i32 s6, s8, s6
3510 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3511 ; GFX6-NEXT: s_min_i32 s6, s6, s7
3512 ; GFX6-NEXT: s_min_i32 s8, s4, 0
3513 ; GFX6-NEXT: s_add_i32 s3, s3, s6
3514 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16
3515 ; GFX6-NEXT: s_max_i32 s7, s4, 0
3516 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8
3517 ; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7
3518 ; GFX6-NEXT: s_max_i32 s6, s8, s6
3519 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
3520 ; GFX6-NEXT: s_min_i32 s6, s6, s7
3521 ; GFX6-NEXT: s_min_i32 s8, s5, 0
3522 ; GFX6-NEXT: s_add_i32 s4, s4, s6
3523 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16
3524 ; GFX6-NEXT: s_max_i32 s7, s5, 0
3525 ; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8
3526 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
3527 ; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7
3528 ; GFX6-NEXT: s_max_i32 s6, s8, s6
3529 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
3530 ; GFX6-NEXT: s_min_i32 s6, s6, s7
3531 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
3532 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16
3533 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16
3534 ; GFX6-NEXT: s_add_i32 s5, s5, s6
3535 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
3536 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3537 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16
3538 ; GFX6-NEXT: s_or_b32 s0, s0, s1
3539 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
3540 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
3541 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16
3542 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3543 ; GFX6-NEXT: s_and_b32 s3, s5, 0xffff
3544 ; GFX6-NEXT: s_or_b32 s1, s1, s2
3545 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
3546 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3547 ; GFX6-NEXT: s_or_b32 s2, s2, s3
3548 ; GFX6-NEXT: ; return to shader part epilog
3550 ; GFX8-LABEL: s_saddsat_v6i16:
3552 ; GFX8-NEXT: s_sext_i32_i16 s12, s0
3553 ; GFX8-NEXT: s_sext_i32_i16 s13, 0
3554 ; GFX8-NEXT: s_max_i32 s14, s12, s13
3555 ; GFX8-NEXT: s_min_i32 s12, s12, s13
3556 ; GFX8-NEXT: s_sub_i32 s12, 0xffff8000, s12
3557 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16
3558 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
3559 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3560 ; GFX8-NEXT: s_sub_i32 s14, 0x7fff, s14
3561 ; GFX8-NEXT: s_max_i32 s3, s12, s3
3562 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3563 ; GFX8-NEXT: s_sext_i32_i16 s12, s14
3564 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16
3565 ; GFX8-NEXT: s_min_i32 s3, s3, s12
3566 ; GFX8-NEXT: s_add_i32 s0, s0, s3
3567 ; GFX8-NEXT: s_sext_i32_i16 s3, s6
3568 ; GFX8-NEXT: s_max_i32 s12, s3, s13
3569 ; GFX8-NEXT: s_min_i32 s3, s3, s13
3570 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
3571 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3572 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
3573 ; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
3574 ; GFX8-NEXT: s_max_i32 s3, s3, s9
3575 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3576 ; GFX8-NEXT: s_sext_i32_i16 s9, s12
3577 ; GFX8-NEXT: s_min_i32 s3, s3, s9
3578 ; GFX8-NEXT: s_add_i32 s6, s6, s3
3579 ; GFX8-NEXT: s_sext_i32_i16 s3, s1
3580 ; GFX8-NEXT: s_max_i32 s9, s3, s13
3581 ; GFX8-NEXT: s_min_i32 s3, s3, s13
3582 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
3583 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16
3584 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3585 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3586 ; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
3587 ; GFX8-NEXT: s_max_i32 s3, s3, s4
3588 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3589 ; GFX8-NEXT: s_sext_i32_i16 s4, s9
3590 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16
3591 ; GFX8-NEXT: s_min_i32 s3, s3, s4
3592 ; GFX8-NEXT: s_add_i32 s1, s1, s3
3593 ; GFX8-NEXT: s_sext_i32_i16 s3, s7
3594 ; GFX8-NEXT: s_max_i32 s4, s3, s13
3595 ; GFX8-NEXT: s_min_i32 s3, s3, s13
3596 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
3597 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3598 ; GFX8-NEXT: s_sext_i32_i16 s9, s10
3599 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
3600 ; GFX8-NEXT: s_max_i32 s3, s3, s9
3601 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3602 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3603 ; GFX8-NEXT: s_min_i32 s3, s3, s4
3604 ; GFX8-NEXT: s_add_i32 s7, s7, s3
3605 ; GFX8-NEXT: s_sext_i32_i16 s3, s2
3606 ; GFX8-NEXT: s_max_i32 s4, s3, s13
3607 ; GFX8-NEXT: s_min_i32 s3, s3, s13
3608 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
3609 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16
3610 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3611 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
3612 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
3613 ; GFX8-NEXT: s_max_i32 s3, s3, s5
3614 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3615 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3616 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16
3617 ; GFX8-NEXT: s_min_i32 s3, s3, s4
3618 ; GFX8-NEXT: s_add_i32 s2, s2, s3
3619 ; GFX8-NEXT: s_sext_i32_i16 s3, s8
3620 ; GFX8-NEXT: s_max_i32 s4, s3, s13
3621 ; GFX8-NEXT: s_min_i32 s3, s3, s13
3622 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
3623 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3624 ; GFX8-NEXT: s_sext_i32_i16 s5, s11
3625 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
3626 ; GFX8-NEXT: s_max_i32 s3, s3, s5
3627 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
3628 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3629 ; GFX8-NEXT: s_min_i32 s3, s3, s4
3630 ; GFX8-NEXT: s_add_i32 s8, s8, s3
3631 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s6
3632 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
3633 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16
3634 ; GFX8-NEXT: s_or_b32 s0, s0, s3
3635 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s7
3636 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
3637 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16
3638 ; GFX8-NEXT: s_or_b32 s1, s1, s3
3639 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s8
3640 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
3641 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16
3642 ; GFX8-NEXT: s_or_b32 s2, s2, s3
3643 ; GFX8-NEXT: ; return to shader part epilog
3645 ; GFX9-LABEL: s_saddsat_v6i16:
3647 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
3648 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3649 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
3650 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
3651 ; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp
3652 ; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp
3653 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
3654 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
3655 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
3656 ; GFX9-NEXT: ; return to shader part epilog
3658 ; GFX10PLUS-LABEL: s_saddsat_v6i16:
3659 ; GFX10PLUS: ; %bb.0:
3660 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s3 clamp
3661 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, s1, s4 clamp
3662 ; GFX10PLUS-NEXT: v_pk_add_i16 v2, s2, s5 clamp
3663 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
3664 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
3665 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
3666 ; GFX10PLUS-NEXT: ; return to shader part epilog
3667 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3668 %cast = bitcast <6 x i16> %result to <3 x i32>
; v_saddsat_v8i16: signed saturating add of two <8 x i16> vectors in a function
; with the default calling convention. The result is bitcast to <4 x float> and
; returned.
; Expected lowering, as pinned by the checks below:
;   GFX6      - every element is shifted left by 16, the rhs is clamped with
;               v_min_i32/v_max_i32 against limits derived from the lhs, added,
;               shifted back with v_ashrrev_i32 and repacked with and/lshl/or.
;   GFX8      - 16-bit VALU min/max/sub/add per element; the high halves are
;               handled through SDWA operand selects and v_lshrrev_b32.
;   GFX9 / GFX10PLUS - one v_pk_add_i16 ... clamp per 32-bit register pair.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
3672 define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
3673 ; GFX6-LABEL: v_saddsat_v8i16:
3675 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3676 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3677 ; GFX6-NEXT: s_brev_b32 s5, 1
3678 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v0
3679 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
3680 ; GFX6-NEXT: s_brev_b32 s4, -2
3681 ; GFX6-NEXT: v_max_i32_e32 v16, 0, v0
3682 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18
3683 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16
3684 ; GFX6-NEXT: v_max_i32_e32 v8, v18, v8
3685 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3686 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v16
3687 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v1
3688 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8
3689 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9
3690 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v1
3691 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s5, v16
3692 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s4, v9
3693 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8
3694 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
3695 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3696 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8
3697 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10
3698 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v2
3699 ; GFX6-NEXT: v_bfrev_b32_e32 v17, -2
3700 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v2
3701 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10
3702 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9
3703 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8
3704 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3705 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1
3706 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
3707 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v3
3708 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8
3709 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11
3710 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v3
3711 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10
3712 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9
3713 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8
3714 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3715 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
3716 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v4
3717 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8
3718 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12
3719 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v4
3720 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10
3721 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9
3722 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8
3723 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
3724 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
3725 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v5
3726 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8
3727 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13
3728 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v5
3729 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10
3730 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9
3731 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8
3732 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
3733 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
3734 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v6
3735 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8
3736 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14
3737 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v6
3738 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10
3739 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9
3740 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8
3741 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
3742 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
3743 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v7
3744 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
3745 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8
3746 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15
3747 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v7
3748 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10
3749 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
3750 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9
3751 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8
3752 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3753 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2
3754 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3
3755 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
3756 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3757 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3758 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5
3759 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8
3760 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3761 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
3762 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
3763 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4
3764 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7
3765 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3766 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5
3767 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6
3768 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
3769 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4
3770 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3771 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7
3772 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
3773 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6
3774 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3775 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
3776 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3778 ; GFX8-LABEL: v_saddsat_v8i16:
3780 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3781 ; GFX8-NEXT: v_min_i16_e32 v13, 0, v0
3782 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0
3783 ; GFX8-NEXT: v_max_i16_e32 v12, 0, v0
3784 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13
3785 ; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12
3786 ; GFX8-NEXT: v_max_i16_e32 v13, v13, v4
3787 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v8
3788 ; GFX8-NEXT: v_min_i16_e32 v12, v13, v12
3789 ; GFX8-NEXT: v_max_i16_e32 v13, 0, v8
3790 ; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14
3791 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13
3792 ; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3793 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v1
3794 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
3795 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v13
3796 ; GFX8-NEXT: v_max_i16_e32 v13, 0, v1
3797 ; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14
3798 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13
3799 ; GFX8-NEXT: v_max_i16_e32 v14, v14, v5
3800 ; GFX8-NEXT: v_min_i16_e32 v15, 0, v9
3801 ; GFX8-NEXT: v_min_i16_e32 v13, v14, v13
3802 ; GFX8-NEXT: v_max_i16_e32 v14, 0, v9
3803 ; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15
3804 ; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14
3805 ; GFX8-NEXT: v_max_i16_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3806 ; GFX8-NEXT: v_min_i16_e32 v15, 0, v2
3807 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2
3808 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v14
3809 ; GFX8-NEXT: v_max_i16_e32 v14, 0, v2
3810 ; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15
3811 ; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14
3812 ; GFX8-NEXT: v_max_i16_e32 v15, v15, v6
3813 ; GFX8-NEXT: v_min_i16_e32 v16, 0, v10
3814 ; GFX8-NEXT: v_min_i16_e32 v14, v15, v14
3815 ; GFX8-NEXT: v_max_i16_e32 v15, 0, v10
3816 ; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16
3817 ; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15
3818 ; GFX8-NEXT: v_max_i16_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3819 ; GFX8-NEXT: v_min_i16_e32 v16, 0, v3
3820 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3
3821 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v15
3822 ; GFX8-NEXT: v_max_i16_e32 v15, 0, v3
3823 ; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16
3824 ; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15
3825 ; GFX8-NEXT: v_max_i16_e32 v16, v16, v7
3826 ; GFX8-NEXT: v_min_i16_e32 v17, 0, v11
3827 ; GFX8-NEXT: v_min_i16_e32 v15, v16, v15
3828 ; GFX8-NEXT: v_max_i16_e32 v16, 0, v11
3829 ; GFX8-NEXT: v_sub_u16_e32 v17, 0x8000, v17
3830 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v12
3831 ; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3832 ; GFX8-NEXT: v_sub_u16_e32 v16, 0x7fff, v16
3833 ; GFX8-NEXT: v_max_i16_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3834 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
3835 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v13
3836 ; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3837 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v16
3838 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
3839 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v14
3840 ; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3841 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
3842 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v15
3843 ; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3844 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
3845 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3847 ; GFX9-LABEL: v_saddsat_v8i16:
3849 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3850 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v4 clamp
3851 ; GFX9-NEXT: v_pk_add_i16 v1, v1, v5 clamp
3852 ; GFX9-NEXT: v_pk_add_i16 v2, v2, v6 clamp
3853 ; GFX9-NEXT: v_pk_add_i16 v3, v3, v7 clamp
3854 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3856 ; GFX10PLUS-LABEL: v_saddsat_v8i16:
3857 ; GFX10PLUS: ; %bb.0:
3858 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3859 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v4 clamp
3860 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v5 clamp
3861 ; GFX10PLUS-NEXT: v_pk_add_i16 v2, v2, v6 clamp
3862 ; GFX10PLUS-NEXT: v_pk_add_i16 v3, v3, v7 clamp
3863 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
3864 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
3865 %cast = bitcast <8 x i16> %result to <4 x float>
3866 ret <4 x float> %cast
; s_saddsat_v8i16 - signed saturating add of two <8 x i16> vectors in an
; amdgpu_ps function with both operands marked inreg; the result is bitcast
; to <4 x i32>.
;
; Lowering shown by the autogenerated checks below, per target:
;  * GFX6 checks - every element is shifted left by 16, the addend is clamped
;    with s_max_i32/s_min_i32 between (0x80000000 - min(x, 0)) and
;    (0x7fffffff - max(x, 0)), added with s_add_i32, shifted back with
;    s_ashr_i32 and repacked into s0-s3 via s_and_b32/s_lshl_b32/s_or_b32.
;  * GFX8 checks - same clamp-then-add scheme on 16-bit halves that are
;    split with s_lshr_b32 and sign-extended with s_sext_i32_i16, using the
;    limits 0xffff8000 and 0x7fff.
;  * GFX9 checks - the second operand is copied into VGPRs, then one
;    v_pk_add_i16 ... clamp per dword, then v_readfirstlane_b32 into s0-s3.
;  * GFX10PLUS checks - as on GFX9, but v_pk_add_i16 takes both SGPR
;    operands directly, so no v_mov_b32 copies appear.
3869 define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
3870 ; GFX6-LABEL: s_saddsat_v8i16:
3872 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
3873 ; GFX6-NEXT: s_min_i32 s17, s0, 0
3874 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16
3875 ; GFX6-NEXT: s_max_i32 s16, s0, 0
3876 ; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17
3877 ; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16
3878 ; GFX6-NEXT: s_max_i32 s8, s17, s8
3879 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3880 ; GFX6-NEXT: s_min_i32 s8, s8, s16
3881 ; GFX6-NEXT: s_min_i32 s16, s1, 0
3882 ; GFX6-NEXT: s_add_i32 s0, s0, s8
3883 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16
3884 ; GFX6-NEXT: s_max_i32 s9, s1, 0
3885 ; GFX6-NEXT: s_sub_i32 s16, 0x80000000, s16
3886 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9
3887 ; GFX6-NEXT: s_max_i32 s8, s16, s8
3888 ; GFX6-NEXT: s_min_i32 s8, s8, s9
3889 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3890 ; GFX6-NEXT: s_add_i32 s1, s1, s8
3891 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16
3892 ; GFX6-NEXT: s_min_i32 s10, s2, 0
3893 ; GFX6-NEXT: s_max_i32 s9, s2, 0
3894 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10
3895 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9
3896 ; GFX6-NEXT: s_max_i32 s8, s10, s8
3897 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3898 ; GFX6-NEXT: s_min_i32 s8, s8, s9
3899 ; GFX6-NEXT: s_min_i32 s10, s3, 0
3900 ; GFX6-NEXT: s_add_i32 s2, s2, s8
3901 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16
3902 ; GFX6-NEXT: s_max_i32 s9, s3, 0
3903 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10
3904 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9
3905 ; GFX6-NEXT: s_max_i32 s8, s10, s8
3906 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3907 ; GFX6-NEXT: s_min_i32 s8, s8, s9
3908 ; GFX6-NEXT: s_min_i32 s10, s4, 0
3909 ; GFX6-NEXT: s_add_i32 s3, s3, s8
3910 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16
3911 ; GFX6-NEXT: s_max_i32 s9, s4, 0
3912 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10
3913 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9
3914 ; GFX6-NEXT: s_max_i32 s8, s10, s8
3915 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
3916 ; GFX6-NEXT: s_min_i32 s8, s8, s9
3917 ; GFX6-NEXT: s_min_i32 s10, s5, 0
3918 ; GFX6-NEXT: s_add_i32 s4, s4, s8
3919 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16
3920 ; GFX6-NEXT: s_max_i32 s9, s5, 0
3921 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10
3922 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9
3923 ; GFX6-NEXT: s_max_i32 s8, s10, s8
3924 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
3925 ; GFX6-NEXT: s_min_i32 s8, s8, s9
3926 ; GFX6-NEXT: s_min_i32 s10, s6, 0
3927 ; GFX6-NEXT: s_add_i32 s5, s5, s8
3928 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16
3929 ; GFX6-NEXT: s_max_i32 s9, s6, 0
3930 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10
3931 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9
3932 ; GFX6-NEXT: s_max_i32 s8, s10, s8
3933 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16
3934 ; GFX6-NEXT: s_min_i32 s8, s8, s9
3935 ; GFX6-NEXT: s_min_i32 s10, s7, 0
3936 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16
3937 ; GFX6-NEXT: s_add_i32 s6, s6, s8
3938 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16
3939 ; GFX6-NEXT: s_max_i32 s9, s7, 0
3940 ; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10
3941 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16
3942 ; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9
3943 ; GFX6-NEXT: s_max_i32 s8, s10, s8
3944 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
3945 ; GFX6-NEXT: s_ashr_i32 s2, s2, 16
3946 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16
3947 ; GFX6-NEXT: s_min_i32 s8, s8, s9
3948 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
3949 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
3950 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16
3951 ; GFX6-NEXT: s_add_i32 s7, s7, s8
3952 ; GFX6-NEXT: s_or_b32 s0, s0, s1
3953 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
3954 ; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
3955 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16
3956 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16
3957 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
3958 ; GFX6-NEXT: s_and_b32 s3, s5, 0xffff
3959 ; GFX6-NEXT: s_ashr_i32 s6, s6, 16
3960 ; GFX6-NEXT: s_or_b32 s1, s1, s2
3961 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
3962 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
3963 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
3964 ; GFX6-NEXT: s_or_b32 s2, s2, s3
3965 ; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
3966 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
3967 ; GFX6-NEXT: s_or_b32 s3, s3, s4
3968 ; GFX6-NEXT: ; return to shader part epilog
3970 ; GFX8-LABEL: s_saddsat_v8i16:
3972 ; GFX8-NEXT: s_sext_i32_i16 s16, s0
3973 ; GFX8-NEXT: s_sext_i32_i16 s17, 0
3974 ; GFX8-NEXT: s_max_i32 s18, s16, s17
3975 ; GFX8-NEXT: s_min_i32 s16, s16, s17
3976 ; GFX8-NEXT: s_sub_i32 s16, 0xffff8000, s16
3977 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16
3978 ; GFX8-NEXT: s_sext_i32_i16 s16, s16
3979 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3980 ; GFX8-NEXT: s_sub_i32 s18, 0x7fff, s18
3981 ; GFX8-NEXT: s_max_i32 s4, s16, s4
3982 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3983 ; GFX8-NEXT: s_sext_i32_i16 s16, s18
3984 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16
3985 ; GFX8-NEXT: s_min_i32 s4, s4, s16
3986 ; GFX8-NEXT: s_add_i32 s0, s0, s4
3987 ; GFX8-NEXT: s_sext_i32_i16 s4, s8
3988 ; GFX8-NEXT: s_max_i32 s16, s4, s17
3989 ; GFX8-NEXT: s_min_i32 s4, s4, s17
3990 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
3991 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3992 ; GFX8-NEXT: s_sext_i32_i16 s12, s12
3993 ; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16
3994 ; GFX8-NEXT: s_max_i32 s4, s4, s12
3995 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
3996 ; GFX8-NEXT: s_sext_i32_i16 s12, s16
3997 ; GFX8-NEXT: s_min_i32 s4, s4, s12
3998 ; GFX8-NEXT: s_add_i32 s8, s8, s4
3999 ; GFX8-NEXT: s_sext_i32_i16 s4, s1
4000 ; GFX8-NEXT: s_max_i32 s12, s4, s17
4001 ; GFX8-NEXT: s_min_i32 s4, s4, s17
4002 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
4003 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16
4004 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4005 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4006 ; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12
4007 ; GFX8-NEXT: s_max_i32 s4, s4, s5
4008 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4009 ; GFX8-NEXT: s_sext_i32_i16 s5, s12
4010 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16
4011 ; GFX8-NEXT: s_min_i32 s4, s4, s5
4012 ; GFX8-NEXT: s_add_i32 s1, s1, s4
4013 ; GFX8-NEXT: s_sext_i32_i16 s4, s9
4014 ; GFX8-NEXT: s_max_i32 s5, s4, s17
4015 ; GFX8-NEXT: s_min_i32 s4, s4, s17
4016 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
4017 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4018 ; GFX8-NEXT: s_sext_i32_i16 s12, s13
4019 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
4020 ; GFX8-NEXT: s_max_i32 s4, s4, s12
4021 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4022 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4023 ; GFX8-NEXT: s_min_i32 s4, s4, s5
4024 ; GFX8-NEXT: s_add_i32 s9, s9, s4
4025 ; GFX8-NEXT: s_sext_i32_i16 s4, s2
4026 ; GFX8-NEXT: s_max_i32 s5, s4, s17
4027 ; GFX8-NEXT: s_min_i32 s4, s4, s17
4028 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
4029 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16
4030 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4031 ; GFX8-NEXT: s_sext_i32_i16 s6, s6
4032 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
4033 ; GFX8-NEXT: s_max_i32 s4, s4, s6
4034 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4035 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4036 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16
4037 ; GFX8-NEXT: s_min_i32 s4, s4, s5
4038 ; GFX8-NEXT: s_add_i32 s2, s2, s4
4039 ; GFX8-NEXT: s_sext_i32_i16 s4, s10
4040 ; GFX8-NEXT: s_max_i32 s5, s4, s17
4041 ; GFX8-NEXT: s_min_i32 s4, s4, s17
4042 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
4043 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4044 ; GFX8-NEXT: s_sext_i32_i16 s6, s14
4045 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
4046 ; GFX8-NEXT: s_max_i32 s4, s4, s6
4047 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4048 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4049 ; GFX8-NEXT: s_min_i32 s4, s4, s5
4050 ; GFX8-NEXT: s_add_i32 s10, s10, s4
4051 ; GFX8-NEXT: s_sext_i32_i16 s4, s3
4052 ; GFX8-NEXT: s_max_i32 s5, s4, s17
4053 ; GFX8-NEXT: s_min_i32 s4, s4, s17
4054 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
4055 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4056 ; GFX8-NEXT: s_sext_i32_i16 s6, s7
4057 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
4058 ; GFX8-NEXT: s_max_i32 s4, s4, s6
4059 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4060 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4061 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16
4062 ; GFX8-NEXT: s_min_i32 s4, s4, s5
4063 ; GFX8-NEXT: s_add_i32 s3, s3, s4
4064 ; GFX8-NEXT: s_sext_i32_i16 s4, s11
4065 ; GFX8-NEXT: s_max_i32 s5, s4, s17
4066 ; GFX8-NEXT: s_min_i32 s4, s4, s17
4067 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16
4068 ; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
4069 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4070 ; GFX8-NEXT: s_sext_i32_i16 s6, s15
4071 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
4072 ; GFX8-NEXT: s_max_i32 s4, s4, s6
4073 ; GFX8-NEXT: s_sext_i32_i16 s4, s4
4074 ; GFX8-NEXT: s_sext_i32_i16 s5, s5
4075 ; GFX8-NEXT: s_min_i32 s4, s4, s5
4076 ; GFX8-NEXT: s_add_i32 s11, s11, s4
4077 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s8
4078 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
4079 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4080 ; GFX8-NEXT: s_or_b32 s0, s0, s4
4081 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s9
4082 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
4083 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4084 ; GFX8-NEXT: s_or_b32 s1, s1, s4
4085 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s10
4086 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
4087 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4088 ; GFX8-NEXT: s_or_b32 s2, s2, s4
4089 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s11
4090 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
4091 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16
4092 ; GFX8-NEXT: s_or_b32 s3, s3, s4
4093 ; GFX8-NEXT: ; return to shader part epilog
4095 ; GFX9-LABEL: s_saddsat_v8i16:
4097 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
4098 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
4099 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
4100 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
4101 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
4102 ; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp
4103 ; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp
4104 ; GFX9-NEXT: v_pk_add_i16 v3, s3, v3 clamp
4105 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
4106 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
4107 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
4108 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
4109 ; GFX9-NEXT: ; return to shader part epilog
4111 ; GFX10PLUS-LABEL: s_saddsat_v8i16:
4112 ; GFX10PLUS: ; %bb.0:
4113 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s4 clamp
4114 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, s1, s5 clamp
4115 ; GFX10PLUS-NEXT: v_pk_add_i16 v2, s2, s6 clamp
4116 ; GFX10PLUS-NEXT: v_pk_add_i16 v3, s3, s7 clamp
4117 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
4118 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
4119 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
4120 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
4121 ; GFX10PLUS-NEXT: ; return to shader part epilog
; IR under test: one llvm.sadd.sat.v8i16 call, then a bitcast of the eight
; i16 lanes to four i32 dwords.
4122 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
4123 %cast = bitcast <8 x i16> %result to <4 x i32>
; v_saddsat_i48 - signed saturating add of two i48 values with no inreg
; operands; the checks keep everything in VGPRs v0-v6 and end in
; s_setpc_b64.
;
; Lowering shown by the autogenerated checks below, per target:
;  * GFX6 and GFX8 checks - a 64-bit add with carry, v_bfe_i32 ..., 0, 16
;    results used as the upper dwords of the i64 compare operands, and a
;    saturation value built from v_ashrrev_i32 by 31 / by 15 plus 0xffff8000.
;  * GFX9, GFX10 and GFX11 checks - both operands are shifted left by 16
;    (v_lshlrev_b64), the i64 add/compare/select sequence runs on the shifted
;    values, and v_ashrrev_i64 by 16 shifts the result back.
; In every variant the select condition is the xor of the two compares
; (sum < lhs) and (0 > rhs), applied with v_cndmask_b32.
;
; NOTE(review): in the GFX6 and GFX8 checks the v_bfe_i32 sources are the low
; dwords (v4, v0, v2), not the upper dwords (v6, v1, v3). That looks
; suspicious for a sign extension from bit 47 - confirm against current llc
; output; if it is a compiler defect the checks must be regenerated, not
; hand-edited.
4127 define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
4128 ; GFX6-LABEL: v_saddsat_i48:
4130 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4131 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
4132 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
4133 ; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16
4134 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
4135 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
4136 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4137 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4138 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4139 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0
4140 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5
4141 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4142 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
4143 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
4144 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4146 ; GFX8-LABEL: v_saddsat_i48:
4148 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4149 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
4150 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
4151 ; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16
4152 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
4153 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
4154 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4155 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4156 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4157 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0
4158 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5
4159 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4160 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
4161 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
4162 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4164 ; GFX9-LABEL: v_saddsat_i48:
4166 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4167 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4168 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
4169 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
4170 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
4171 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4172 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4173 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4174 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4175 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4176 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4177 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4178 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4179 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4181 ; GFX10-LABEL: v_saddsat_i48:
4183 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4184 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4185 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
4186 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
4187 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4188 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
4189 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4190 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
4191 ; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
4192 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
4193 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4194 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4195 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4196 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4198 ; GFX11-LABEL: v_saddsat_i48:
4200 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4201 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4202 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
4203 ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
4204 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4205 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
4206 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4207 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
4208 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
4209 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
4210 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4211 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4212 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single llvm.sadd.sat.i48 call on the two arguments.
4213 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
; s_saddsat_i48 - signed saturating add of two i48 values in an amdgpu_ps
; function with both operands marked inreg.
;
; Lowering shown by the autogenerated checks below, per target:
;  * GFX6 and GFX8 checks - scalar add with carry (s_add_u32/s_addc_u32),
;    s_bfe_i64 with the immediate 0x300000 on the sum and on both operands
;    (presumably a 48-bit signed field at offset 0 - confirm against the ISA
;    manual), i64 compares on the VALU, v_cndmask_b32 selects, then
;    v_readfirstlane_b32 into s0/s1.
;  * GFX9, GFX10 and GFX11 checks - both operands are shifted left by 16
;    with s_lshl_b64, the i64 add/compare/select sequence runs on the shifted
;    values, v_ashrrev_i64 by 16 shifts the result back, then
;    v_readfirstlane_b32 into s0/s1.
;
; NOTE(review): in the GFX6 and GFX8 checks the sum is written to s4 and s3,
; yet s_bfe_i64 reads s[4:5] (no checked instruction writes s5) and later
; reads s[2:3] after s3 has been overwritten by s_addc_u32. That looks
; suspicious - confirm against current llc output; if it is a compiler
; defect the checks must be regenerated, not hand-edited.
4217 define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
4218 ; GFX6-LABEL: s_saddsat_i48:
4220 ; GFX6-NEXT: s_add_u32 s4, s0, s2
4221 ; GFX6-NEXT: s_addc_u32 s3, s1, s3
4222 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4223 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
4224 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
4225 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
4226 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
4227 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4228 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4229 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31
4230 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15
4231 ; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000
4232 ; GFX6-NEXT: v_mov_b32_e32 v0, s5
4233 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
4234 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
4235 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
4236 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
4237 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4238 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4239 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
4240 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
4241 ; GFX6-NEXT: ; return to shader part epilog
4243 ; GFX8-LABEL: s_saddsat_i48:
4245 ; GFX8-NEXT: s_add_u32 s4, s0, s2
4246 ; GFX8-NEXT: s_addc_u32 s3, s1, s3
4247 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4248 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
4249 ; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
4250 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
4251 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
4252 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
4253 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4254 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31
4255 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15
4256 ; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000
4257 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
4258 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
4259 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
4260 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
4261 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
4262 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4263 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4264 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
4265 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
4266 ; GFX8-NEXT: ; return to shader part epilog
4268 ; GFX9-LABEL: s_saddsat_i48:
4270 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4271 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
4272 ; GFX9-NEXT: s_add_u32 s4, s0, s2
4273 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
4274 ; GFX9-NEXT: s_addc_u32 s5, s1, s3
4275 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
4276 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4277 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4278 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31
4279 ; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
4280 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4281 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4282 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
4283 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
4284 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
4285 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4286 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4287 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4288 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
4289 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
4290 ; GFX9-NEXT: ; return to shader part epilog
4292 ; GFX10-LABEL: s_saddsat_i48:
4294 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4295 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
4296 ; GFX10-NEXT: s_add_u32 s4, s0, s2
4297 ; GFX10-NEXT: s_addc_u32 s5, s1, s3
4298 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
4299 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4300 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
4301 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
4302 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31
4303 ; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
4304 ; GFX10-NEXT: s_xor_b32 s0, s1, s0
4305 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4306 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4307 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4308 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
4309 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
4310 ; GFX10-NEXT: ; return to shader part epilog
4312 ; GFX11-LABEL: s_saddsat_i48:
4314 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4315 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
4316 ; GFX11-NEXT: s_add_u32 s4, s0, s2
4317 ; GFX11-NEXT: s_addc_u32 s5, s1, s3
4318 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4319 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4320 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
4321 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31
4322 ; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
4323 ; GFX11-NEXT: s_xor_b32 s0, s1, s0
4324 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4325 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4326 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4327 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
4328 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
4329 ; GFX11-NEXT: ; return to shader part epilog
; IR under test: a single llvm.sadd.sat.i48 call on the two arguments.
4330 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
; saddsat_i48_sv - signed saturating add of two i48 values in an amdgpu_ps
; function where only %lhs is marked inreg ("sv": the checks read the lhs
; from s[0:1] and the rhs from v[0:1]). The result is zero-extended to i64
; and returned as <2 x float>.
;
; Lowering shown by the autogenerated checks below, per target:
;  * GFX6 and GFX8 checks - VALU add with carry, v_bfe_i32 ..., 0, 16 and
;    s_bfe_i64 ..., 0x300000 ahead of the i64 compares, and a saturation
;    value built from v_ashrrev_i32 by 31 / by 15 plus 0xffff8000.
;  * GFX9, GFX10 and GFX11 checks - both operands are shifted left by 16,
;    the i64 add/compare/select sequence runs on the shifted values, and
;    v_ashrrev_i64 by 16 shifts the result back.
; Every variant ends with v_and_b32 v1, 0xffff, v1, which presumably
; implements the zext from i48 by clearing the top 16 bits of the upper dword.
;
; NOTE(review): in the GFX6 and GFX8 checks the v_bfe_i32 sources are the low
; dwords (v2, v0), not the upper dwords (v4, v1). That looks suspicious for
; a sign extension from bit 47 - confirm against current llc output; if it
; is a compiler defect the checks must be regenerated, not hand-edited.
4334 define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
4335 ; GFX6-LABEL: saddsat_i48_sv:
4337 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4338 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
4339 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
4340 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
4341 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4342 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
4343 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4344 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4345 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4346 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4347 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0
4348 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4349 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4350 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4351 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
4352 ; GFX6-NEXT: ; return to shader part epilog
4354 ; GFX8-LABEL: saddsat_i48_sv:
4356 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4357 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
4358 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
4359 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
4360 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4361 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
4362 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4363 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4364 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4365 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4366 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0
4367 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4368 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4369 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4370 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
4371 ; GFX8-NEXT: ; return to shader part epilog
4373 ; GFX9-LABEL: saddsat_i48_sv:
4375 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4376 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4377 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
4378 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
4379 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
4380 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4381 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4382 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4383 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4384 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4385 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4386 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4387 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4388 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
4389 ; GFX9-NEXT: ; return to shader part epilog
4391 ; GFX10-LABEL: saddsat_i48_sv:
4393 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4394 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4395 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
4396 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4397 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4398 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4399 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
4400 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4401 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4402 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4403 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4404 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4405 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
4406 ; GFX10-NEXT: ; return to shader part epilog
4408 ; GFX11-LABEL: saddsat_i48_sv:
4410 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4411 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4412 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
4413 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4414 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4415 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4416 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
4417 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4418 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4419 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4420 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4421 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
4422 ; GFX11-NEXT: ; return to shader part epilog
; IR under test: the saturating add, a zext of the i48 result to i64, and a
; bitcast so the value is returned as <2 x float>.
4423 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4424 %ext.result = zext i48 %result to i64
4425 %cast = bitcast i64 %ext.result to <2 x float>
4426 ret <2 x float> %cast
; saddsat_i48_vs - signed saturating add of two i48 values in an amdgpu_ps
; function where only %rhs is marked inreg ("vs": the checks read the lhs
; from v[0:1] and the rhs from s[0:1]). The result is zero-extended to i64
; and returned as <2 x float>.
;
; Lowering shown by the autogenerated checks below, per target:
;  * GFX6 and GFX8 checks - VALU add with carry, v_bfe_i32 ..., 0, 16 and
;    s_bfe_i64 ..., 0x300000 ahead of the i64 compares, and a saturation
;    value built from v_ashrrev_i32 by 31 / by 15 plus 0xffff8000.
;  * GFX9, GFX10 and GFX11 checks - both operands are shifted left by 16,
;    the i64 add/compare/select sequence runs on the shifted values, and
;    v_ashrrev_i64 by 16 shifts the result back.
; Every variant ends with v_and_b32 v1, 0xffff, v1, which presumably
; implements the zext from i48 by clearing the top 16 bits of the upper dword.
;
; NOTE(review): in the GFX6 and GFX8 checks the v_bfe_i32 sources are the low
; dwords (v2, v0), not the upper dwords (v4, v1). That looks suspicious for
; a sign extension from bit 47 - confirm against current llc output; if it
; is a compiler defect the checks must be regenerated, not hand-edited.
4429 define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
4430 ; GFX6-LABEL: saddsat_i48_vs:
4432 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4433 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
4434 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
4435 ; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
4436 ; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
4437 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4438 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4439 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4440 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4441 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4442 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0
4443 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4444 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4445 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4446 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
4447 ; GFX6-NEXT: ; return to shader part epilog
4449 ; GFX8-LABEL: saddsat_i48_vs:
4451 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4452 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
4453 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
4454 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
4455 ; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
4456 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
4457 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4458 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4459 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4460 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
4461 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0
4462 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4463 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4464 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4465 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
4466 ; GFX8-NEXT: ; return to shader part epilog
4468 ; GFX9-LABEL: saddsat_i48_vs:
4470 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4471 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
4472 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
4473 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
4474 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
4475 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
4476 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
4477 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4478 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4479 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4480 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4481 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4482 ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4483 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
4484 ; GFX9-NEXT: ; return to shader part epilog
4486 ; GFX10-LABEL: saddsat_i48_vs:
4488 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4489 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4490 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
4491 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4492 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
4493 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4494 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4495 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4496 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4497 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4498 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4499 ; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4500 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
4501 ; GFX10-NEXT: ; return to shader part epilog
4503 ; GFX11-LABEL: saddsat_i48_vs:
4505 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
4506 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
4507 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
4508 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4509 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
4510 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4511 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4512 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4513 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4514 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4515 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
4516 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
4517 ; GFX11-NEXT: ; return to shader part epilog
; IR under test: the saturating add, a zext of the i48 result to i64, and a
; bitcast so the value is returned as <2 x float>.
4518 %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4519 %ext.result = zext i48 %result to i64
4520 %cast = bitcast i64 %ext.result to <2 x float>
4521 ret <2 x float> %cast
; v_saddsat_i64 - signed saturating add of two i64 values with no inreg
; operands; the checks keep everything in VGPRs and end in s_setpc_b64.
;
; All five check variants below show the same scheme:
;   1. add the low dwords, then the high dwords with carry (sum in v[4:5]);
;   2. compare sum < lhs and 0 > rhs as signed i64 and xor the two results;
;   3. build the saturation value from the sum's sign: v_ashrrev_i32 by 31
;      gives the low dword, and that value plus 0x80000000 gives the high
;      dword;
;   4. pick sum or saturation value per dword with v_cndmask_b32.
; The GFX10 and GFX11 checks use vcc_lo and s_xor_b32 where the older
; targets use vcc and s_xor_b64; the GFX11 checks additionally fuse the two
; selects into one v_dual_cndmask_b32 pair.
4524 define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
4525 ; GFX6-LABEL: v_saddsat_i64:
4527 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4528 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
4529 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
4530 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4531 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4532 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4533 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
4534 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4535 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4536 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4537 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4539 ; GFX8-LABEL: v_saddsat_i64:
4541 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4542 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
4543 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
4544 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4545 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4546 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4547 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
4548 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4549 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4550 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4551 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4553 ; GFX9-LABEL: v_saddsat_i64:
4555 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4556 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
4557 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
4558 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
4559 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
4560 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
4561 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4562 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4563 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4564 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4565 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4567 ; GFX10-LABEL: v_saddsat_i64:
4569 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4570 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
4571 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4572 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
4573 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4574 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4575 ; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
4576 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
4577 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
4578 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4579 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4581 ; GFX11-LABEL: v_saddsat_i64:
4583 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4584 ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
4585 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4586 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
4587 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
4588 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
4589 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
4590 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4591 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
4592 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single llvm.sadd.sat.i64 call on the two arguments.
4593 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
; s_saddsat_i64: i64 llvm.sadd.sat in an amdgpu_ps shader where both %lhs and
; %rhs are passed inreg. In every checked sequence the operands arrive in
; s[0:1] and s[2:3], and the selected result is copied back into s0/s1 with
; v_readfirstlane_b32 before the shader part epilog.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4597 define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
4598 ; GFX6-LABEL: s_saddsat_i64:
4600 ; GFX6-NEXT: s_add_u32 s4, s0, s2
4601 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
4602 ; GFX6-NEXT: s_addc_u32 s5, s1, s3
4603 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
4604 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4605 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4606 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31
4607 ; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000
4608 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
4609 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
4610 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
4611 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
4612 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
4613 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4614 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4615 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
4616 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
4617 ; GFX6-NEXT: ; return to shader part epilog
4619 ; GFX8-LABEL: s_saddsat_i64:
4621 ; GFX8-NEXT: s_add_u32 s4, s0, s2
4622 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
4623 ; GFX8-NEXT: s_addc_u32 s5, s1, s3
4624 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
4625 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4626 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4627 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31
4628 ; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000
4629 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
4630 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
4631 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
4632 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
4633 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
4634 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4635 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4636 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
4637 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
4638 ; GFX8-NEXT: ; return to shader part epilog
4640 ; GFX9-LABEL: s_saddsat_i64:
4642 ; GFX9-NEXT: s_add_u32 s4, s0, s2
4643 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
4644 ; GFX9-NEXT: s_addc_u32 s5, s1, s3
4645 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
4646 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4647 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4648 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31
4649 ; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
4650 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4651 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4652 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
4653 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
4654 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
4655 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4656 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4657 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
4658 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
4659 ; GFX9-NEXT: ; return to shader part epilog
4661 ; GFX10-LABEL: s_saddsat_i64:
4663 ; GFX10-NEXT: s_add_u32 s4, s0, s2
4664 ; GFX10-NEXT: s_addc_u32 s5, s1, s3
4665 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
4666 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4667 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
4668 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
4669 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31
4670 ; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
4671 ; GFX10-NEXT: s_xor_b32 s0, s1, s0
4672 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4673 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4674 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
4675 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
4676 ; GFX10-NEXT: ; return to shader part epilog
4678 ; GFX11-LABEL: s_saddsat_i64:
4680 ; GFX11-NEXT: s_add_u32 s4, s0, s2
4681 ; GFX11-NEXT: s_addc_u32 s5, s1, s3
4682 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
4683 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4684 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
4685 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31
4686 ; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
4687 ; GFX11-NEXT: s_xor_b32 s0, s1, s0
4688 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
4689 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
4690 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
4691 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
4692 ; GFX11-NEXT: ; return to shader part epilog
4693 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
; saddsat_i64_sv: i64 llvm.sadd.sat in an amdgpu_ps shader where %lhs is
; passed inreg and %rhs is not. In the checked sequences %lhs is read from
; s[0:1] and %rhs from v[0:1].
; NOTE(review): the "_sv" suffix presumably means scalar lhs / vector rhs;
; confirm against the file's naming convention.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4697 define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
4698 ; GFX6-LABEL: saddsat_i64_sv:
4700 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4701 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
4702 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
4703 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4704 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4705 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4706 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
4707 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4708 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4709 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4710 ; GFX6-NEXT: ; return to shader part epilog
4712 ; GFX8-LABEL: saddsat_i64_sv:
4714 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4715 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
4716 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
4717 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4718 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4719 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4720 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
4721 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4722 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4723 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4724 ; GFX8-NEXT: ; return to shader part epilog
4726 ; GFX9-LABEL: saddsat_i64_sv:
4728 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
4729 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
4730 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
4731 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
4732 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
4733 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4734 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4735 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
4736 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4737 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4738 ; GFX9-NEXT: ; return to shader part epilog
4740 ; GFX10-LABEL: saddsat_i64_sv:
4742 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
4743 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4744 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4745 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4746 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
4747 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4748 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4749 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4750 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4751 ; GFX10-NEXT: ; return to shader part epilog
4753 ; GFX11-LABEL: saddsat_i64_sv:
4755 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
4756 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4757 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4758 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
4759 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
4760 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4761 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4762 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4763 ; GFX11-NEXT: ; return to shader part epilog
4764 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
; The i64 result is returned as <2 x float>; the checked output leaves it in
; v0/v1 with no v_readfirstlane copy to SGPRs.
4765 %cast = bitcast i64 %result to <2 x float>
4766 ret <2 x float> %cast
; saddsat_i64_vs: i64 llvm.sadd.sat in an amdgpu_ps shader where %rhs is
; passed inreg and %lhs is not. In the checked sequences %lhs is read from
; v[0:1] and %rhs from s[0:1].
; NOTE(review): the "_vs" suffix presumably means vector lhs / scalar rhs;
; confirm against the file's naming convention.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4769 define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
4770 ; GFX6-LABEL: saddsat_i64_vs:
4772 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
4773 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
4774 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
4775 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4776 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4777 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4778 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
4779 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4780 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4781 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4782 ; GFX6-NEXT: ; return to shader part epilog
4784 ; GFX8-LABEL: saddsat_i64_vs:
4786 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
4787 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
4788 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
4789 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4790 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4791 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4792 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
4793 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4794 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4795 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4796 ; GFX8-NEXT: ; return to shader part epilog
4798 ; GFX9-LABEL: saddsat_i64_vs:
4800 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
4801 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
4802 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
4803 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
4804 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
4805 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
4806 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
4807 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
4808 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
4809 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
4810 ; GFX9-NEXT: ; return to shader part epilog
4812 ; GFX10-LABEL: saddsat_i64_vs:
4814 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
4815 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4816 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
4817 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4818 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4819 ; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
4820 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4821 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
4822 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4823 ; GFX10-NEXT: ; return to shader part epilog
4825 ; GFX11-LABEL: saddsat_i64_vs:
4827 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
4828 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4829 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
4830 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
4831 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4832 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
4833 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4834 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
4835 ; GFX11-NEXT: ; return to shader part epilog
4836 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
; The i64 result is returned as <2 x float>; the checked output leaves it in
; v0/v1 with no v_readfirstlane copy to SGPRs.
4837 %cast = bitcast i64 %result to <2 x float>
4838 ret <2 x float> %cast
; v_saddsat_v2i64: <2 x i64> llvm.sadd.sat in a function with no explicit
; calling convention. In the checked sequences the operands are in v[0:3]
; (%lhs) and v[4:7] (%rhs), each element gets its own add / compare / select
; sequence, and the function returns with s_setpc_b64 s[30:31].
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4841 define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
4842 ; GFX6-LABEL: v_saddsat_v2i64:
4844 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4845 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v0, v4
4846 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc
4847 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4848 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
4849 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9
4850 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
4851 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1
4852 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4853 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
4854 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
4855 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v6
4856 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc
4857 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4858 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
4859 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5
4860 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
4861 ; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4862 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
4863 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
4864 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4866 ; GFX8-LABEL: v_saddsat_v2i64:
4868 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4869 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v4
4870 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc
4871 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4872 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
4873 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9
4874 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
4875 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1
4876 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4877 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
4878 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
4879 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v6
4880 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc
4881 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4882 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
4883 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5
4884 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2
4885 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4886 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
4887 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
4888 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4890 ; GFX9-LABEL: v_saddsat_v2i64:
4892 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4893 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4
4894 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
4895 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
4896 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
4897 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9
4898 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
4899 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
4900 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4901 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
4902 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
4903 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6
4904 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
4905 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
4906 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
4907 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
4908 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2
4909 ; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
4910 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
4911 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
4912 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4914 ; GFX10-LABEL: v_saddsat_v2i64:
4916 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4917 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4
4918 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4919 ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6
4920 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4921 ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
4922 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4923 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5]
4924 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11
4925 ; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
4926 ; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12
4927 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
4928 ; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4
4929 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
4930 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo
4931 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
4932 ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5
4933 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo
4934 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
4935 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4937 ; GFX11-LABEL: v_saddsat_v2i64:
4939 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4940 ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4
4941 ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4942 ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6
4943 ; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4944 ; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9
4945 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4946 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5]
4947 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11
4948 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
4949 ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7]
4950 ; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12
4951 ; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4
4952 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
4953 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
4954 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1
4955 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3
4956 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4957 %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4958 ret <2 x i64> %result
; s_saddsat_v2i64: <2 x i64> llvm.sadd.sat in an amdgpu_ps shader where both
; vector operands are passed inreg. In the checked sequences %lhs arrives in
; s[0:3] and %rhs in s[4:7], and the four result dwords are copied back into
; s0..s3 with v_readfirstlane_b32 before the shader part epilog.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4961 define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
4962 ; GFX6-LABEL: s_saddsat_v2i64:
4964 ; GFX6-NEXT: s_add_u32 s8, s0, s4
4965 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
4966 ; GFX6-NEXT: s_addc_u32 s9, s1, s5
4967 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
4968 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4969 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
4970 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31
4971 ; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
4972 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
4973 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
4974 ; GFX6-NEXT: v_mov_b32_e32 v2, s8
4975 ; GFX6-NEXT: v_mov_b32_e32 v3, s9
4976 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
4977 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
4978 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
4979 ; GFX6-NEXT: s_add_u32 s0, s2, s6
4980 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
4981 ; GFX6-NEXT: s_addc_u32 s1, s3, s7
4982 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
4983 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4984 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
4985 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31
4986 ; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
4987 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
4988 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
4989 ; GFX6-NEXT: v_mov_b32_e32 v4, s0
4990 ; GFX6-NEXT: v_mov_b32_e32 v5, s1
4991 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc
4992 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
4993 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
4994 ; GFX6-NEXT: v_readfirstlane_b32 s0, v2
4995 ; GFX6-NEXT: v_readfirstlane_b32 s1, v3
4996 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
4997 ; GFX6-NEXT: v_readfirstlane_b32 s3, v1
4998 ; GFX6-NEXT: ; return to shader part epilog
5000 ; GFX8-LABEL: s_saddsat_v2i64:
5002 ; GFX8-NEXT: s_add_u32 s8, s0, s4
5003 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
5004 ; GFX8-NEXT: s_addc_u32 s9, s1, s5
5005 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
5006 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5007 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
5008 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31
5009 ; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
5010 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
5011 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
5012 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
5013 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
5014 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
5015 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
5016 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
5017 ; GFX8-NEXT: s_add_u32 s0, s2, s6
5018 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
5019 ; GFX8-NEXT: s_addc_u32 s1, s3, s7
5020 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
5021 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
5022 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
5023 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31
5024 ; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
5025 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
5026 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
5027 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
5028 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
5029 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc
5030 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
5031 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
5032 ; GFX8-NEXT: v_readfirstlane_b32 s0, v2
5033 ; GFX8-NEXT: v_readfirstlane_b32 s1, v3
5034 ; GFX8-NEXT: v_readfirstlane_b32 s2, v0
5035 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1
5036 ; GFX8-NEXT: ; return to shader part epilog
5038 ; GFX9-LABEL: s_saddsat_v2i64:
5040 ; GFX9-NEXT: s_add_u32 s8, s0, s4
5041 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5042 ; GFX9-NEXT: s_addc_u32 s9, s1, s5
5043 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5044 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5045 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
5046 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31
5047 ; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
5048 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5049 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5050 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
5051 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
5052 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
5053 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
5054 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
5055 ; GFX9-NEXT: s_add_u32 s0, s2, s6
5056 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5057 ; GFX9-NEXT: s_addc_u32 s1, s3, s7
5058 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5059 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
5060 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
5061 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31
5062 ; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
5063 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5064 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5065 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
5066 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
5067 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc
5068 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
5069 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
5070 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
5071 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
5072 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0
5073 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1
5074 ; GFX9-NEXT: ; return to shader part epilog
5076 ; GFX10-LABEL: s_saddsat_v2i64:
5078 ; GFX10-NEXT: s_add_u32 s8, s0, s4
5079 ; GFX10-NEXT: s_addc_u32 s9, s1, s5
5080 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
5081 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5082 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
5083 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31
5084 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
5085 ; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000
5086 ; GFX10-NEXT: s_xor_b32 s8, s1, s0
5087 ; GFX10-NEXT: s_add_u32 s0, s2, s6
5088 ; GFX10-NEXT: s_addc_u32 s1, s3, s7
5089 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
5090 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5091 ; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0
5092 ; GFX10-NEXT: v_mov_b32_e32 v3, s1
5093 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
5094 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31
5095 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
5096 ; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
5097 ; GFX10-NEXT: s_xor_b32 s1, s3, s2
5098 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
5099 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
5100 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
5101 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
5102 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
5103 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
5104 ; GFX10-NEXT: ; return to shader part epilog
5106 ; GFX11-LABEL: s_saddsat_v2i64:
5108 ; GFX11-NEXT: s_add_u32 s8, s0, s4
5109 ; GFX11-NEXT: s_addc_u32 s9, s1, s5
5110 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
5111 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
5112 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
5113 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31
5114 ; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000
5115 ; GFX11-NEXT: s_xor_b32 s8, s1, s0
5116 ; GFX11-NEXT: s_add_u32 s0, s2, s6
5117 ; GFX11-NEXT: s_addc_u32 s1, s3, s7
5118 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
5119 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
5120 ; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0
5121 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
5122 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31
5123 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
5124 ; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
5125 ; GFX11-NEXT: s_xor_b32 s1, s3, s2
5126 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
5127 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
5128 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
5129 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
5130 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2
5131 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
5132 ; GFX11-NEXT: ; return to shader part epilog
5133 %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
5134 ret <2 x i64> %result
; s_saddsat_i128: i128 llvm.sadd.sat in an amdgpu_ps shader where both
; operands are passed inreg. In the checked sequences %lhs arrives in s[0:3]
; and %rhs in s[4:7], the sum is built with one s_add_u32 followed by three
; s_addc_u32, and the four result dwords are copied back into s0..s3 with
; v_readfirstlane_b32 before the shader part epilog.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
5137 define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
5138 ; GFX6-LABEL: s_saddsat_i128:
5140 ; GFX6-NEXT: s_add_u32 s4, s0, s4
5141 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
5142 ; GFX6-NEXT: s_addc_u32 s5, s1, s5
5143 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
5144 ; GFX6-NEXT: s_addc_u32 s8, s2, s6
5145 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
5146 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5147 ; GFX6-NEXT: s_addc_u32 s9, s3, s7
5148 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
5149 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5150 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5151 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
5152 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
5153 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5154 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5155 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
5156 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
5157 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5158 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5159 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31
5160 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5161 ; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
5162 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
5163 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
5164 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
5165 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5166 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
5167 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
5168 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
5169 ; GFX6-NEXT: v_mov_b32_e32 v4, s8
5170 ; GFX6-NEXT: v_mov_b32_e32 v5, s9
5171 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
5172 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
5173 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
5174 ; GFX6-NEXT: v_readfirstlane_b32 s1, v2
5175 ; GFX6-NEXT: v_readfirstlane_b32 s2, v1
5176 ; GFX6-NEXT: v_readfirstlane_b32 s3, v3
5177 ; GFX6-NEXT: ; return to shader part epilog
5179 ; GFX8-LABEL: s_saddsat_i128:
5181 ; GFX8-NEXT: s_add_u32 s4, s0, s4
5182 ; GFX8-NEXT: s_addc_u32 s5, s1, s5
5183 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
5184 ; GFX8-NEXT: s_addc_u32 s8, s2, s6
5185 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
5186 ; GFX8-NEXT: s_addc_u32 s9, s3, s7
5187 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
5188 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5189 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
5190 ; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
5191 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
5192 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5193 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5194 ; GFX8-NEXT: s_and_b32 s0, 1, s2
5195 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5196 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5197 ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0
5198 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
5199 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
5200 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5201 ; GFX8-NEXT: s_and_b32 s0, 1, s2
5202 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
5203 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5204 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5205 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
5206 ; GFX8-NEXT: s_ashr_i32 s0, s9, 31
5207 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
5208 ; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
5209 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
5210 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
5211 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
5212 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5213 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
5214 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
5215 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
5216 ; GFX8-NEXT: v_mov_b32_e32 v4, s8
5217 ; GFX8-NEXT: v_mov_b32_e32 v5, s9
5218 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
5219 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
5220 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
5221 ; GFX8-NEXT: v_readfirstlane_b32 s1, v2
5222 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1
5223 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
5224 ; GFX8-NEXT: ; return to shader part epilog
5226 ; GFX9-LABEL: s_saddsat_i128:
5228 ; GFX9-NEXT: s_add_u32 s4, s0, s4
5229 ; GFX9-NEXT: s_addc_u32 s5, s1, s5
5230 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5231 ; GFX9-NEXT: s_addc_u32 s8, s2, s6
5232 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
5233 ; GFX9-NEXT: s_addc_u32 s9, s3, s7
5234 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5235 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5236 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5237 ; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
5238 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
5239 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5240 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
5241 ; GFX9-NEXT: s_and_b32 s0, 1, s2
5242 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5243 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
5244 ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0
5245 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
5246 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
5247 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5248 ; GFX9-NEXT: s_and_b32 s0, 1, s2
5249 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
5250 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5251 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5252 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
5253 ; GFX9-NEXT: s_ashr_i32 s0, s9, 31
5254 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
5255 ; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
5256 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
5257 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
5258 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
5259 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5260 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
5261 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
5262 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5263 ; GFX9-NEXT: v_mov_b32_e32 v4, s8
5264 ; GFX9-NEXT: v_mov_b32_e32 v5, s9
5265 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
5266 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
5267 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
5268 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2
5269 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
5270 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
5271 ; GFX9-NEXT: ; return to shader part epilog
5273 ; GFX10-LABEL: s_saddsat_i128:
5275 ; GFX10-NEXT: s_add_u32 s4, s0, s4
5276 ; GFX10-NEXT: s_addc_u32 s5, s1, s5
5277 ; GFX10-NEXT: s_addc_u32 s8, s2, s6
5278 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
5279 ; GFX10-NEXT: s_addc_u32 s9, s3, s7
5280 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
5281 ; GFX10-NEXT: v_mov_b32_e32 v3, s9
5282 ; GFX10-NEXT: s_cselect_b32 s10, 1, 0
5283 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5284 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
5285 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0
5286 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5287 ; GFX10-NEXT: s_and_b32 s0, 1, s10
5288 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
5289 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
5290 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
5291 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5292 ; GFX10-NEXT: s_and_b32 s1, 1, s1
5293 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
5294 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5295 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
5296 ; GFX10-NEXT: v_mov_b32_e32 v2, s5
5297 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31
5298 ; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
5299 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
5300 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
5301 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
5302 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5303 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
5304 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5305 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5306 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5307 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5308 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
5309 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2
5310 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0
5311 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
5312 ; GFX10-NEXT: ; return to shader part epilog
5314 ; GFX11-LABEL: s_saddsat_i128:
5316 ; GFX11-NEXT: s_add_u32 s4, s0, s4
5317 ; GFX11-NEXT: s_addc_u32 s5, s1, s5
5318 ; GFX11-NEXT: s_addc_u32 s8, s2, s6
5319 ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
5320 ; GFX11-NEXT: s_addc_u32 s9, s3, s7
5321 ; GFX11-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
5322 ; GFX11-NEXT: v_mov_b32_e32 v3, s9
5323 ; GFX11-NEXT: s_cselect_b32 s10, 1, 0
5324 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
5325 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
5326 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0
5327 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
5328 ; GFX11-NEXT: s_and_b32 s0, 1, s10
5329 ; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
5330 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
5331 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0
5332 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
5333 ; GFX11-NEXT: s_and_b32 s1, 1, s1
5334 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
5335 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5336 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
5337 ; GFX11-NEXT: v_mov_b32_e32 v2, s5
5338 ; GFX11-NEXT: s_ashr_i32 s0, s9, 31
5339 ; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
5340 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5341 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
5342 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5343 ; GFX11-NEXT: v_mov_b32_e32 v0, s8
5344 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
5345 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
5346 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
5347 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
5348 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
5349 ; GFX11-NEXT: v_readfirstlane_b32 s1, v2
5350 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0
5351 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
5352 ; GFX11-NEXT: ; return to shader part epilog
5353 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
; Signed saturating add of two i128 values with mixed operand placement:
; %lhs is marked inreg and %rhs is not. In the checked output %lhs is read
; from s[0:3] and %rhs from v[0:3]; the result is bitcast to <4 x float> and
; left in v[0:3].
;
; Expected expansion on every target: a four-dword carry-chained add, an
; overflow bit formed as (sum < lhs, signed 128-bit) XOR (rhs < 0), and on
; overflow a select of the sign fill (high dword ashr 31) for the low three
; dwords and sign fill + 0x80000000 for the high dword.
5357 define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
5358 ; GFX6-LABEL: saddsat_i128_sv:
5360 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
5361 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
5362 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
5363 ; GFX6-NEXT: v_mov_b32_e32 v4, s2
5364 ; GFX6-NEXT: v_mov_b32_e32 v5, s3
5365 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc
5366 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc
5367 ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5368 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
5369 ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5370 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
5371 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5372 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
5373 ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5374 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
5375 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5376 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5377 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
5378 ; GFX6-NEXT: v_xor_b32_e32 v2, v2, v6
5379 ; GFX6-NEXT: v_bfrev_b32_e32 v6, 1
5380 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v3, v6
5381 ; GFX6-NEXT: v_and_b32_e32 v2, 1, v2
5382 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
5383 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
5384 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
5385 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
5386 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
5387 ; GFX6-NEXT: ; return to shader part epilog
5389 ; GFX8-LABEL: saddsat_i128_sv:
5391 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
5392 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
5393 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
5394 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
5395 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
5396 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc
5397 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc
5398 ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5399 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
5400 ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5401 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
5402 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5403 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
5404 ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5405 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
5406 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5407 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5408 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
5409 ; GFX8-NEXT: v_xor_b32_e32 v2, v2, v6
5410 ; GFX8-NEXT: v_bfrev_b32_e32 v6, 1
5411 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v3, v6
5412 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
5413 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
5414 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
5415 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
5416 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
5417 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
5418 ; GFX8-NEXT: ; return to shader part epilog
5420 ; GFX9-LABEL: saddsat_i128_sv:
5422 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
5423 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
5424 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
5425 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
5426 ; GFX9-NEXT: v_mov_b32_e32 v5, s3
5427 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
5428 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
5429 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5430 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
5431 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5432 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
5433 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5434 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
5435 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5436 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
5437 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5438 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5439 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
5440 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6
5441 ; GFX9-NEXT: v_bfrev_b32_e32 v6, 1
5442 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v3, v6
5443 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
5444 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
5445 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
5446 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
5447 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
5448 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
5449 ; GFX9-NEXT: ; return to shader part epilog
5451 ; GFX10-LABEL: saddsat_i128_sv:
5453 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
5454 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
5455 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
5456 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5457 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5458 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5459 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5460 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5461 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5462 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5463 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5464 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5465 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5466 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5467 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5468 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6
5469 ; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3
5470 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
5471 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
5472 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
5473 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
5474 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo
5475 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
5476 ; GFX10-NEXT: ; return to shader part epilog
5478 ; GFX11-LABEL: saddsat_i128_sv:
5480 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
5481 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
5482 ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
5483 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5484 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5485 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5486 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5487 ; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5488 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5489 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5490 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5491 ; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5492 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5493 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5
5494 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5495 ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6
5496 ; GFX11-NEXT: v_add_co_u32 v6, null, 0x80000000, v3
5497 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
5498 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
5499 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
5500 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
5501 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_cndmask_b32 v3, v5, v6
5502 ; GFX11-NEXT: ; return to shader part epilog
5503 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5504 %cast = bitcast i128 %result to <4 x float>
5505 ret <4 x float> %cast
; Signed saturating add of two i128 values with mixed operand placement:
; %lhs is not marked inreg and %rhs is. In the checked output %lhs is read
; from v[0:3] and %rhs from s[0:3]; the result is bitcast to <4 x float> and
; left in v[0:3].
;
; Because %rhs lives in scalar registers here, the "rhs < 0" half of the
; overflow test is checked as compares against s[2:3] rather than against a
; vector register pair.
5508 define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
5509 ; GFX6-LABEL: saddsat_i128_vs:
5511 ; GFX6-NEXT: v_mov_b32_e32 v5, s1
5512 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v0
5513 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
5514 ; GFX6-NEXT: v_mov_b32_e32 v6, s2
5515 ; GFX6-NEXT: v_mov_b32_e32 v7, s3
5516 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc
5517 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc
5518 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5519 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5520 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5521 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5522 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5523 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5524 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5525 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5526 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5527 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
5528 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5529 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5530 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
5531 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
5532 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5533 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5534 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5535 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5536 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5537 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5538 ; GFX6-NEXT: ; return to shader part epilog
5540 ; GFX8-LABEL: saddsat_i128_vs:
5542 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
5543 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0
5544 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
5545 ; GFX8-NEXT: v_mov_b32_e32 v6, s2
5546 ; GFX8-NEXT: v_mov_b32_e32 v7, s3
5547 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc
5548 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc
5549 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5550 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0
5551 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5552 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5553 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5554 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5555 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5556 ; GFX8-NEXT: s_cselect_b32 s4, 1, 0
5557 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5558 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5559 ; GFX8-NEXT: s_and_b32 s0, 1, s4
5560 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
5561 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5562 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
5563 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5564 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
5565 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
5566 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
5567 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5568 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5569 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5570 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5571 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5572 ; GFX8-NEXT: ; return to shader part epilog
5574 ; GFX9-LABEL: saddsat_i128_vs:
5576 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
5577 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0
5578 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
5579 ; GFX9-NEXT: v_mov_b32_e32 v6, s2
5580 ; GFX9-NEXT: v_mov_b32_e32 v7, s3
5581 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v6, vcc
5582 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
5583 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5584 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0
5585 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5586 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5587 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5588 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5589 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5590 ; GFX9-NEXT: s_cselect_b32 s4, 1, 0
5591 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5592 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5593 ; GFX9-NEXT: s_and_b32 s0, 1, s4
5594 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
5595 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5596 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
5597 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5598 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
5599 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
5600 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
5601 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5602 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
5603 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
5604 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
5605 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
5606 ; GFX9-NEXT: ; return to shader part epilog
5608 ; GFX10-LABEL: saddsat_i128_vs:
5610 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0
5611 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5612 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5613 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5614 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5615 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
5616 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
5617 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0
5618 ; GFX10-NEXT: s_and_b32 s0, 1, s0
5619 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5620 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5621 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
5622 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
5623 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5624 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5625 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5626 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5627 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
5628 ; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
5629 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
5630 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
5631 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5632 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5633 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5634 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
5635 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
5636 ; GFX10-NEXT: ; return to shader part epilog
5638 ; GFX11-LABEL: saddsat_i128_vs:
5640 ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0
5641 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5642 ; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5643 ; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5644 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5645 ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
5646 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
5647 ; GFX11-NEXT: s_cselect_b32 s0, 1, 0
5648 ; GFX11-NEXT: s_and_b32 s0, 1, s0
5649 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5650 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5651 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
5652 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
5653 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5654 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5655 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
5656 ; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
5657 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5658 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
5659 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5660 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
5661 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5662 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
5663 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
5664 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
5665 ; GFX11-NEXT: ; return to shader part epilog
5666 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5667 %cast = bitcast i128 %result to <4 x float>
5668 ret <4 x float> %cast
; Signed saturating add of two <2 x i128> vectors with neither operand marked
; inreg. In the checked output %lhs occupies v[0:7] and %rhs v[8:15]; element
; 0 of the result is produced in v[0:3] and element 1 in v[4:7], and the
; function returns with s_setpc_b64 s[30:31].
;
; Each i128 element is saturated independently: the checks show two separate
; four-dword carry-chained adds, each followed by its own overflow test and
; select sequence.
5671 define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
5672 ; GFX6-LABEL: v_saddsat_v2i128:
5674 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5675 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v0, v8
5676 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v9, vcc
5677 ; GFX6-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc
5678 ; GFX6-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc
5679 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5680 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5681 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5682 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5683 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5684 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v17
5685 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5686 ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5687 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5688 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5689 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
5690 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5691 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
5692 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
5693 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5694 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5695 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
5696 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc
5697 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
5698 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
5699 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v12
5700 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc
5701 ; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc
5702 ; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc
5703 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5704 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
5705 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5706 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5707 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5708 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11
5709 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
5710 ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5711 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5712 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5713 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
5714 ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4
5715 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6
5716 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
5717 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
5718 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
5719 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
5720 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
5721 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc
5722 ; GFX6-NEXT: s_setpc_b64 s[30:31]
5724 ; GFX8-LABEL: v_saddsat_v2i128:
5726 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5727 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v8
5728 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v9, vcc
5729 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc
5730 ; GFX8-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc
5731 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5732 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5733 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5734 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5735 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5736 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v17
5737 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5738 ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5739 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5740 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5741 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
5742 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
5743 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
5744 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
5745 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
5746 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5747 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
5748 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc
5749 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
5750 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
5751 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v12
5752 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc
5753 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc
5754 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc
5755 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5756 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
5757 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5758 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5759 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5760 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11
5761 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
5762 ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5763 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5764 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5765 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
5766 ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4
5767 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6
5768 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
5769 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
5770 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
5771 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
5772 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
5773 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc
5774 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5776 ; GFX9-LABEL: v_saddsat_v2i128:
5778 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5779 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8
5780 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v9, vcc
5781 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v2, v10, vcc
5782 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, v3, v11, vcc
5783 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5784 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
5785 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5786 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5787 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5788 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17
5789 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5790 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5791 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
5792 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5793 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
5794 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
5795 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
5796 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
5797 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
5798 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5799 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
5800 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc
5801 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
5802 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
5803 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v12
5804 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v13, vcc
5805 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v6, v14, vcc
5806 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v15, vcc
5807 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5808 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
5809 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5810 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5811 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5812 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11
5813 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
5814 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5815 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
5816 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5817 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
5818 ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
5819 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6
5820 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
5821 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
5822 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
5823 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
5824 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
5825 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc
5826 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5828 ; GFX10-LABEL: v_saddsat_v2i128:
5830 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5831 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8
5832 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
5833 ; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
5834 ; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5835 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5836 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5837 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5838 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5839 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5840 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5841 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5842 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5843 ; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12
5844 ; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
5845 ; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
5846 ; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
5847 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5848 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
5849 ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
5850 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
5851 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5852 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
5853 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
5854 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5855 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
5856 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5857 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
5858 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19
5859 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5860 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5861 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17
5862 ; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6
5863 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
5864 ; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3
5865 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5866 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
5867 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
5868 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo
5869 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v1
5870 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
5871 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo
5872 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5
5873 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4
5874 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4
5875 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4
5876 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v19, v7, s4
5877 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5879 ; GFX11-LABEL: v_saddsat_v2i128:
5881 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5882 ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8
5883 ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
5884 ; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
5885 ; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5886 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5887 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5888 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5889 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5890 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5891 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5892 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5893 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5894 ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12
5895 ; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
5896 ; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
5897 ; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
5898 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5899 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
5900 ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
5901 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
5902 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
5903 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
5904 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
5905 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
5906 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
5907 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
5908 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19
5909 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
5910 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
5911 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17
5912 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
5913 ; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6
5914 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
5915 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
5916 ; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3
5917 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
5918 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
5919 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1
5920 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
5921 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo
5922 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
5923 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0
5924 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0
5925 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0
5926 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v19, v7, s0
5927 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5928 %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
5929 ret <2 x i128> %result
5932 define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
5933 ; GFX6-LABEL: s_saddsat_v2i128:
5935 ; GFX6-NEXT: s_add_u32 s8, s0, s8
5936 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
5937 ; GFX6-NEXT: s_addc_u32 s9, s1, s9
5938 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
5939 ; GFX6-NEXT: s_addc_u32 s16, s2, s10
5940 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
5941 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
5942 ; GFX6-NEXT: s_addc_u32 s17, s3, s11
5943 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
5944 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5945 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
5946 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
5947 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
5948 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
5949 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5950 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
5951 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
5952 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5953 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5954 ; GFX6-NEXT: s_ashr_i32 s0, s17, 31
5955 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5956 ; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
5957 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
5958 ; GFX6-NEXT: v_mov_b32_e32 v2, s8
5959 ; GFX6-NEXT: v_mov_b32_e32 v3, s9
5960 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5961 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
5962 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
5963 ; GFX6-NEXT: v_mov_b32_e32 v0, s1
5964 ; GFX6-NEXT: v_mov_b32_e32 v2, s16
5965 ; GFX6-NEXT: v_mov_b32_e32 v3, s17
5966 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
5967 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
5968 ; GFX6-NEXT: s_add_u32 s0, s4, s12
5969 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
5970 ; GFX6-NEXT: s_addc_u32 s1, s5, s13
5971 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
5972 ; GFX6-NEXT: s_addc_u32 s2, s6, s14
5973 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
5974 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
5975 ; GFX6-NEXT: s_addc_u32 s3, s7, s15
5976 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
5977 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
5978 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5979 ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
5980 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
5981 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
5982 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5983 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
5984 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
5985 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
5986 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
5987 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31
5988 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
5989 ; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
5990 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
5991 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
5992 ; GFX6-NEXT: v_mov_b32_e32 v3, s1
5993 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
5994 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
5995 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
5996 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
5997 ; GFX6-NEXT: v_mov_b32_e32 v8, s2
5998 ; GFX6-NEXT: v_mov_b32_e32 v9, s3
5999 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
6000 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
6001 ; GFX6-NEXT: v_readfirstlane_b32 s0, v4
6002 ; GFX6-NEXT: v_readfirstlane_b32 s1, v5
6003 ; GFX6-NEXT: v_readfirstlane_b32 s2, v6
6004 ; GFX6-NEXT: v_readfirstlane_b32 s3, v7
6005 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
6006 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2
6007 ; GFX6-NEXT: v_readfirstlane_b32 s6, v1
6008 ; GFX6-NEXT: v_readfirstlane_b32 s7, v3
6009 ; GFX6-NEXT: ; return to shader part epilog
6011 ; GFX8-LABEL: s_saddsat_v2i128:
6013 ; GFX8-NEXT: s_add_u32 s8, s0, s8
6014 ; GFX8-NEXT: s_addc_u32 s9, s1, s9
6015 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
6016 ; GFX8-NEXT: s_addc_u32 s16, s2, s10
6017 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
6018 ; GFX8-NEXT: s_addc_u32 s17, s3, s11
6019 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
6020 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
6021 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
6022 ; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
6023 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
6024 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6025 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
6026 ; GFX8-NEXT: s_and_b32 s0, 1, s2
6027 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6028 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
6029 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0
6030 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
6031 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
6032 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6033 ; GFX8-NEXT: s_and_b32 s0, 1, s2
6034 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
6035 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6036 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
6037 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
6038 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31
6039 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
6040 ; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
6041 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
6042 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
6043 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
6044 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6045 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
6046 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
6047 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
6048 ; GFX8-NEXT: v_mov_b32_e32 v2, s16
6049 ; GFX8-NEXT: v_mov_b32_e32 v3, s17
6050 ; GFX8-NEXT: s_add_u32 s0, s4, s12
6051 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
6052 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
6053 ; GFX8-NEXT: s_addc_u32 s1, s5, s13
6054 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
6055 ; GFX8-NEXT: s_addc_u32 s2, s6, s14
6056 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
6057 ; GFX8-NEXT: s_addc_u32 s3, s7, s15
6058 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
6059 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
6060 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
6061 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6062 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0
6063 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6064 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
6065 ; GFX8-NEXT: s_and_b32 s4, 1, s6
6066 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6067 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
6068 ; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0
6069 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
6070 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0
6071 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6072 ; GFX8-NEXT: s_and_b32 s4, 1, s6
6073 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
6074 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6075 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
6076 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
6077 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31
6078 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
6079 ; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
6080 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
6081 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
6082 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
6083 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6084 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
6085 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
6086 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
6087 ; GFX8-NEXT: v_mov_b32_e32 v8, s2
6088 ; GFX8-NEXT: v_mov_b32_e32 v9, s3
6089 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
6090 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
6091 ; GFX8-NEXT: v_readfirstlane_b32 s0, v4
6092 ; GFX8-NEXT: v_readfirstlane_b32 s1, v5
6093 ; GFX8-NEXT: v_readfirstlane_b32 s2, v6
6094 ; GFX8-NEXT: v_readfirstlane_b32 s3, v7
6095 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0
6096 ; GFX8-NEXT: v_readfirstlane_b32 s5, v2
6097 ; GFX8-NEXT: v_readfirstlane_b32 s6, v1
6098 ; GFX8-NEXT: v_readfirstlane_b32 s7, v3
6099 ; GFX8-NEXT: ; return to shader part epilog
6101 ; GFX9-LABEL: s_saddsat_v2i128:
6103 ; GFX9-NEXT: s_add_u32 s8, s0, s8
6104 ; GFX9-NEXT: s_addc_u32 s9, s1, s9
6105 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
6106 ; GFX9-NEXT: s_addc_u32 s16, s2, s10
6107 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
6108 ; GFX9-NEXT: s_addc_u32 s17, s3, s11
6109 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6110 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
6111 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
6112 ; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
6113 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
6114 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6115 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
6116 ; GFX9-NEXT: s_and_b32 s0, 1, s2
6117 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6118 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
6119 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
6120 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
6121 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
6122 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
6123 ; GFX9-NEXT: s_and_b32 s0, 1, s2
6124 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
6125 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6126 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
6127 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
6128 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31
6129 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
6130 ; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
6131 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
6132 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
6133 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
6134 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6135 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
6136 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
6137 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
6138 ; GFX9-NEXT: v_mov_b32_e32 v2, s16
6139 ; GFX9-NEXT: v_mov_b32_e32 v3, s17
6140 ; GFX9-NEXT: s_add_u32 s0, s4, s12
6141 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
6142 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
6143 ; GFX9-NEXT: s_addc_u32 s1, s5, s13
6144 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
6145 ; GFX9-NEXT: s_addc_u32 s2, s6, s14
6146 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
6147 ; GFX9-NEXT: s_addc_u32 s3, s7, s15
6148 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
6149 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
6150 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
6151 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6152 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0
6153 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
6154 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
6155 ; GFX9-NEXT: s_and_b32 s4, 1, s6
6156 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
6157 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
6158 ; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0
6159 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
6160 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0
6161 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
6162 ; GFX9-NEXT: s_and_b32 s4, 1, s6
6163 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
6164 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6165 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
6166 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
6167 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
6168 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
6169 ; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
6170 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6171 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
6172 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
6173 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
6174 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
6175 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
6176 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
6177 ; GFX9-NEXT: v_mov_b32_e32 v8, s2
6178 ; GFX9-NEXT: v_mov_b32_e32 v9, s3
6179 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
6180 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
6181 ; GFX9-NEXT: v_readfirstlane_b32 s0, v4
6182 ; GFX9-NEXT: v_readfirstlane_b32 s1, v5
6183 ; GFX9-NEXT: v_readfirstlane_b32 s2, v6
6184 ; GFX9-NEXT: v_readfirstlane_b32 s3, v7
6185 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0
6186 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2
6187 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1
6188 ; GFX9-NEXT: v_readfirstlane_b32 s7, v3
6189 ; GFX9-NEXT: ; return to shader part epilog
6191 ; GFX10-LABEL: s_saddsat_v2i128:
6193 ; GFX10-NEXT: s_add_u32 s8, s0, s8
6194 ; GFX10-NEXT: s_addc_u32 s9, s1, s9
6195 ; GFX10-NEXT: s_addc_u32 s16, s2, s10
6196 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
6197 ; GFX10-NEXT: s_addc_u32 s17, s3, s11
6198 ; GFX10-NEXT: v_mov_b32_e32 v4, s9
6199 ; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
6200 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0
6201 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
6202 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
6203 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
6204 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
6205 ; GFX10-NEXT: s_and_b32 s0, 1, s18
6206 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
6207 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
6208 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
6209 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
6210 ; GFX10-NEXT: s_and_b32 s1, 1, s1
6211 ; GFX10-NEXT: s_ashr_i32 s10, s17, 31
6212 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
6213 ; GFX10-NEXT: s_add_u32 s11, s10, 0x80000000
6214 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6215 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
6216 ; GFX10-NEXT: s_add_u32 s0, s4, s12
6217 ; GFX10-NEXT: s_addc_u32 s1, s5, s13
6218 ; GFX10-NEXT: s_addc_u32 s2, s6, s14
6219 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6220 ; GFX10-NEXT: s_addc_u32 s3, s7, s15
6221 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
6222 ; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6223 ; GFX10-NEXT: v_mov_b32_e32 v5, s0
6224 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0
6225 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
6226 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6227 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0
6228 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
6229 ; GFX10-NEXT: v_mov_b32_e32 v6, s1
6230 ; GFX10-NEXT: v_mov_b32_e32 v7, s3
6231 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
6232 ; GFX10-NEXT: s_and_b32 s4, 1, s12
6233 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
6234 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
6235 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0
6236 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
6237 ; GFX10-NEXT: s_and_b32 s5, 1, s5
6238 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
6239 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6240 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
6241 ; GFX10-NEXT: v_mov_b32_e32 v0, s16
6242 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4
6243 ; GFX10-NEXT: v_mov_b32_e32 v3, s8
6244 ; GFX10-NEXT: s_ashr_i32 s4, s3, 31
6245 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
6246 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, vcc_lo
6247 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
6248 ; GFX10-NEXT: v_mov_b32_e32 v2, s17
6249 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo
6250 ; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
6251 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4
6252 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
6253 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
6254 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
6255 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
6256 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0
6257 ; GFX10-NEXT: v_readfirstlane_b32 s3, v2
6258 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6259 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6260 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6261 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6262 ; GFX10-NEXT: v_readfirstlane_b32 s0, v3
6263 ; GFX10-NEXT: v_readfirstlane_b32 s4, v5
6264 ; GFX10-NEXT: v_readfirstlane_b32 s5, v6
6265 ; GFX10-NEXT: v_readfirstlane_b32 s6, v1
6266 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7
6267 ; GFX10-NEXT: ; return to shader part epilog
6269 ; GFX11-LABEL: s_saddsat_v2i128:
6271 ; GFX11-NEXT: s_add_u32 s8, s0, s8
6272 ; GFX11-NEXT: s_addc_u32 s9, s1, s9
6273 ; GFX11-NEXT: s_addc_u32 s16, s2, s10
6274 ; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
6275 ; GFX11-NEXT: s_addc_u32 s17, s3, s11
6276 ; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
6277 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0
6278 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
6279 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
6280 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
6281 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
6282 ; GFX11-NEXT: s_and_b32 s0, 1, s18
6283 ; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
6284 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
6285 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0
6286 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
6287 ; GFX11-NEXT: s_and_b32 s1, 1, s1
6288 ; GFX11-NEXT: s_ashr_i32 s10, s17, 31
6289 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
6290 ; GFX11-NEXT: s_add_u32 s11, s10, 0x80000000
6291 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6292 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
6293 ; GFX11-NEXT: s_add_u32 s0, s4, s12
6294 ; GFX11-NEXT: s_addc_u32 s1, s5, s13
6295 ; GFX11-NEXT: s_addc_u32 s2, s6, s14
6296 ; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6297 ; GFX11-NEXT: s_addc_u32 s3, s7, s15
6298 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
6299 ; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
6300 ; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
6301 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
6302 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
6303 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0
6304 ; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0
6305 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_and_b32 v0, 1, v0
6306 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
6307 ; GFX11-NEXT: s_and_b32 s4, 1, s12
6308 ; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0
6309 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
6310 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0
6311 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
6312 ; GFX11-NEXT: s_and_b32 s5, 1, s5
6313 ; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
6314 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6315 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4
6316 ; GFX11-NEXT: v_mov_b32_e32 v3, s8
6317 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
6318 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
6319 ; GFX11-NEXT: s_ashr_i32 s4, s3, 31
6320 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
6321 ; GFX11-NEXT: v_mov_b32_e32 v4, s9
6322 ; GFX11-NEXT: v_mov_b32_e32 v2, s17
6323 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo
6324 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, vcc_lo
6325 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
6326 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
6327 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
6328 ; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
6329 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
6330 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
6331 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
6332 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0
6333 ; GFX11-NEXT: v_readfirstlane_b32 s3, v2
6334 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
6335 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
6336 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
6337 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
6338 ; GFX11-NEXT: v_readfirstlane_b32 s0, v3
6339 ; GFX11-NEXT: v_readfirstlane_b32 s4, v5
6340 ; GFX11-NEXT: v_readfirstlane_b32 s5, v6
6341 ; GFX11-NEXT: v_readfirstlane_b32 s6, v1
6342 ; GFX11-NEXT: v_readfirstlane_b32 s7, v7
6343 ; GFX11-NEXT: ; return to shader part epilog
6344 %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
6345 ret <2 x i128> %result
; Declarations of the llvm.sadd.sat.* intrinsic overloads, grouped by
; element bit width. Every declaration references attribute group #0.

; i7 / i8 scalars and i8 vectors.
6348 declare i7 @llvm.sadd.sat.i7(i7, i7) #0
6349 declare i8 @llvm.sadd.sat.i8(i8, i8) #0
6350 declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) #0
6351 declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) #0
; i16 scalar and i16 vectors (2, 3, 4, 5, 6 and 8 elements).
6353 declare i16 @llvm.sadd.sat.i16(i16, i16) #0
6354 declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
6355 declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
6356 declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
6357 declare <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16>, <5 x i16>) #0
6358 declare <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16>, <6 x i16>) #0
6359 declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) #0
; i24 scalar.
6361 declare i24 @llvm.sadd.sat.i24(i24, i24) #0
; i32 scalar and i32 vectors (2, 3, 4, 5 and 16 elements).
6363 declare i32 @llvm.sadd.sat.i32(i32, i32) #0
6364 declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
6365 declare <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32>, <3 x i32>) #0
6366 declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) #0
6367 declare <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32>, <5 x i32>) #0
6368 declare <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32>, <16 x i32>) #0
; i48 scalar.
6370 declare i48 @llvm.sadd.sat.i48(i48, i48) #0
; i64 scalar and <2 x i64> vector.
6372 declare i64 @llvm.sadd.sat.i64(i64, i64) #0
6373 declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) #0
; i128 scalar and <2 x i128> vector.
6375 declare i128 @llvm.sadd.sat.i128(i128, i128) #0
6376 declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>) #0
; Attribute group #0, shared by all intrinsic declarations above.
6378 attributes #0 = { nounwind readnone speculatable willreturn }