1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; Unsigned saturating add (llvm.uadd.sat.i7) of two i7 arguments.
; The expected code below works on v0/v1 with vector (v_*) instructions.
; Every target shifts both operands left, adds with saturation at the wider
; width, and shifts the result back right: the GFX6 checks shift by 25 and
; use not + min_u32 + add, while the other targets shift by 9 and use a
; 16-bit add carrying the clamp modifier.
8 define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
9 ; GFX6-LABEL: v_uaddsat_i7:
11 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0
13 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1
14 ; GFX6-NEXT: v_not_b32_e32 v2, v0
15 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
17 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0
18 ; GFX6-NEXT: s_setpc_b64 s[30:31]
20 ; GFX8-LABEL: v_uaddsat_i7:
22 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0
24 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1
25 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
26 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
27 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29 ; GFX9-LABEL: v_uaddsat_i7:
31 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
33 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
34 ; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
35 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
36 ; GFX9-NEXT: s_setpc_b64 s[30:31]
38 ; GFX10PLUS-LABEL: v_uaddsat_i7:
40 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
42 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
43 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
44 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
45 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
46 %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
; Same i7 unsigned saturating add, but as an amdgpu_ps function whose
; arguments are marked inreg; the expected code takes its inputs from s0/s1.
; The GFX6 checks stay entirely on scalar (s_*) instructions
; (shift by 25, not, min_u32, add, shift back).
; GFX8 and GFX9 shift on the scalar side, copy s1 into v0, perform the
; clamped 16-bit add as a vector instruction, and move the result back to s0
; with v_readfirstlane. The GFX10PLUS checks feed s0 and s1 straight into the
; clamped vector add without the extra copy.
50 define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
51 ; GFX6-LABEL: s_uaddsat_i7:
53 ; GFX6-NEXT: s_lshl_b32 s0, s0, 25
54 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25
55 ; GFX6-NEXT: s_not_b32 s2, s0
56 ; GFX6-NEXT: s_min_u32 s1, s2, s1
57 ; GFX6-NEXT: s_add_i32 s0, s0, s1
58 ; GFX6-NEXT: s_lshr_b32 s0, s0, 25
59 ; GFX6-NEXT: ; return to shader part epilog
61 ; GFX8-LABEL: s_uaddsat_i7:
63 ; GFX8-NEXT: s_lshl_b32 s1, s1, 9
64 ; GFX8-NEXT: s_lshl_b32 s0, s0, 9
65 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
66 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
67 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
68 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
69 ; GFX8-NEXT: ; return to shader part epilog
71 ; GFX9-LABEL: s_uaddsat_i7:
73 ; GFX9-NEXT: s_lshl_b32 s1, s1, 9
74 ; GFX9-NEXT: s_lshl_b32 s0, s0, 9
75 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
76 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
77 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
78 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
79 ; GFX9-NEXT: ; return to shader part epilog
81 ; GFX10PLUS-LABEL: s_uaddsat_i7:
83 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
84 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
85 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
86 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
87 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
88 ; GFX10PLUS-NEXT: ; return to shader part epilog
89 %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
; Unsigned saturating add (llvm.uadd.sat.i8) of two i8 arguments, with the
; expected code operating on v0/v1 using vector (v_*) instructions.
; The shape matches the i7 test but with different shift amounts: the GFX6
; checks shift by 24 around a not + min_u32 + add sequence, and the other
; targets shift by 8 around a 16-bit add carrying the clamp modifier.
94 define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
95 ; GFX6-LABEL: v_uaddsat_i8:
97 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
99 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
100 ; GFX6-NEXT: v_not_b32_e32 v2, v0
101 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
102 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
103 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
104 ; GFX6-NEXT: s_setpc_b64 s[30:31]
106 ; GFX8-LABEL: v_uaddsat_i8:
108 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
110 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
111 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
112 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
113 ; GFX8-NEXT: s_setpc_b64 s[30:31]
115 ; GFX9-LABEL: v_uaddsat_i8:
117 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
119 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
120 ; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
121 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
122 ; GFX9-NEXT: s_setpc_b64 s[30:31]
124 ; GFX10PLUS-LABEL: v_uaddsat_i8:
125 ; GFX10PLUS: ; %bb.0:
126 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
128 ; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
129 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
130 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
131 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
132 %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
; i8 unsigned saturating add as an amdgpu_ps function with inreg arguments;
; the expected code takes its inputs from s0/s1.
; The GFX6 checks are fully scalar (shift by 24, not, min_u32, add, shift
; back). GFX8 and GFX9 shift by 8 on the scalar side, copy s1 into v0, do the
; clamped 16-bit add as a vector instruction, and return the value to s0 via
; v_readfirstlane. The GFX10PLUS checks pass s0 and s1 directly to the clamped
; vector add.
136 define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
137 ; GFX6-LABEL: s_uaddsat_i8:
139 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
140 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
141 ; GFX6-NEXT: s_not_b32 s2, s0
142 ; GFX6-NEXT: s_min_u32 s1, s2, s1
143 ; GFX6-NEXT: s_add_i32 s0, s0, s1
144 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24
145 ; GFX6-NEXT: ; return to shader part epilog
147 ; GFX8-LABEL: s_uaddsat_i8:
149 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
150 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
151 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
152 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
153 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
154 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
155 ; GFX8-NEXT: ; return to shader part epilog
157 ; GFX9-LABEL: s_uaddsat_i8:
159 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
160 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
161 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
162 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
163 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
164 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
165 ; GFX9-NEXT: ; return to shader part epilog
167 ; GFX10PLUS-LABEL: s_uaddsat_i8:
168 ; GFX10PLUS: ; %bb.0:
169 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
170 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
171 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
172 ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
173 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
174 ; GFX10PLUS-NEXT: ; return to shader part epilog
175 %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
; Unsigned saturating add on <2 x i8>. The vector is passed and produced as
; an i16: both arguments are bitcast to <2 x i8>, fed to llvm.uadd.sat.v2i8,
; and the result is bitcast back to i16.
; The GFX6 checks handle each byte with its own 32-bit shift/not/min/add
; sequence and rejoin the two results with a shift by 8 and an or.
; The GFX8 checks use two clamped 16-bit adds; GFX9 and later use a single
; packed clamped add (v_pk_add_u16).
; GFX10 and GFX11 have separate check blocks in this test: the final byte
; repacking uses SDWA and/or on GFX10 and plain shift/and/or on GFX11.
178 define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
179 ; GFX6-LABEL: v_uaddsat_v2i8:
181 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
183 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
184 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
185 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
186 ; GFX6-NEXT: v_not_b32_e32 v4, v0
187 ; GFX6-NEXT: v_min_u32_e32 v1, v4, v1
188 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
189 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
190 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
191 ; GFX6-NEXT: v_not_b32_e32 v3, v1
192 ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
193 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
194 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
195 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
196 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
197 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
198 ; GFX6-NEXT: s_setpc_b64 s[30:31]
200 ; GFX8-LABEL: v_uaddsat_v2i8:
202 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
204 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
205 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
206 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
207 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
208 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
209 ; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp
210 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
211 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
212 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
213 ; GFX8-NEXT: s_setpc_b64 s[30:31]
215 ; GFX9-LABEL: v_uaddsat_v2i8:
217 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
219 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
220 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
221 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
222 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
223 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
224 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
225 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
226 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
227 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
228 ; GFX9-NEXT: s_movk_i32 s4, 0xff
229 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
230 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
231 ; GFX9-NEXT: s_setpc_b64 s[30:31]
233 ; GFX10-LABEL: v_uaddsat_v2i8:
235 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
237 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
238 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
239 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
240 ; GFX10-NEXT: s_movk_i32 s4, 0xff
241 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
242 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
243 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
244 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
245 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
246 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
247 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
248 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
249 ; GFX10-NEXT: s_setpc_b64 s[30:31]
251 ; GFX11-LABEL: v_uaddsat_v2i8:
253 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
255 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
256 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
257 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
258 ; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
259 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
260 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
261 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
262 ; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
263 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
264 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
265 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
266 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
267 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
268 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
269 ; GFX11-NEXT: s_setpc_b64 s[30:31]
270 %lhs = bitcast i16 %lhs.arg to <2 x i8>
271 %rhs = bitcast i16 %rhs.arg to <2 x i8>
272 %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
273 %cast.result = bitcast <2 x i8> %result to i16
; <2 x i8> unsigned saturating add as an amdgpu_ps function with inreg i16
; arguments (bitcast to <2 x i8> and back); inputs arrive in s0/s1.
; The GFX6 checks are fully scalar: one shift/not/min/add sequence per byte,
; then a shift by 8 and an or to rejoin the bytes.
; The GFX8 checks prepare operands with scalar shifts, run two clamped 16-bit
; vector adds, and read the combined result back with v_readfirstlane.
; GFX9 and later pack the bytes into 16-bit halves with s_pack_ll_b32_b16,
; use one packed clamped add (v_pk_add_u16), repack, and read back to s0.
; GFX10 and GFX11 are checked separately; they differ in the final repacking
; (SDWA and/or versus plain shift/and/or).
277 define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
278 ; GFX6-LABEL: s_uaddsat_v2i8:
280 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
281 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
282 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8
283 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
284 ; GFX6-NEXT: s_not_b32 s4, s0
285 ; GFX6-NEXT: s_min_u32 s1, s4, s1
286 ; GFX6-NEXT: s_add_i32 s0, s0, s1
287 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
288 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
289 ; GFX6-NEXT: s_not_b32 s3, s1
290 ; GFX6-NEXT: s_min_u32 s2, s3, s2
291 ; GFX6-NEXT: s_add_i32 s1, s1, s2
292 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24
293 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24
294 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
295 ; GFX6-NEXT: s_or_b32 s0, s0, s1
296 ; GFX6-NEXT: ; return to shader part epilog
298 ; GFX8-LABEL: s_uaddsat_v2i8:
300 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8
301 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
302 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
303 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
304 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
305 ; GFX8-NEXT: s_lshl_b32 s1, s3, 8
306 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
307 ; GFX8-NEXT: s_lshl_b32 s0, s2, 8
308 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
309 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp
310 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
311 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
312 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
313 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
314 ; GFX8-NEXT: ; return to shader part epilog
316 ; GFX9-LABEL: s_uaddsat_v2i8:
318 ; GFX9-NEXT: s_lshr_b32 s2, s0, 8
319 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
320 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
321 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
322 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
323 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
324 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
325 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
326 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16
327 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
328 ; GFX9-NEXT: s_lshl_b32 s2, s2, 8
329 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
330 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
331 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
332 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
333 ; GFX9-NEXT: s_movk_i32 s0, 0xff
334 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
335 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
336 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
337 ; GFX9-NEXT: ; return to shader part epilog
339 ; GFX10-LABEL: s_uaddsat_v2i8:
341 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
342 ; GFX10-NEXT: s_lshr_b32 s3, s1, 8
343 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
344 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
345 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
346 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
347 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
348 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8
349 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
350 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
351 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
352 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
353 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
354 ; GFX10-NEXT: s_movk_i32 s0, 0xff
355 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
356 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
357 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
358 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
359 ; GFX10-NEXT: ; return to shader part epilog
361 ; GFX11-LABEL: s_uaddsat_v2i8:
363 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
364 ; GFX11-NEXT: s_lshr_b32 s3, s1, 8
365 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
366 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
367 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
368 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
369 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
370 ; GFX11-NEXT: s_lshl_b32 s2, s2, 8
371 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
372 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8
373 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
374 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
375 ; GFX11-NEXT: v_pk_add_u16 v0, s0, s1 clamp
376 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
377 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
378 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
379 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
380 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
381 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
382 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
383 ; GFX11-NEXT: ; return to shader part epilog
384 %lhs = bitcast i16 %lhs.arg to <2 x i8>
385 %rhs = bitcast i16 %rhs.arg to <2 x i8>
386 %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
387 %cast.result = bitcast <2 x i8> %result to i16
; Unsigned saturating add on <4 x i8>. The vector is passed and produced as
; an i32: both arguments are bitcast to <4 x i8>, fed to llvm.uadd.sat.v4i8,
; and the result is bitcast back to i32.
; The GFX6 checks run four independent 32-bit shift/not/min/add sequences
; (one per byte) and merge the results with alignbit, shifts and ors.
; The GFX8 checks use four clamped 16-bit adds and merge with SDWA and/or.
; GFX9 and later split the four bytes into two packed pairs and use two
; packed clamped adds (v_pk_add_u16) before merging with v_or3_b32.
; GFX10 and GFX11 are checked separately; the GFX11 block extracts bytes
; with v_bfe_u32 where the GFX10 block uses SDWA shifts.
391 define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
392 ; GFX6-LABEL: v_uaddsat_v4i8:
394 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
396 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
397 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
398 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
399 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
400 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
401 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1
402 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
403 ; GFX6-NEXT: v_not_b32_e32 v8, v0
404 ; GFX6-NEXT: v_min_u32_e32 v1, v8, v1
405 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
406 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
407 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5
408 ; GFX6-NEXT: v_not_b32_e32 v5, v1
409 ; GFX6-NEXT: v_min_u32_e32 v2, v5, v2
410 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
411 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
412 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6
413 ; GFX6-NEXT: v_not_b32_e32 v5, v2
414 ; GFX6-NEXT: v_min_u32_e32 v3, v5, v3
415 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
416 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
417 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7
418 ; GFX6-NEXT: v_not_b32_e32 v5, v3
419 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
420 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
421 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2
422 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
423 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3
424 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 24
425 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
426 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
427 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3
428 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
429 ; GFX6-NEXT: s_setpc_b64 s[30:31]
431 ; GFX8-LABEL: v_uaddsat_v4i8:
433 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
435 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
436 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
437 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
438 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
439 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
440 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1
441 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
442 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
443 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
444 ; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp
445 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4
446 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6
447 ; GFX8-NEXT: v_add_u16_e64 v2, v2, v3 clamp
448 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5
449 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7
450 ; GFX8-NEXT: v_add_u16_e64 v3, v3, v4 clamp
451 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
452 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
453 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
454 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
455 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
456 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
457 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
458 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
459 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
460 ; GFX8-NEXT: s_setpc_b64 s[30:31]
462 ; GFX9-LABEL: v_uaddsat_v4i8:
464 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
466 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
467 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
468 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
469 ; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
470 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
471 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
472 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
473 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
474 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
475 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
476 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
477 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
478 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
479 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 clamp
480 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
481 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
482 ; GFX9-NEXT: v_mov_b32_e32 v2, 8
483 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
484 ; GFX9-NEXT: s_movk_i32 s4, 0xff
485 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
486 ; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
487 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0
488 ; GFX9-NEXT: v_mov_b32_e32 v3, 24
489 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
490 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
491 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
492 ; GFX9-NEXT: s_setpc_b64 s[30:31]
494 ; GFX10-LABEL: v_uaddsat_v4i8:
496 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
497 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
498 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
499 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
500 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
501 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
502 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
503 ; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
504 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
505 ; GFX10-NEXT: v_mov_b32_e32 v4, 24
506 ; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
507 ; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
508 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
509 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
510 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
511 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
512 ; GFX10-NEXT: v_pk_add_u16 v2, v2, v3 clamp
513 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
514 ; GFX10-NEXT: v_mov_b32_e32 v1, 8
515 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1]
516 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
517 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
518 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0
519 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
520 ; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1
521 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
522 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
523 ; GFX10-NEXT: s_setpc_b64 s[30:31]
525 ; GFX11-LABEL: v_uaddsat_v4i8:
527 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
529 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
530 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
531 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
532 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
533 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
534 ; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
535 ; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
536 ; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
537 ; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
538 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
539 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
540 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
541 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
542 ; GFX11-NEXT: v_pk_add_u16 v2, v2, v3 clamp
543 ; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
544 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
545 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
546 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
547 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
548 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
549 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
550 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
551 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
552 ; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2
553 ; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
554 ; GFX11-NEXT: s_setpc_b64 s[30:31]
555 %lhs = bitcast i32 %lhs.arg to <4 x i8>
556 %rhs = bitcast i32 %rhs.arg to <4 x i8>
557 %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
558 %cast.result = bitcast <4 x i8> %result to i32
; <4 x i8> unsigned saturating add as an amdgpu_ps function with inreg i32
; arguments (bitcast to <4 x i8> and back); inputs arrive in s0/s1.
; The GFX6 checks do the four per-byte shift/not/min/add sequences on scalar
; registers, but the final merge goes through vector instructions
; (v_alignbit_b32, v_or_b32) and is read back to s0 with v_readfirstlane.
; The GFX8 checks use four clamped 16-bit vector adds fed from scalar shifts.
; GFX9 and later pack byte pairs with scalar s_pack_* instructions and use
; two packed clamped adds (v_pk_add_u16); every target ends by moving the
; merged result to s0 with v_readfirstlane.
; GFX10 and GFX11 are checked separately: the GFX11 block packs the upper
; byte pair with s_pack_hl_b32_b16 and extracts bytes with v_bfe_u32, while
; the GFX10 block uses s_pack_ll_b32_b16 throughout and SDWA shifts.
562 define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
563 ; GFX6-LABEL: s_uaddsat_v4i8:
565 ; GFX6-NEXT: s_lshr_b32 s2, s0, 8
566 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16
567 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24
568 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24
569 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8
570 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16
571 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24
572 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24
573 ; GFX6-NEXT: s_not_b32 s8, s0
574 ; GFX6-NEXT: s_min_u32 s1, s8, s1
575 ; GFX6-NEXT: s_add_i32 s0, s0, s1
576 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24
577 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24
578 ; GFX6-NEXT: s_not_b32 s5, s1
579 ; GFX6-NEXT: s_min_u32 s2, s5, s2
580 ; GFX6-NEXT: s_add_i32 s1, s1, s2
581 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24
582 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24
583 ; GFX6-NEXT: s_not_b32 s5, s2
584 ; GFX6-NEXT: s_min_u32 s3, s5, s3
585 ; GFX6-NEXT: s_add_i32 s2, s2, s3
586 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24
587 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24
588 ; GFX6-NEXT: s_not_b32 s5, s3
589 ; GFX6-NEXT: s_min_u32 s4, s5, s4
590 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24
591 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24
592 ; GFX6-NEXT: s_add_i32 s3, s3, s4
593 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
594 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24
595 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
596 ; GFX6-NEXT: s_lshl_b32 s0, s2, 16
597 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
598 ; GFX6-NEXT: s_lshl_b32 s0, s3, 24
599 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
600 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
601 ; GFX6-NEXT: ; return to shader part epilog
603 ; GFX8-LABEL: s_uaddsat_v4i8:
605 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8
606 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16
607 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24
608 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
609 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8
610 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16
611 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24
612 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
613 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
614 ; GFX8-NEXT: s_lshl_b32 s1, s5, 8
615 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
616 ; GFX8-NEXT: s_lshl_b32 s0, s2, 8
617 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
618 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp
619 ; GFX8-NEXT: s_lshl_b32 s1, s6, 8
620 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
621 ; GFX8-NEXT: s_lshl_b32 s0, s3, 8
622 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
623 ; GFX8-NEXT: s_lshl_b32 s1, s7, 8
624 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
625 ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
626 ; GFX8-NEXT: s_lshl_b32 s0, s4, 8
627 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
628 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
629 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
630 ; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
631 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
632 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
633 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
634 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
635 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
636 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
637 ; GFX8-NEXT: ; return to shader part epilog
639 ; GFX9-LABEL: s_uaddsat_v4i8:
641 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8
642 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
643 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24
644 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
645 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
646 ; GFX9-NEXT: s_lshr_b32 s6, s0, 16
647 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
648 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
649 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8
650 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
651 ; GFX9-NEXT: s_lshr_b32 s6, s3, 16
652 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16
653 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24
654 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
655 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
656 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
657 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
658 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16
659 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
660 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
661 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
662 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
663 ; GFX9-NEXT: s_lshr_b32 s6, s4, 16
664 ; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
665 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8
666 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
667 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
668 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
669 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
670 ; GFX9-NEXT: s_mov_b32 s2, 8
671 ; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 clamp
672 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
673 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
674 ; GFX9-NEXT: s_movk_i32 s0, 0xff
675 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
676 ; GFX9-NEXT: s_mov_b32 s5, 24
677 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
678 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
679 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
680 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
681 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
682 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
683 ; GFX9-NEXT: ; return to shader part epilog
685 ; GFX10-LABEL: s_uaddsat_v4i8:
687 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8
688 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
689 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24
690 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8
691 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
692 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
693 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16
694 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24
695 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
696 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16
697 ; GFX10-NEXT: s_lshr_b32 s5, s2, 16
698 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
699 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
700 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
701 ; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
702 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
703 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
704 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
705 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16
706 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16
707 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
708 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
709 ; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
710 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8
711 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
712 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
713 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
714 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp
715 ; GFX10-NEXT: s_mov_b32 s0, 8
716 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
717 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
718 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
719 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
720 ; GFX10-NEXT: s_mov_b32 s0, 24
721 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
722 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
723 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
724 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
725 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
726 ; GFX10-NEXT: ; return to shader part epilog
728 ; GFX11-LABEL: s_uaddsat_v4i8:
730 ; GFX11-NEXT: s_lshr_b32 s2, s0, 8
731 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24
732 ; GFX11-NEXT: s_lshr_b32 s4, s1, 8
733 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24
734 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
735 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
736 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
737 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
738 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
739 ; GFX11-NEXT: s_lshr_b32 s5, s3, 16
740 ; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
741 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
742 ; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
743 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8
744 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
745 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
746 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16
747 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16
748 ; GFX11-NEXT: v_pk_add_u16 v0, s2, s3 clamp
749 ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
750 ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
751 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
752 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8
753 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
754 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
755 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
756 ; GFX11-NEXT: v_pk_add_u16 v1, s0, s1 clamp
757 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
758 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
759 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
760 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
761 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
762 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2
763 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
764 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
765 ; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
766 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
767 ; GFX11-NEXT: ; return to shader part epilog
768 %lhs = bitcast i32 %lhs.arg to <4 x i8>
769 %rhs = bitcast i32 %rhs.arg to <4 x i8>
770 %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
771 %cast.result = bitcast <4 x i8> %result to i32
; i24 llvm.uadd.sat with both operands passed as ordinary (non-inreg) arguments.
; Every run is expected to shift both inputs left by 8, do the saturating add at
; 32 bits (a not/min/add sequence on the GFX6 run, a single clamped add on the
; other runs) and shift the result back right by 8.
775 define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) {
776 ; GFX6-LABEL: v_uaddsat_i24:
778 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
779 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
780 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
781 ; GFX6-NEXT: v_not_b32_e32 v2, v0
782 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
783 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
784 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0
785 ; GFX6-NEXT: s_setpc_b64 s[30:31]
787 ; GFX8-LABEL: v_uaddsat_i24:
789 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0
791 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
792 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp
793 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
794 ; GFX8-NEXT: s_setpc_b64 s[30:31]
796 ; GFX9-LABEL: v_uaddsat_i24:
798 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
799 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
800 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
801 ; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp
802 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
803 ; GFX9-NEXT: s_setpc_b64 s[30:31]
805 ; GFX10PLUS-LABEL: v_uaddsat_i24:
806 ; GFX10PLUS: ; %bb.0:
807 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
808 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 8, v0
809 ; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 8, v1
810 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp
811 ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
812 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
813 %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
; i24 llvm.uadd.sat in an amdgpu_ps function with both operands marked inreg.
; The GFX6 run is expected to stay entirely on scalar instructions (shift left
; by 8, not/min/add, shift right by 8). The other runs are expected to use a
; clamped vector add followed by v_readfirstlane_b32 to put the result in s0;
; the GFX8 and GFX9 runs first copy s1 into v0, while the GFX10PLUS checks feed
; both scalar registers straight into the add.
817 define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
818 ; GFX6-LABEL: s_uaddsat_i24:
820 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8
821 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8
822 ; GFX6-NEXT: s_not_b32 s2, s0
823 ; GFX6-NEXT: s_min_u32 s1, s2, s1
824 ; GFX6-NEXT: s_add_i32 s0, s0, s1
825 ; GFX6-NEXT: s_lshr_b32 s0, s0, 8
826 ; GFX6-NEXT: ; return to shader part epilog
828 ; GFX8-LABEL: s_uaddsat_i24:
830 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8
831 ; GFX8-NEXT: s_lshl_b32 s0, s0, 8
832 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
833 ; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp
834 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
835 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
836 ; GFX8-NEXT: ; return to shader part epilog
838 ; GFX9-LABEL: s_uaddsat_i24:
840 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8
841 ; GFX9-NEXT: s_lshl_b32 s0, s0, 8
842 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
843 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
844 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
845 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
846 ; GFX9-NEXT: ; return to shader part epilog
848 ; GFX10PLUS-LABEL: s_uaddsat_i24:
849 ; GFX10PLUS: ; %bb.0:
850 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
851 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
852 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
853 ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
854 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
855 ; GFX10PLUS-NEXT: ; return to shader part epilog
856 %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
; i32 llvm.uadd.sat with both operands passed as ordinary (non-inreg) arguments.
; The GFX6 run is expected to produce a v_not/v_min_u32/v_add sequence; the
; other runs are expected to produce a single vector add with the clamp modifier.
860 define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
861 ; GFX6-LABEL: v_uaddsat_i32:
863 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
864 ; GFX6-NEXT: v_not_b32_e32 v2, v0
865 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
866 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
867 ; GFX6-NEXT: s_setpc_b64 s[30:31]
869 ; GFX8-LABEL: v_uaddsat_i32:
871 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
872 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp
873 ; GFX8-NEXT: s_setpc_b64 s[30:31]
875 ; GFX9-LABEL: v_uaddsat_i32:
877 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
878 ; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp
879 ; GFX9-NEXT: s_setpc_b64 s[30:31]
881 ; GFX10PLUS-LABEL: v_uaddsat_i32:
882 ; GFX10PLUS: ; %bb.0:
883 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
884 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp
885 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
886 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
; i32 llvm.uadd.sat in an amdgpu_ps function with both operands marked inreg.
; The GFX6 run is expected to use only scalar instructions (s_not/s_min_u32/
; s_add_i32). The other runs are expected to use a clamped vector add and then
; v_readfirstlane_b32 to return the result in s0.
890 define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
891 ; GFX6-LABEL: s_uaddsat_i32:
893 ; GFX6-NEXT: s_not_b32 s2, s0
894 ; GFX6-NEXT: s_min_u32 s1, s2, s1
895 ; GFX6-NEXT: s_add_i32 s0, s0, s1
896 ; GFX6-NEXT: ; return to shader part epilog
898 ; GFX8-LABEL: s_uaddsat_i32:
900 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
901 ; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp
902 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
903 ; GFX8-NEXT: ; return to shader part epilog
905 ; GFX9-LABEL: s_uaddsat_i32:
907 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
908 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
909 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
910 ; GFX9-NEXT: ; return to shader part epilog
912 ; GFX10PLUS-LABEL: s_uaddsat_i32:
913 ; GFX10PLUS: ; %bb.0:
914 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
915 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
916 ; GFX10PLUS-NEXT: ; return to shader part epilog
917 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
; Mixed-operand i32 llvm.uadd.sat where only the left operand is marked inreg;
; the i32 result is bitcast to float. The checks expect s0 as the first source
; and v0 as the second source of the add, with no readfirstlane afterwards.
921 define amdgpu_ps float @uaddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
922 ; GFX6-LABEL: uaddsat_i32_sv:
924 ; GFX6-NEXT: s_not_b32 s1, s0
925 ; GFX6-NEXT: v_min_u32_e32 v0, s1, v0
926 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
927 ; GFX6-NEXT: ; return to shader part epilog
929 ; GFX8-LABEL: uaddsat_i32_sv:
931 ; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp
932 ; GFX8-NEXT: ; return to shader part epilog
934 ; GFX9-LABEL: uaddsat_i32_sv:
936 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
937 ; GFX9-NEXT: ; return to shader part epilog
939 ; GFX10PLUS-LABEL: uaddsat_i32_sv:
940 ; GFX10PLUS: ; %bb.0:
941 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, v0 clamp
942 ; GFX10PLUS-NEXT: ; return to shader part epilog
943 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
944 %cast = bitcast i32 %result to float
; Mixed-operand i32 llvm.uadd.sat where only the right operand is marked inreg;
; the i32 result is bitcast to float. The checks expect v0 as the first source
; and s0 as the second source of the add, with no readfirstlane afterwards.
948 define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
949 ; GFX6-LABEL: uaddsat_i32_vs:
951 ; GFX6-NEXT: v_not_b32_e32 v1, v0
952 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v1
953 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
954 ; GFX6-NEXT: ; return to shader part epilog
956 ; GFX8-LABEL: uaddsat_i32_vs:
958 ; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, s0 clamp
959 ; GFX8-NEXT: ; return to shader part epilog
961 ; GFX9-LABEL: uaddsat_i32_vs:
963 ; GFX9-NEXT: v_add_u32_e64 v0, v0, s0 clamp
964 ; GFX9-NEXT: ; return to shader part epilog
966 ; GFX10PLUS-LABEL: uaddsat_i32_vs:
967 ; GFX10PLUS: ; %bb.0:
968 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, v0, s0 clamp
969 ; GFX10PLUS-NEXT: ; return to shader part epilog
970 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
971 %cast = bitcast i32 %result to float
; <2 x i32> llvm.uadd.sat with both operands passed as ordinary (non-inreg)
; arguments. The checks expect the operation to be done one element at a time,
; pairing v0 with v2 and v1 with v3 (two not/min/add sequences on the GFX6 run,
; two clamped adds on the other runs).
975 define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
976 ; GFX6-LABEL: v_uaddsat_v2i32:
978 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
979 ; GFX6-NEXT: v_not_b32_e32 v4, v0
980 ; GFX6-NEXT: v_min_u32_e32 v2, v4, v2
981 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
982 ; GFX6-NEXT: v_not_b32_e32 v2, v1
983 ; GFX6-NEXT: v_min_u32_e32 v2, v2, v3
984 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
985 ; GFX6-NEXT: s_setpc_b64 s[30:31]
987 ; GFX8-LABEL: v_uaddsat_v2i32:
989 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
990 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 clamp
991 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v3 clamp
992 ; GFX8-NEXT: s_setpc_b64 s[30:31]
994 ; GFX9-LABEL: v_uaddsat_v2i32:
996 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
997 ; GFX9-NEXT: v_add_u32_e64 v0, v0, v2 clamp
998 ; GFX9-NEXT: v_add_u32_e64 v1, v1, v3 clamp
999 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1001 ; GFX10PLUS-LABEL: v_uaddsat_v2i32:
1002 ; GFX10PLUS: ; %bb.0:
1003 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp
1005 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp
1006 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1007 %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1008 ret <2 x i32> %result
; <2 x i32> llvm.uadd.sat in an amdgpu_ps function with both operands marked
; inreg. The GFX6 run is expected to use a scalar not/min/add sequence per
; element. The other runs are expected to use one clamped vector add per element
; and then v_readfirstlane_b32 to return the results in s0 and s1.
1011 define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
1012 ; GFX6-LABEL: s_uaddsat_v2i32:
1014 ; GFX6-NEXT: s_not_b32 s4, s0
1015 ; GFX6-NEXT: s_min_u32 s2, s4, s2
1016 ; GFX6-NEXT: s_add_i32 s0, s0, s2
1017 ; GFX6-NEXT: s_not_b32 s2, s1
1018 ; GFX6-NEXT: s_min_u32 s2, s2, s3
1019 ; GFX6-NEXT: s_add_i32 s1, s1, s2
1020 ; GFX6-NEXT: ; return to shader part epilog
1022 ; GFX8-LABEL: s_uaddsat_v2i32:
1024 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1025 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1026 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], s0, v0 clamp
1027 ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
1028 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1029 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1030 ; GFX8-NEXT: ; return to shader part epilog
1032 ; GFX9-LABEL: s_uaddsat_v2i32:
1034 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1035 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1036 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
1037 ; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
1038 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1039 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1040 ; GFX9-NEXT: ; return to shader part epilog
1042 ; GFX10PLUS-LABEL: s_uaddsat_v2i32:
1043 ; GFX10PLUS: ; %bb.0:
1044 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s2 clamp
1045 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, s1, s3 clamp
1046 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1047 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1048 ; GFX10PLUS-NEXT: ; return to shader part epilog
1049 %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1050 ret <2 x i32> %result
; <3 x i32> llvm.uadd.sat with both operands passed as ordinary (non-inreg)
; arguments. The checks expect three independent per-element operations pairing
; v0..v2 with v3..v5 (not/min/add on the GFX6 run, a clamped add on the other
; runs).
1053 define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1054 ; GFX6-LABEL: v_uaddsat_v3i32:
1056 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057 ; GFX6-NEXT: v_not_b32_e32 v6, v0
1058 ; GFX6-NEXT: v_min_u32_e32 v3, v6, v3
1059 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
1060 ; GFX6-NEXT: v_not_b32_e32 v3, v1
1061 ; GFX6-NEXT: v_min_u32_e32 v3, v3, v4
1062 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
1063 ; GFX6-NEXT: v_not_b32_e32 v3, v2
1064 ; GFX6-NEXT: v_min_u32_e32 v3, v3, v5
1065 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
1066 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1068 ; GFX8-LABEL: v_uaddsat_v3i32:
1070 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v3 clamp
1072 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 clamp
1073 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v5 clamp
1074 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1076 ; GFX9-LABEL: v_uaddsat_v3i32:
1078 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1079 ; GFX9-NEXT: v_add_u32_e64 v0, v0, v3 clamp
1080 ; GFX9-NEXT: v_add_u32_e64 v1, v1, v4 clamp
1081 ; GFX9-NEXT: v_add_u32_e64 v2, v2, v5 clamp
1082 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1084 ; GFX10PLUS-LABEL: v_uaddsat_v3i32:
1085 ; GFX10PLUS: ; %bb.0:
1086 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1087 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp
1088 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp
1089 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp
1090 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1091 %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1092 ret <3 x i32> %result
; <3 x i32> llvm.uadd.sat in an amdgpu_ps function with both operands marked
; inreg. The GFX6 run is expected to use a scalar not/min/add sequence per
; element. The other runs are expected to use one clamped vector add per element
; and then v_readfirstlane_b32 to return the results in s0..s2.
1095 define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1096 ; GFX6-LABEL: s_uaddsat_v3i32:
1098 ; GFX6-NEXT: s_not_b32 s6, s0
1099 ; GFX6-NEXT: s_min_u32 s3, s6, s3
1100 ; GFX6-NEXT: s_add_i32 s0, s0, s3
1101 ; GFX6-NEXT: s_not_b32 s3, s1
1102 ; GFX6-NEXT: s_min_u32 s3, s3, s4
1103 ; GFX6-NEXT: s_add_i32 s1, s1, s3
1104 ; GFX6-NEXT: s_not_b32 s3, s2
1105 ; GFX6-NEXT: s_min_u32 s3, s3, s5
1106 ; GFX6-NEXT: s_add_i32 s2, s2, s3
1107 ; GFX6-NEXT: ; return to shader part epilog
1109 ; GFX8-LABEL: s_uaddsat_v3i32:
1111 ; GFX8-NEXT: v_mov_b32_e32 v0, s3
1112 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
1113 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
1114 ; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], s0, v0 clamp
1115 ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
1116 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp
1117 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1118 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1119 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1120 ; GFX8-NEXT: ; return to shader part epilog
1122 ; GFX9-LABEL: s_uaddsat_v3i32:
1124 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
1125 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1126 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
1127 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
1128 ; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
1129 ; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
1130 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1131 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1132 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1133 ; GFX9-NEXT: ; return to shader part epilog
1135 ; GFX10PLUS-LABEL: s_uaddsat_v3i32:
1136 ; GFX10PLUS: ; %bb.0:
1137 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s3 clamp
1138 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, s1, s4 clamp
1139 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, s2, s5 clamp
1140 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1141 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1142 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1143 ; GFX10PLUS-NEXT: ; return to shader part epilog
1144 %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1145 ret <3 x i32> %result
; <4 x i32> llvm.uadd.sat with both operands passed as ordinary (non-inreg)
; arguments. The checks expect four independent per-element operations pairing
; v0..v3 with v4..v7 (not/min/add on the GFX6 run, a clamped add on the other
; runs).
1148 define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1149 ; GFX6-LABEL: v_uaddsat_v4i32:
1151 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1152 ; GFX6-NEXT: v_not_b32_e32 v8, v0
1153 ; GFX6-NEXT: v_min_u32_e32 v4, v8, v4
1154 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
1155 ; GFX6-NEXT: v_not_b32_e32 v4, v1
1156 ; GFX6-NEXT: v_min_u32_e32 v4, v4, v5
1157 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
1158 ; GFX6-NEXT: v_not_b32_e32 v4, v2
1159 ; GFX6-NEXT: v_min_u32_e32 v4, v4, v6
1160 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
1161 ; GFX6-NEXT: v_not_b32_e32 v4, v3
1162 ; GFX6-NEXT: v_min_u32_e32 v4, v4, v7
1163 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
1164 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1166 ; GFX8-LABEL: v_uaddsat_v4i32:
1168 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1169 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 clamp
1170 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v5 clamp
1171 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 clamp
1172 ; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v7 clamp
1173 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1175 ; GFX9-LABEL: v_uaddsat_v4i32:
1177 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178 ; GFX9-NEXT: v_add_u32_e64 v0, v0, v4 clamp
1179 ; GFX9-NEXT: v_add_u32_e64 v1, v1, v5 clamp
1180 ; GFX9-NEXT: v_add_u32_e64 v2, v2, v6 clamp
1181 ; GFX9-NEXT: v_add_u32_e64 v3, v3, v7 clamp
1182 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1184 ; GFX10PLUS-LABEL: v_uaddsat_v4i32:
1185 ; GFX10PLUS: ; %bb.0:
1186 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1187 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp
1188 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp
1189 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp
1190 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp
1191 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1192 %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1193 ret <4 x i32> %result
; <4 x i32> llvm.uadd.sat in an amdgpu_ps function with both operands marked
; inreg. The GFX6 run is expected to use a scalar not/min/add sequence per
; element. The other runs are expected to use one clamped vector add per element
; and then v_readfirstlane_b32 to return the results in s0..s3.
1196 define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1197 ; GFX6-LABEL: s_uaddsat_v4i32:
1199 ; GFX6-NEXT: s_not_b32 s8, s0
1200 ; GFX6-NEXT: s_min_u32 s4, s8, s4
1201 ; GFX6-NEXT: s_add_i32 s0, s0, s4
1202 ; GFX6-NEXT: s_not_b32 s4, s1
1203 ; GFX6-NEXT: s_min_u32 s4, s4, s5
1204 ; GFX6-NEXT: s_add_i32 s1, s1, s4
1205 ; GFX6-NEXT: s_not_b32 s4, s2
1206 ; GFX6-NEXT: s_min_u32 s4, s4, s6
1207 ; GFX6-NEXT: s_add_i32 s2, s2, s4
1208 ; GFX6-NEXT: s_not_b32 s4, s3
1209 ; GFX6-NEXT: s_min_u32 s4, s4, s7
1210 ; GFX6-NEXT: s_add_i32 s3, s3, s4
1211 ; GFX6-NEXT: ; return to shader part epilog
1213 ; GFX8-LABEL: s_uaddsat_v4i32:
1215 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1216 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1217 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
1218 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
1219 ; GFX8-NEXT: v_add_u32_e64 v0, s[8:9], s0, v0 clamp
1220 ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
1221 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp
1222 ; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp
1223 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1224 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1225 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1226 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1227 ; GFX8-NEXT: ; return to shader part epilog
1229 ; GFX9-LABEL: s_uaddsat_v4i32:
1231 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1232 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1233 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1234 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1235 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
1236 ; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
1237 ; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
1238 ; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp
1239 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1240 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1241 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1242 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1243 ; GFX9-NEXT: ; return to shader part epilog
1245 ; GFX10PLUS-LABEL: s_uaddsat_v4i32:
1246 ; GFX10PLUS: ; %bb.0:
1247 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s4 clamp
1248 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, s1, s5 clamp
1249 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, s2, s6 clamp
1250 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v3, s3, s7 clamp
1251 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1252 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1253 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1254 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1255 ; GFX10PLUS-NEXT: ; return to shader part epilog
1256 %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1257 ret <4 x i32> %result
; <5 x i32> llvm.uadd.sat with both operands passed as ordinary (non-inreg)
; arguments. The checks expect five independent per-element operations pairing
; v0..v4 with v5..v9 (not/min/add on the GFX6 run, a clamped add on the other
; runs).
1260 define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1261 ; GFX6-LABEL: v_uaddsat_v5i32:
1263 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1264 ; GFX6-NEXT: v_not_b32_e32 v10, v0
1265 ; GFX6-NEXT: v_min_u32_e32 v5, v10, v5
1266 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5
1267 ; GFX6-NEXT: v_not_b32_e32 v5, v1
1268 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v6
1269 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
1270 ; GFX6-NEXT: v_not_b32_e32 v5, v2
1271 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v7
1272 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
1273 ; GFX6-NEXT: v_not_b32_e32 v5, v3
1274 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v8
1275 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
1276 ; GFX6-NEXT: v_not_b32_e32 v5, v4
1277 ; GFX6-NEXT: v_min_u32_e32 v5, v5, v9
1278 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
1279 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1281 ; GFX8-LABEL: v_uaddsat_v5i32:
1283 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1284 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v5 clamp
1285 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v6 clamp
1286 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v7 clamp
1287 ; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v8 clamp
1288 ; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v9 clamp
1289 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1291 ; GFX9-LABEL: v_uaddsat_v5i32:
1293 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1294 ; GFX9-NEXT: v_add_u32_e64 v0, v0, v5 clamp
1295 ; GFX9-NEXT: v_add_u32_e64 v1, v1, v6 clamp
1296 ; GFX9-NEXT: v_add_u32_e64 v2, v2, v7 clamp
1297 ; GFX9-NEXT: v_add_u32_e64 v3, v3, v8 clamp
1298 ; GFX9-NEXT: v_add_u32_e64 v4, v4, v9 clamp
1299 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1301 ; GFX10PLUS-LABEL: v_uaddsat_v5i32:
1302 ; GFX10PLUS: ; %bb.0:
1303 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1304 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, v0, v5 clamp
1305 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, v1, v6 clamp
1306 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, v2, v7 clamp
1307 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v3, v3, v8 clamp
1308 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v4, v4, v9 clamp
1309 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1310 %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1311 ret <5 x i32> %result
; <5 x i32> llvm.uadd.sat in an amdgpu_ps function with both operands marked
; inreg. The GFX6 run is expected to use a scalar not/min/add sequence per
; element. The other runs are expected to use one clamped vector add per element
; and then v_readfirstlane_b32 to return the results in s0..s4.
1314 define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1315 ; GFX6-LABEL: s_uaddsat_v5i32:
1317 ; GFX6-NEXT: s_not_b32 s10, s0
1318 ; GFX6-NEXT: s_min_u32 s5, s10, s5
1319 ; GFX6-NEXT: s_add_i32 s0, s0, s5
1320 ; GFX6-NEXT: s_not_b32 s5, s1
1321 ; GFX6-NEXT: s_min_u32 s5, s5, s6
1322 ; GFX6-NEXT: s_add_i32 s1, s1, s5
1323 ; GFX6-NEXT: s_not_b32 s5, s2
1324 ; GFX6-NEXT: s_min_u32 s5, s5, s7
1325 ; GFX6-NEXT: s_add_i32 s2, s2, s5
1326 ; GFX6-NEXT: s_not_b32 s5, s3
1327 ; GFX6-NEXT: s_min_u32 s5, s5, s8
1328 ; GFX6-NEXT: s_add_i32 s3, s3, s5
1329 ; GFX6-NEXT: s_not_b32 s5, s4
1330 ; GFX6-NEXT: s_min_u32 s5, s5, s9
1331 ; GFX6-NEXT: s_add_i32 s4, s4, s5
1332 ; GFX6-NEXT: ; return to shader part epilog
1334 ; GFX8-LABEL: s_uaddsat_v5i32:
1336 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1337 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
1338 ; GFX8-NEXT: v_mov_b32_e32 v2, s7
1339 ; GFX8-NEXT: v_mov_b32_e32 v3, s8
1340 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
1341 ; GFX8-NEXT: v_add_u32_e64 v0, s[10:11], s0, v0 clamp
1342 ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
1343 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp
1344 ; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp
1345 ; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], s4, v4 clamp
1346 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1347 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1348 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1349 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1350 ; GFX8-NEXT: v_readfirstlane_b32 s4, v4
1351 ; GFX8-NEXT: ; return to shader part epilog
1353 ; GFX9-LABEL: s_uaddsat_v5i32:
1355 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
1356 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1357 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
1358 ; GFX9-NEXT: v_mov_b32_e32 v3, s8
1359 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
1360 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
1361 ; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
1362 ; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
1363 ; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp
1364 ; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp
1365 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1366 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1367 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1368 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1369 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1370 ; GFX9-NEXT: ; return to shader part epilog
1372 ; GFX10PLUS-LABEL: s_uaddsat_v5i32:
1373 ; GFX10PLUS: ; %bb.0:
1374 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s5 clamp
1375 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, s1, s6 clamp
1376 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, s2, s7 clamp
1377 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v3, s3, s8 clamp
1378 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v4, s4, s9 clamp
1379 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1380 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1381 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1382 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1383 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
1384 ; GFX10PLUS-NEXT: ; return to shader part epilog
1385 %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1386 ret <5 x i32> %result
; <16 x i32> llvm.uadd.sat with both operands passed as ordinary (non-inreg)
; arguments. The checks expect sixteen per-element operations (not/min/add on
; the GFX6 run, a clamped add on the other runs). In every run the value that is
; combined with v15 is loaded from memory addressed by s32 (buffer_load_dword,
; or scratch_load_b32 on the GFX11 run), and an s_waitcnt vmcnt(0) is expected
; before the instruction that consumes it.
; NOTE(review) presumably that load fetches the last rhs element, which no
; longer fits in argument registers - confirm against the calling convention.
1389 define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1390 ; GFX6-LABEL: v_uaddsat_v16i32:
1392 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1393 ; GFX6-NEXT: v_not_b32_e32 v31, v0
1394 ; GFX6-NEXT: v_min_u32_e32 v16, v31, v16
1395 ; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
1396 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16
1397 ; GFX6-NEXT: v_not_b32_e32 v16, v1
1398 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v17
1399 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16
1400 ; GFX6-NEXT: v_not_b32_e32 v16, v2
1401 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v18
1402 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16
1403 ; GFX6-NEXT: v_not_b32_e32 v16, v3
1404 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v19
1405 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v16
1406 ; GFX6-NEXT: v_not_b32_e32 v16, v4
1407 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v20
1408 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v16
1409 ; GFX6-NEXT: v_not_b32_e32 v16, v5
1410 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v21
1411 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v16
1412 ; GFX6-NEXT: v_not_b32_e32 v16, v6
1413 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v22
1414 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v16
1415 ; GFX6-NEXT: v_not_b32_e32 v16, v7
1416 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v23
1417 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v16
1418 ; GFX6-NEXT: v_not_b32_e32 v16, v8
1419 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v24
1420 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v16
1421 ; GFX6-NEXT: v_not_b32_e32 v16, v9
1422 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v25
1423 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v16
1424 ; GFX6-NEXT: v_not_b32_e32 v16, v10
1425 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v26
1426 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v16
1427 ; GFX6-NEXT: v_not_b32_e32 v16, v11
1428 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v27
1429 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v16
1430 ; GFX6-NEXT: v_not_b32_e32 v16, v12
1431 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v28
1432 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v16
1433 ; GFX6-NEXT: v_not_b32_e32 v16, v13
1434 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v29
1435 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v16
1436 ; GFX6-NEXT: v_not_b32_e32 v16, v14
1437 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v30
1438 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16
1439 ; GFX6-NEXT: v_not_b32_e32 v16, v15
1440 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1441 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v31
1442 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16
1443 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1445 ; GFX8-LABEL: v_uaddsat_v16i32:
1447 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1448 ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 clamp
1449 ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
1450 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp
1451 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 clamp
1452 ; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 clamp
1453 ; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v20 clamp
1454 ; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v21 clamp
1455 ; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v6, v22 clamp
1456 ; GFX8-NEXT: v_add_u32_e64 v7, s[4:5], v7, v23 clamp
1457 ; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v8, v24 clamp
1458 ; GFX8-NEXT: v_add_u32_e64 v9, s[4:5], v9, v25 clamp
1459 ; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], v10, v26 clamp
1460 ; GFX8-NEXT: v_add_u32_e64 v11, s[4:5], v11, v27 clamp
1461 ; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp
1462 ; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp
1463 ; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp
1464 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1465 ; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v16 clamp
1466 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1468 ; GFX9-LABEL: v_uaddsat_v16i32:
1470 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1471 ; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp
1472 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
1473 ; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp
1474 ; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp
1475 ; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 clamp
1476 ; GFX9-NEXT: v_add_u32_e64 v4, v4, v20 clamp
1477 ; GFX9-NEXT: v_add_u32_e64 v5, v5, v21 clamp
1478 ; GFX9-NEXT: v_add_u32_e64 v6, v6, v22 clamp
1479 ; GFX9-NEXT: v_add_u32_e64 v7, v7, v23 clamp
1480 ; GFX9-NEXT: v_add_u32_e64 v8, v8, v24 clamp
1481 ; GFX9-NEXT: v_add_u32_e64 v9, v9, v25 clamp
1482 ; GFX9-NEXT: v_add_u32_e64 v10, v10, v26 clamp
1483 ; GFX9-NEXT: v_add_u32_e64 v11, v11, v27 clamp
1484 ; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp
1485 ; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp
1486 ; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp
1487 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1488 ; GFX9-NEXT: v_add_u32_e64 v15, v15, v16 clamp
1489 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1491 ; GFX10-LABEL: v_uaddsat_v16i32:
1493 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1494 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
1495 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp
1496 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp
1497 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp
1498 ; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp
1499 ; GFX10-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp
1500 ; GFX10-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp
1501 ; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp
1502 ; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp
1503 ; GFX10-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp
1504 ; GFX10-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp
1505 ; GFX10-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp
1506 ; GFX10-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp
1507 ; GFX10-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp
1508 ; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp
1509 ; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp
1510 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1511 ; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp
1512 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1514 ; GFX11-LABEL: v_uaddsat_v16i32:
1516 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1517 ; GFX11-NEXT: scratch_load_b32 v31, off, s32
1518 ; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp
1519 ; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp
1520 ; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp
1521 ; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp
1522 ; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp
1523 ; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp
1524 ; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp
1525 ; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp
1526 ; GFX11-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp
1527 ; GFX11-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp
1528 ; GFX11-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp
1529 ; GFX11-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp
1530 ; GFX11-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp
1531 ; GFX11-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp
1532 ; GFX11-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp
1533 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1534 ; GFX11-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp
1535 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1536 %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1537 ret <16 x i32> %result
; Unsigned saturating add (llvm.uadd.sat.v16i32) of two <16 x i32> vectors in
; an amdgpu_ps function where both operands are marked inreg.
; What the autogenerated checks below expect, per check prefix:
;   * GFX6 checks: a purely scalar expansion per element,
;     lhs + umin(~lhs, rhs), via s_not_b32 / s_min_u32 / s_add_i32.
;   * GFX8 and GFX9 checks: rhs (s16..s31) is first copied into v0..v15, then a
;     clamped VALU add is issued per element and each result is read back into
;     s0..s15 with v_readfirstlane_b32. The GFX8 add also names an SGPR pair
;     operand (presumably the carry-out destination -- confirm against the ISA).
;   * GFX10PLUS checks: the clamped v_add_nc_u32 takes both SGPR operands
;     directly, followed by the same v_readfirstlane_b32 copies.
; The check lines are generated output; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1540 define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
1541 ; GFX6-LABEL: s_uaddsat_v16i32:
1543 ; GFX6-NEXT: s_not_b32 s32, s0
1544 ; GFX6-NEXT: s_min_u32 s16, s32, s16
1545 ; GFX6-NEXT: s_add_i32 s0, s0, s16
1546 ; GFX6-NEXT: s_not_b32 s16, s1
1547 ; GFX6-NEXT: s_min_u32 s16, s16, s17
1548 ; GFX6-NEXT: s_add_i32 s1, s1, s16
1549 ; GFX6-NEXT: s_not_b32 s16, s2
1550 ; GFX6-NEXT: s_min_u32 s16, s16, s18
1551 ; GFX6-NEXT: s_add_i32 s2, s2, s16
1552 ; GFX6-NEXT: s_not_b32 s16, s3
1553 ; GFX6-NEXT: s_min_u32 s16, s16, s19
1554 ; GFX6-NEXT: s_add_i32 s3, s3, s16
1555 ; GFX6-NEXT: s_not_b32 s16, s4
1556 ; GFX6-NEXT: s_min_u32 s16, s16, s20
1557 ; GFX6-NEXT: s_add_i32 s4, s4, s16
1558 ; GFX6-NEXT: s_not_b32 s16, s5
1559 ; GFX6-NEXT: s_min_u32 s16, s16, s21
1560 ; GFX6-NEXT: s_add_i32 s5, s5, s16
1561 ; GFX6-NEXT: s_not_b32 s16, s6
1562 ; GFX6-NEXT: s_min_u32 s16, s16, s22
1563 ; GFX6-NEXT: s_add_i32 s6, s6, s16
1564 ; GFX6-NEXT: s_not_b32 s16, s7
1565 ; GFX6-NEXT: s_min_u32 s16, s16, s23
1566 ; GFX6-NEXT: s_add_i32 s7, s7, s16
1567 ; GFX6-NEXT: s_not_b32 s16, s8
1568 ; GFX6-NEXT: s_min_u32 s16, s16, s24
1569 ; GFX6-NEXT: s_add_i32 s8, s8, s16
1570 ; GFX6-NEXT: s_not_b32 s16, s9
1571 ; GFX6-NEXT: s_min_u32 s16, s16, s25
1572 ; GFX6-NEXT: s_add_i32 s9, s9, s16
1573 ; GFX6-NEXT: s_not_b32 s16, s10
1574 ; GFX6-NEXT: s_min_u32 s16, s16, s26
1575 ; GFX6-NEXT: s_add_i32 s10, s10, s16
1576 ; GFX6-NEXT: s_not_b32 s16, s11
1577 ; GFX6-NEXT: s_min_u32 s16, s16, s27
1578 ; GFX6-NEXT: s_add_i32 s11, s11, s16
1579 ; GFX6-NEXT: s_not_b32 s16, s12
1580 ; GFX6-NEXT: s_min_u32 s16, s16, s28
1581 ; GFX6-NEXT: s_add_i32 s12, s12, s16
1582 ; GFX6-NEXT: s_not_b32 s16, s13
1583 ; GFX6-NEXT: s_min_u32 s16, s16, s29
1584 ; GFX6-NEXT: s_add_i32 s13, s13, s16
1585 ; GFX6-NEXT: s_not_b32 s16, s14
1586 ; GFX6-NEXT: s_min_u32 s16, s16, s30
1587 ; GFX6-NEXT: s_add_i32 s14, s14, s16
1588 ; GFX6-NEXT: s_not_b32 s16, s15
1589 ; GFX6-NEXT: s_min_u32 s16, s16, s31
1590 ; GFX6-NEXT: s_add_i32 s15, s15, s16
1591 ; GFX6-NEXT: ; return to shader part epilog
1593 ; GFX8-LABEL: s_uaddsat_v16i32:
1595 ; GFX8-NEXT: v_mov_b32_e32 v0, s16
1596 ; GFX8-NEXT: v_mov_b32_e32 v1, s17
1597 ; GFX8-NEXT: v_mov_b32_e32 v2, s18
1598 ; GFX8-NEXT: v_mov_b32_e32 v3, s19
1599 ; GFX8-NEXT: v_mov_b32_e32 v4, s20
1600 ; GFX8-NEXT: v_mov_b32_e32 v5, s21
1601 ; GFX8-NEXT: v_mov_b32_e32 v6, s22
1602 ; GFX8-NEXT: v_mov_b32_e32 v7, s23
1603 ; GFX8-NEXT: v_mov_b32_e32 v8, s24
1604 ; GFX8-NEXT: v_mov_b32_e32 v9, s25
1605 ; GFX8-NEXT: v_mov_b32_e32 v10, s26
1606 ; GFX8-NEXT: v_mov_b32_e32 v11, s27
1607 ; GFX8-NEXT: v_mov_b32_e32 v12, s28
1608 ; GFX8-NEXT: v_mov_b32_e32 v13, s29
1609 ; GFX8-NEXT: v_mov_b32_e32 v14, s30
1610 ; GFX8-NEXT: v_mov_b32_e32 v15, s31
1611 ; GFX8-NEXT: v_add_u32_e64 v0, s[32:33], s0, v0 clamp
1612 ; GFX8-NEXT: v_add_u32_e64 v1, s[16:17], s1, v1 clamp
1613 ; GFX8-NEXT: v_add_u32_e64 v2, s[16:17], s2, v2 clamp
1614 ; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], s3, v3 clamp
1615 ; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], s4, v4 clamp
1616 ; GFX8-NEXT: v_add_u32_e64 v5, s[2:3], s5, v5 clamp
1617 ; GFX8-NEXT: v_add_u32_e64 v6, s[2:3], s6, v6 clamp
1618 ; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], s7, v7 clamp
1619 ; GFX8-NEXT: v_add_u32_e64 v8, s[2:3], s8, v8 clamp
1620 ; GFX8-NEXT: v_add_u32_e64 v9, s[2:3], s9, v9 clamp
1621 ; GFX8-NEXT: v_add_u32_e64 v10, s[2:3], s10, v10 clamp
1622 ; GFX8-NEXT: v_add_u32_e64 v11, s[2:3], s11, v11 clamp
1623 ; GFX8-NEXT: v_add_u32_e64 v12, s[2:3], s12, v12 clamp
1624 ; GFX8-NEXT: v_add_u32_e64 v13, s[2:3], s13, v13 clamp
1625 ; GFX8-NEXT: v_add_u32_e64 v14, s[2:3], s14, v14 clamp
1626 ; GFX8-NEXT: v_add_u32_e64 v15, s[2:3], s15, v15 clamp
1627 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1628 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1629 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
1630 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
1631 ; GFX8-NEXT: v_readfirstlane_b32 s4, v4
1632 ; GFX8-NEXT: v_readfirstlane_b32 s5, v5
1633 ; GFX8-NEXT: v_readfirstlane_b32 s6, v6
1634 ; GFX8-NEXT: v_readfirstlane_b32 s7, v7
1635 ; GFX8-NEXT: v_readfirstlane_b32 s8, v8
1636 ; GFX8-NEXT: v_readfirstlane_b32 s9, v9
1637 ; GFX8-NEXT: v_readfirstlane_b32 s10, v10
1638 ; GFX8-NEXT: v_readfirstlane_b32 s11, v11
1639 ; GFX8-NEXT: v_readfirstlane_b32 s12, v12
1640 ; GFX8-NEXT: v_readfirstlane_b32 s13, v13
1641 ; GFX8-NEXT: v_readfirstlane_b32 s14, v14
1642 ; GFX8-NEXT: v_readfirstlane_b32 s15, v15
1643 ; GFX8-NEXT: ; return to shader part epilog
1645 ; GFX9-LABEL: s_uaddsat_v16i32:
1647 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
1648 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
1649 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
1650 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
1651 ; GFX9-NEXT: v_mov_b32_e32 v4, s20
1652 ; GFX9-NEXT: v_mov_b32_e32 v5, s21
1653 ; GFX9-NEXT: v_mov_b32_e32 v6, s22
1654 ; GFX9-NEXT: v_mov_b32_e32 v7, s23
1655 ; GFX9-NEXT: v_mov_b32_e32 v8, s24
1656 ; GFX9-NEXT: v_mov_b32_e32 v9, s25
1657 ; GFX9-NEXT: v_mov_b32_e32 v10, s26
1658 ; GFX9-NEXT: v_mov_b32_e32 v11, s27
1659 ; GFX9-NEXT: v_mov_b32_e32 v12, s28
1660 ; GFX9-NEXT: v_mov_b32_e32 v13, s29
1661 ; GFX9-NEXT: v_mov_b32_e32 v14, s30
1662 ; GFX9-NEXT: v_mov_b32_e32 v15, s31
1663 ; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
1664 ; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
1665 ; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
1666 ; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp
1667 ; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp
1668 ; GFX9-NEXT: v_add_u32_e64 v5, s5, v5 clamp
1669 ; GFX9-NEXT: v_add_u32_e64 v6, s6, v6 clamp
1670 ; GFX9-NEXT: v_add_u32_e64 v7, s7, v7 clamp
1671 ; GFX9-NEXT: v_add_u32_e64 v8, s8, v8 clamp
1672 ; GFX9-NEXT: v_add_u32_e64 v9, s9, v9 clamp
1673 ; GFX9-NEXT: v_add_u32_e64 v10, s10, v10 clamp
1674 ; GFX9-NEXT: v_add_u32_e64 v11, s11, v11 clamp
1675 ; GFX9-NEXT: v_add_u32_e64 v12, s12, v12 clamp
1676 ; GFX9-NEXT: v_add_u32_e64 v13, s13, v13 clamp
1677 ; GFX9-NEXT: v_add_u32_e64 v14, s14, v14 clamp
1678 ; GFX9-NEXT: v_add_u32_e64 v15, s15, v15 clamp
1679 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1680 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
1681 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
1682 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
1683 ; GFX9-NEXT: v_readfirstlane_b32 s4, v4
1684 ; GFX9-NEXT: v_readfirstlane_b32 s5, v5
1685 ; GFX9-NEXT: v_readfirstlane_b32 s6, v6
1686 ; GFX9-NEXT: v_readfirstlane_b32 s7, v7
1687 ; GFX9-NEXT: v_readfirstlane_b32 s8, v8
1688 ; GFX9-NEXT: v_readfirstlane_b32 s9, v9
1689 ; GFX9-NEXT: v_readfirstlane_b32 s10, v10
1690 ; GFX9-NEXT: v_readfirstlane_b32 s11, v11
1691 ; GFX9-NEXT: v_readfirstlane_b32 s12, v12
1692 ; GFX9-NEXT: v_readfirstlane_b32 s13, v13
1693 ; GFX9-NEXT: v_readfirstlane_b32 s14, v14
1694 ; GFX9-NEXT: v_readfirstlane_b32 s15, v15
1695 ; GFX9-NEXT: ; return to shader part epilog
1697 ; GFX10PLUS-LABEL: s_uaddsat_v16i32:
1698 ; GFX10PLUS: ; %bb.0:
1699 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s16 clamp
1700 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v1, s1, s17 clamp
1701 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v2, s2, s18 clamp
1702 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v3, s3, s19 clamp
1703 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v4, s4, s20 clamp
1704 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v5, s5, s21 clamp
1705 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v6, s6, s22 clamp
1706 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v7, s7, s23 clamp
1707 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v8, s8, s24 clamp
1708 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v9, s9, s25 clamp
1709 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v10, s10, s26 clamp
1710 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v11, s11, s27 clamp
1711 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v12, s12, s28 clamp
1712 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v13, s13, s29 clamp
1713 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v14, s14, s30 clamp
1714 ; GFX10PLUS-NEXT: v_add_nc_u32_e64 v15, s15, s31 clamp
1715 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1716 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
1717 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
1718 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
1719 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4
1720 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5
1721 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6
1722 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7
1723 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8
1724 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9
1725 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10
1726 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11
1727 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12
1728 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13
1729 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14
1730 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15
1731 ; GFX10PLUS-NEXT: ; return to shader part epilog
1732 %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1733 ret <16 x i32> %result
; Unsigned saturating add (llvm.uadd.sat.i16) of two i16 values; neither
; operand is marked inreg, and the checks use v0/v1 for them.
; Expected code per check prefix:
;   * GFX6 checks: both operands are shifted left by 16, the sum is formed as
;     lhs + umin(~lhs, rhs) with 32-bit instructions, and the result is
;     shifted right by 16 again.
;   * GFX8 and GFX9 checks: a single v_add_u16 with the clamp modifier.
;   * GFX10PLUS checks: a single v_add_nc_u16 with the clamp modifier.
1736 define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
1737 ; GFX6-LABEL: v_uaddsat_i16:
1739 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1740 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1741 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1742 ; GFX6-NEXT: v_not_b32_e32 v2, v0
1743 ; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
1744 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1745 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1746 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1748 ; GFX8-LABEL: v_uaddsat_i16:
1750 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
1752 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1754 ; GFX9-LABEL: v_uaddsat_i16:
1756 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1757 ; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
1758 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1760 ; GFX10PLUS-LABEL: v_uaddsat_i16:
1761 ; GFX10PLUS: ; %bb.0:
1762 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1763 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
1764 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1765 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
; Unsigned saturating add (llvm.uadd.sat.i16) in an amdgpu_ps function with
; both i16 operands marked inreg; the checks use s0/s1 for them.
; Expected code per check prefix:
;   * GFX6 checks: all-scalar sequence -- shift both operands left by 16,
;     compute lhs + umin(~lhs, rhs), shift the sum right by 16.
;   * GFX8 and GFX9 checks: rhs is copied into v0, a clamped v_add_u16 adds
;     s0 to it, and v_readfirstlane_b32 moves the result back to s0.
;   * GFX10PLUS checks: v_add_nc_u16 with clamp takes s0 and s1 directly,
;     followed by v_readfirstlane_b32.
1769 define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
1770 ; GFX6-LABEL: s_uaddsat_i16:
1772 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1773 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
1774 ; GFX6-NEXT: s_not_b32 s2, s0
1775 ; GFX6-NEXT: s_min_u32 s1, s2, s1
1776 ; GFX6-NEXT: s_add_i32 s0, s0, s1
1777 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16
1778 ; GFX6-NEXT: ; return to shader part epilog
1780 ; GFX8-LABEL: s_uaddsat_i16:
1782 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
1783 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
1784 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1785 ; GFX8-NEXT: ; return to shader part epilog
1787 ; GFX9-LABEL: s_uaddsat_i16:
1789 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1790 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
1791 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1792 ; GFX9-NEXT: ; return to shader part epilog
1794 ; GFX10PLUS-LABEL: s_uaddsat_i16:
1795 ; GFX10PLUS: ; %bb.0:
1796 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
1797 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1798 ; GFX10PLUS-NEXT: ; return to shader part epilog
1799 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
; Mixed-operand variant of the i16 unsigned saturating add: lhs is marked
; inreg (s0 in the checks) and rhs is not (v0 in the checks). The i16 result
; is bitcast to half, matching the function's declared half return type.
; Expected code per check prefix:
;   * GFX6 checks: lhs is shifted with s_lshl_b32 and rhs with
;     v_lshlrev_b32, then lhs + umin(~lhs, rhs) is formed with the scalar
;     value used as a VALU operand, and the sum is shifted right by 16.
;   * GFX8 and GFX9 checks: one clamped v_add_u16 taking s0 and v0.
;   * GFX10PLUS checks: one clamped v_add_nc_u16 taking s0 and v0.
1803 define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
1804 ; GFX6-LABEL: uaddsat_i16_sv:
1806 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1807 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1808 ; GFX6-NEXT: s_not_b32 s1, s0
1809 ; GFX6-NEXT: v_min_u32_e32 v0, s1, v0
1810 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
1811 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1812 ; GFX6-NEXT: ; return to shader part epilog
1814 ; GFX8-LABEL: uaddsat_i16_sv:
1816 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
1817 ; GFX8-NEXT: ; return to shader part epilog
1819 ; GFX9-LABEL: uaddsat_i16_sv:
1821 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
1822 ; GFX9-NEXT: ; return to shader part epilog
1824 ; GFX10PLUS-LABEL: uaddsat_i16_sv:
1825 ; GFX10PLUS: ; %bb.0:
1826 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, v0 clamp
1827 ; GFX10PLUS-NEXT: ; return to shader part epilog
1828 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
1829 %cast = bitcast i16 %result to half
; Mixed-operand variant of the i16 unsigned saturating add with the operand
; kinds swapped relative to uaddsat_i16_sv: lhs is not inreg (v0 in the
; checks) and rhs is inreg (s0 in the checks). The i16 result is bitcast to
; half, matching the function's declared half return type.
; Expected code per check prefix:
;   * GFX6 checks: lhs is shifted with v_lshlrev_b32 and rhs with
;     s_lshl_b32, then lhs + umin(rhs, ~lhs) is formed on the VALU and the
;     sum is shifted right by 16.
;   * GFX8 and GFX9 checks: one clamped v_add_u16 taking v0 and s0.
;   * GFX10PLUS checks: one clamped v_add_nc_u16 taking v0 and s0.
1833 define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
1834 ; GFX6-LABEL: uaddsat_i16_vs:
1836 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1837 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1838 ; GFX6-NEXT: v_not_b32_e32 v1, v0
1839 ; GFX6-NEXT: v_min_u32_e32 v1, s0, v1
1840 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1841 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1842 ; GFX6-NEXT: ; return to shader part epilog
1844 ; GFX8-LABEL: uaddsat_i16_vs:
1846 ; GFX8-NEXT: v_add_u16_e64 v0, v0, s0 clamp
1847 ; GFX8-NEXT: ; return to shader part epilog
1849 ; GFX9-LABEL: uaddsat_i16_vs:
1851 ; GFX9-NEXT: v_add_u16_e64 v0, v0, s0 clamp
1852 ; GFX9-NEXT: ; return to shader part epilog
1854 ; GFX10PLUS-LABEL: uaddsat_i16_vs:
1855 ; GFX10PLUS: ; %bb.0:
1856 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, s0 clamp
1857 ; GFX10PLUS-NEXT: ; return to shader part epilog
1858 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
1859 %cast = bitcast i16 %result to half
; Unsigned saturating add (llvm.uadd.sat.v2i16) of two <2 x i16> vectors;
; neither operand is marked inreg.
; Expected code per check prefix:
;   * GFX6 checks: the two elements are handled separately (lhs in v0/v1,
;     rhs in v2/v3); each is shifted left by 16, summed as
;     lhs + umin(~lhs, rhs), and shifted right by 16.
;   * GFX8 checks: the low half uses a clamped v_add_u16, the high half a
;     clamped SDWA add selecting WORD_1 for both sources and the destination,
;     and the two halves are merged with v_or_b32.
;   * GFX9 and GFX10PLUS checks: a single clamped v_pk_add_u16.
1863 define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
1864 ; GFX6-LABEL: v_uaddsat_v2i16:
1866 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1867 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1868 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1869 ; GFX6-NEXT: v_not_b32_e32 v4, v0
1870 ; GFX6-NEXT: v_min_u32_e32 v2, v4, v2
1871 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1872 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1873 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1874 ; GFX6-NEXT: v_not_b32_e32 v3, v1
1875 ; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
1876 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
1877 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1878 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1879 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1881 ; GFX8-LABEL: v_uaddsat_v2i16:
1883 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1884 ; GFX8-NEXT: v_add_u16_e64 v2, v0, v1 clamp
1885 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1886 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
1887 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1889 ; GFX9-LABEL: v_uaddsat_v2i16:
1891 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1892 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
1893 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1895 ; GFX10PLUS-LABEL: v_uaddsat_v2i16:
1896 ; GFX10PLUS: ; %bb.0:
1897 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1898 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, v0, v1 clamp
1899 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
1900 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1901 ret <2 x i16> %result
; Unsigned saturating add (llvm.uadd.sat.v2i16) in an amdgpu_ps function with
; both <2 x i16> operands marked inreg. The vector result is bitcast to i32,
; matching the function's declared i32 return type.
; Expected code per check prefix:
;   * GFX6 checks: each element is computed with scalar instructions
;     (shift left 16, lhs + umin(~lhs, rhs)); the two results are then packed
;     with v_alignbit_b32 and returned in s0 through v_readfirstlane_b32.
;   * GFX8 checks: the high halves are extracted with s_lshr_b32, the low
;     half uses a clamped v_add_u16 and the high half a clamped SDWA add that
;     writes WORD_1; the halves are merged with v_or_b32 and read back to s0.
;   * GFX9 checks: rhs is copied into v0, then one clamped v_pk_add_u16 and
;     a v_readfirstlane_b32.
;   * GFX10PLUS checks: one clamped v_pk_add_u16 on s0 and s1 directly, then
;     a v_readfirstlane_b32.
1904 define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
1905 ; GFX6-LABEL: s_uaddsat_v2i16:
1907 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1908 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
1909 ; GFX6-NEXT: s_not_b32 s4, s0
1910 ; GFX6-NEXT: s_min_u32 s2, s4, s2
1911 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
1912 ; GFX6-NEXT: s_add_i32 s0, s0, s2
1913 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16
1914 ; GFX6-NEXT: s_not_b32 s3, s1
1915 ; GFX6-NEXT: s_min_u32 s2, s3, s2
1916 ; GFX6-NEXT: s_add_i32 s1, s1, s2
1917 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
1918 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
1919 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
1920 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1921 ; GFX6-NEXT: ; return to shader part epilog
1923 ; GFX8-LABEL: s_uaddsat_v2i16:
1925 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
1926 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
1927 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
1928 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1929 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
1930 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
1931 ; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1932 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
1933 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1934 ; GFX8-NEXT: ; return to shader part epilog
1936 ; GFX9-LABEL: s_uaddsat_v2i16:
1938 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
1939 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
1940 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
1941 ; GFX9-NEXT: ; return to shader part epilog
1943 ; GFX10PLUS-LABEL: s_uaddsat_v2i16:
1944 ; GFX10PLUS: ; %bb.0:
1945 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, s0, s1 clamp
1946 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
1947 ; GFX10PLUS-NEXT: ; return to shader part epilog
1948 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1949 %cast = bitcast <2 x i16> %result to i32
; Mixed-operand variant of the <2 x i16> unsigned saturating add: lhs is
; marked inreg and rhs is not. The vector result is bitcast to float,
; matching the function's declared float return type.
; Expected code per check prefix:
;   * GFX6 checks: each element is computed separately with the scalar lhs
;     (s0/s1) used as an operand of the VALU min/add on rhs (v0/v1); the two
;     results are packed with v_alignbit_b32.
;   * GFX8 checks: the high half of lhs is extracted with s_lshr_b32 and
;     copied to a VGPR; a clamped v_add_u16 handles the low half and a clamped
;     SDWA add (src1 selecting WORD_1, destination WORD_1) handles the high
;     half before v_or_b32 merges them.
;   * GFX9 and GFX10PLUS checks: one clamped v_pk_add_u16 taking s0 and v0.
1953 define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
1954 ; GFX6-LABEL: uaddsat_v2i16_sv:
1956 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1957 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1958 ; GFX6-NEXT: s_not_b32 s2, s0
1959 ; GFX6-NEXT: v_min_u32_e32 v0, s2, v0
1960 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
1961 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
1962 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1963 ; GFX6-NEXT: s_not_b32 s1, s0
1964 ; GFX6-NEXT: v_min_u32_e32 v1, s1, v1
1965 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1
1966 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1967 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
1968 ; GFX6-NEXT: ; return to shader part epilog
1970 ; GFX8-LABEL: uaddsat_v2i16_sv:
1972 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
1973 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1974 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v0 clamp
1975 ; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1976 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
1977 ; GFX8-NEXT: ; return to shader part epilog
1979 ; GFX9-LABEL: uaddsat_v2i16_sv:
1981 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
1982 ; GFX9-NEXT: ; return to shader part epilog
1984 ; GFX10PLUS-LABEL: uaddsat_v2i16_sv:
1985 ; GFX10PLUS: ; %bb.0:
1986 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, s0, v0 clamp
1987 ; GFX10PLUS-NEXT: ; return to shader part epilog
1988 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1989 %cast = bitcast <2 x i16> %result to float
; Mixed-operand variant of the <2 x i16> unsigned saturating add with the
; operand kinds swapped relative to uaddsat_v2i16_sv: lhs is not inreg and
; rhs is inreg. The vector result is bitcast to float, matching the
; function's declared float return type.
; Expected code per check prefix:
;   * GFX6 checks: each element is computed separately on the VALU with the
;     shifted scalar rhs used as the min operand; the two results are packed
;     with v_alignbit_b32.
;   * GFX8 checks: the high half of rhs is extracted with s_lshr_b32 and
;     copied to a VGPR; a clamped v_add_u16 handles the low half and a clamped
;     SDWA add (src0 selecting WORD_1, destination WORD_1) handles the high
;     half before v_or_b32 merges them.
;   * GFX9 and GFX10PLUS checks: one clamped v_pk_add_u16 taking v0 and s0.
1993 define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
1994 ; GFX6-LABEL: uaddsat_v2i16_vs:
1996 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1997 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
1998 ; GFX6-NEXT: v_not_b32_e32 v2, v0
1999 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v2
2000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2001 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2002 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16
2003 ; GFX6-NEXT: v_not_b32_e32 v2, v1
2004 ; GFX6-NEXT: v_min_u32_e32 v2, s0, v2
2005 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
2006 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2007 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2008 ; GFX6-NEXT: ; return to shader part epilog
2010 ; GFX8-LABEL: uaddsat_v2i16_vs:
2012 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
2013 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2014 ; GFX8-NEXT: v_add_u16_e64 v1, v0, s0 clamp
2015 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2016 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
2017 ; GFX8-NEXT: ; return to shader part epilog
2019 ; GFX9-LABEL: uaddsat_v2i16_vs:
2021 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 clamp
2022 ; GFX9-NEXT: ; return to shader part epilog
2024 ; GFX10PLUS-LABEL: uaddsat_v2i16_vs:
2025 ; GFX10PLUS: ; %bb.0:
2026 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, v0, s0 clamp
2027 ; GFX10PLUS-NEXT: ; return to shader part epilog
2028 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2029 %cast = bitcast <2 x i16> %result to float
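2033 ; FIXME: v3i16 insert/extract. The <3 x i16> tests below are commented out (presumably until v3i16 insert/extract is handled; re-enable them once it is).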
2034 ; define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
2035 ; %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
2036 ; ret <3 x i16> %result
2039 ; define amdgpu_ps <3 x i16> @s_uaddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
2040 ; %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
2041 ; ret <3 x i16> %result
; Unsigned saturating add (llvm.uadd.sat.v4i16) of two <4 x i16> vectors;
; neither operand is marked inreg. The result is bitcast to <2 x float>,
; which is the declared return type.
; Expected code per check prefix:
;   * GFX6 checks: the four elements are handled one at a time (lhs in
;     v0..v3, rhs in v4..v7) as shift-left-16, lhs + umin(~lhs, rhs); pairs of
;     results are then packed into v0 and v1 with v_alignbit_b32.
;   * GFX8 checks: for each 32-bit half of the vector, a clamped v_add_u16
;     handles the low element and a clamped SDWA add on WORD_1 the high
;     element, merged with v_or_b32.
;   * GFX9 and GFX10PLUS checks: two clamped v_pk_add_u16 instructions.
2044 define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
2045 ; GFX6-LABEL: v_uaddsat_v4i16:
2047 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2048 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2049 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2050 ; GFX6-NEXT: v_not_b32_e32 v8, v0
2051 ; GFX6-NEXT: v_min_u32_e32 v4, v8, v4
2052 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2053 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
2054 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
2055 ; GFX6-NEXT: v_not_b32_e32 v5, v1
2056 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
2057 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2058 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
2059 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
2060 ; GFX6-NEXT: v_not_b32_e32 v5, v2
2061 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
2062 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2063 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
2064 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
2065 ; GFX6-NEXT: v_not_b32_e32 v5, v3
2066 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
2067 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
2068 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2069 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2070 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2071 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
2072 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2074 ; GFX8-LABEL: v_uaddsat_v4i16:
2076 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2077 ; GFX8-NEXT: v_add_u16_e64 v4, v0, v2 clamp
2078 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2079 ; GFX8-NEXT: v_add_u16_e64 v2, v1, v3 clamp
2080 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2081 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
2082 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
2083 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2085 ; GFX9-LABEL: v_uaddsat_v4i16:
2087 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2088 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 clamp
2089 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 clamp
2090 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2092 ; GFX10PLUS-LABEL: v_uaddsat_v4i16:
2093 ; GFX10PLUS: ; %bb.0:
2094 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2095 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, v0, v2 clamp
2096 ; GFX10PLUS-NEXT: v_pk_add_u16 v1, v1, v3 clamp
2097 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2098 %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2099 %cast = bitcast <4 x i16> %result to <2 x float>
2100 ret <2 x float> %cast
; Unsigned saturating add (llvm.uadd.sat.v4i16) in an amdgpu_ps function with
; both <4 x i16> operands marked inreg. The result is bitcast to <2 x i32>,
; which is the declared return type.
; Expected code per check prefix:
;   * GFX6 checks: four scalar per-element computations (shift left 16,
;     lhs + umin(~lhs, rhs)), then packing with v_alignbit_b32 and
;     v_readfirstlane_b32 copies into s0/s1.
;   * GFX8 checks: high halves are extracted with s_lshr_b32; low elements
;     use clamped v_add_u16, high elements clamped SDWA adds writing WORD_1;
;     halves are merged with v_or_b32 and read back to s0/s1.
;   * GFX9 checks: rhs (s2/s3) is copied into v0/v1, followed by two clamped
;     v_pk_add_u16 instructions and two v_readfirstlane_b32 copies.
;   * GFX10PLUS checks: two clamped v_pk_add_u16 instructions on SGPR
;     operands directly, then two v_readfirstlane_b32 copies.
2103 define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
2104 ; GFX6-LABEL: s_uaddsat_v4i16:
2106 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2107 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2108 ; GFX6-NEXT: s_not_b32 s8, s0
2109 ; GFX6-NEXT: s_min_u32 s4, s8, s4
2110 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2111 ; GFX6-NEXT: s_add_i32 s0, s0, s4
2112 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16
2113 ; GFX6-NEXT: s_not_b32 s5, s1
2114 ; GFX6-NEXT: s_min_u32 s4, s5, s4
2115 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2116 ; GFX6-NEXT: s_add_i32 s1, s1, s4
2117 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16
2118 ; GFX6-NEXT: s_not_b32 s5, s2
2119 ; GFX6-NEXT: s_min_u32 s4, s5, s4
2120 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2121 ; GFX6-NEXT: s_add_i32 s2, s2, s4
2122 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16
2123 ; GFX6-NEXT: s_not_b32 s5, s3
2124 ; GFX6-NEXT: s_min_u32 s4, s5, s4
2125 ; GFX6-NEXT: s_add_i32 s3, s3, s4
2126 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2127 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2128 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2129 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2130 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2131 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2132 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2133 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2134 ; GFX6-NEXT: ; return to shader part epilog
2136 ; GFX8-LABEL: s_uaddsat_v4i16:
2138 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16
2139 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16
2140 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
2141 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16
2142 ; GFX8-NEXT: v_mov_b32_e32 v1, s6
2143 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2144 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
2145 ; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2146 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
2147 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
2148 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
2149 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
2150 ; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
2151 ; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2152 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2153 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
2154 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2155 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2156 ; GFX8-NEXT: ; return to shader part epilog
2158 ; GFX9-LABEL: s_uaddsat_v4i16:
2160 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2161 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2162 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
2163 ; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp
2164 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2165 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2166 ; GFX9-NEXT: ; return to shader part epilog
2168 ; GFX10PLUS-LABEL: s_uaddsat_v4i16:
2169 ; GFX10PLUS: ; %bb.0:
2170 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, s0, s2 clamp
2171 ; GFX10PLUS-NEXT: v_pk_add_u16 v1, s1, s3 clamp
2172 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2173 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2174 ; GFX10PLUS-NEXT: ; return to shader part epilog
2175 %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2176 %cast = bitcast <4 x i16> %result to <2 x i32>
2181 ; define <5 x i16> @v_uaddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
2182 ; %result = call <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2183 ; ret <5 x i16> %result
2186 ; define amdgpu_ps <5 x i16> @s_uaddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
2187 ; %result = call <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2188 ; ret <5 x i16> %result
; Unsigned saturating add (llvm.uadd.sat.v6i16) of two <6 x i16> vectors;
; neither operand is marked inreg. The result is bitcast to <3 x float>,
; which is the declared return type.
; Expected code per check prefix:
;   * GFX6 checks: the six elements are handled one at a time (lhs in
;     v0..v5, rhs in v6..v11) as shift-left-16, lhs + umin(~lhs, rhs); pairs
;     of results are then packed into v0..v2 with v_alignbit_b32.
;   * GFX8 checks: for each 32-bit piece of the vector, a clamped v_add_u16
;     handles the low element and a clamped SDWA add on WORD_1 the high
;     element, merged with v_or_b32.
;   * GFX9 and GFX10PLUS checks: three clamped v_pk_add_u16 instructions.
2191 define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
2192 ; GFX6-LABEL: v_uaddsat_v6i16:
2194 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2195 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2196 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2197 ; GFX6-NEXT: v_not_b32_e32 v12, v0
2198 ; GFX6-NEXT: v_min_u32_e32 v6, v12, v6
2199 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2200 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6
2201 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7
2202 ; GFX6-NEXT: v_not_b32_e32 v7, v1
2203 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
2204 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2205 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6
2206 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8
2207 ; GFX6-NEXT: v_not_b32_e32 v7, v2
2208 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
2209 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2210 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
2211 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9
2212 ; GFX6-NEXT: v_not_b32_e32 v7, v3
2213 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
2214 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2215 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
2216 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10
2217 ; GFX6-NEXT: v_not_b32_e32 v7, v4
2218 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
2219 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2220 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
2221 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11
2222 ; GFX6-NEXT: v_not_b32_e32 v7, v5
2223 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
2224 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6
2225 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2226 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2227 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2228 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2229 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
2230 ; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16
2231 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2233 ; GFX8-LABEL: v_uaddsat_v6i16:
2235 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2236 ; GFX8-NEXT: v_add_u16_e64 v6, v0, v3 clamp
2237 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2238 ; GFX8-NEXT: v_add_u16_e64 v3, v1, v4 clamp
2239 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2240 ; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp
2241 ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2242 ; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
2243 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
2244 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
2245 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2247 ; GFX9-LABEL: v_uaddsat_v6i16:
2249 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2250 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 clamp
2251 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 clamp
2252 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 clamp
2253 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2255 ; GFX10PLUS-LABEL: v_uaddsat_v6i16:
2256 ; GFX10PLUS: ; %bb.0:
2257 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2258 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, v0, v3 clamp
2259 ; GFX10PLUS-NEXT: v_pk_add_u16 v1, v1, v4 clamp
2260 ; GFX10PLUS-NEXT: v_pk_add_u16 v2, v2, v5 clamp
2261 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2262 %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2263 %cast = bitcast <6 x i16> %result to <3 x float>
2264 ret <3 x float> %cast
; Unsigned saturating add (llvm.uadd.sat.v6i16) in an amdgpu_ps function with
; both <6 x i16> operands marked inreg. The result is bitcast to <3 x i32>,
; which is the declared return type.
; Expected code per check prefix:
;   * GFX6 checks: six scalar per-element computations (shift left 16,
;     lhs + umin(~lhs, rhs)), then packing with v_alignbit_b32 and
;     v_readfirstlane_b32 copies into s0..s2.
;   * GFX8 checks: high halves are extracted with s_lshr_b32; low elements
;     use clamped v_add_u16, high elements clamped SDWA adds writing WORD_1;
;     halves are merged with v_or_b32 and read back to s0..s2.
;   * GFX9 checks: rhs (s3..s5) is copied into v0..v2, followed by three
;     clamped v_pk_add_u16 instructions and three v_readfirstlane_b32 copies.
;   * GFX10PLUS checks: three clamped v_pk_add_u16 instructions on SGPR
;     operands directly, then three v_readfirstlane_b32 copies.
2267 define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
2268 ; GFX6-LABEL: s_uaddsat_v6i16:
2270 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2271 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
2272 ; GFX6-NEXT: s_not_b32 s12, s0
2273 ; GFX6-NEXT: s_min_u32 s6, s12, s6
2274 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2275 ; GFX6-NEXT: s_add_i32 s0, s0, s6
2276 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16
2277 ; GFX6-NEXT: s_not_b32 s7, s1
2278 ; GFX6-NEXT: s_min_u32 s6, s7, s6
2279 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2280 ; GFX6-NEXT: s_add_i32 s1, s1, s6
2281 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16
2282 ; GFX6-NEXT: s_not_b32 s7, s2
2283 ; GFX6-NEXT: s_min_u32 s6, s7, s6
2284 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2285 ; GFX6-NEXT: s_add_i32 s2, s2, s6
2286 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16
2287 ; GFX6-NEXT: s_not_b32 s7, s3
2288 ; GFX6-NEXT: s_min_u32 s6, s7, s6
2289 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2290 ; GFX6-NEXT: s_add_i32 s3, s3, s6
2291 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16
2292 ; GFX6-NEXT: s_not_b32 s7, s4
2293 ; GFX6-NEXT: s_min_u32 s6, s7, s6
2294 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
2295 ; GFX6-NEXT: s_add_i32 s4, s4, s6
2296 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16
2297 ; GFX6-NEXT: s_not_b32 s7, s5
2298 ; GFX6-NEXT: s_min_u32 s6, s7, s6
2299 ; GFX6-NEXT: s_add_i32 s5, s5, s6
2300 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2301 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2302 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16
2303 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2304 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2305 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2306 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2307 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2308 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
2309 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2310 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2311 ; GFX6-NEXT: v_readfirstlane_b32 s2, v2
2312 ; GFX6-NEXT: ; return to shader part epilog
2314 ; GFX8-LABEL: s_uaddsat_v6i16:
2316 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16
2317 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16
2318 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16
2319 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16
2320 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16
2321 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16
2322 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
2323 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
2324 ; GFX8-NEXT: v_mov_b32_e32 v3, s10
2325 ; GFX8-NEXT: v_mov_b32_e32 v4, s7
2326 ; GFX8-NEXT: v_mov_b32_e32 v0, s3
2327 ; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2328 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2329 ; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2330 ; GFX8-NEXT: v_mov_b32_e32 v4, s5
2331 ; GFX8-NEXT: v_mov_b32_e32 v5, s11
2332 ; GFX8-NEXT: v_mov_b32_e32 v6, s8
2333 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
2334 ; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
2335 ; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp
2336 ; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2337 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2338 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
2339 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
2340 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2341 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2342 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
2343 ; GFX8-NEXT: ; return to shader part epilog
2345 ; GFX9-LABEL: s_uaddsat_v6i16:
2347 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
2348 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2349 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
2350 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
2351 ; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp
2352 ; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp
2353 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2354 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2355 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2356 ; GFX9-NEXT: ; return to shader part epilog
2358 ; GFX10PLUS-LABEL: s_uaddsat_v6i16:
2359 ; GFX10PLUS: ; %bb.0:
2360 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, s0, s3 clamp
2361 ; GFX10PLUS-NEXT: v_pk_add_u16 v1, s1, s4 clamp
2362 ; GFX10PLUS-NEXT: v_pk_add_u16 v2, s2, s5 clamp
2363 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2364 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2365 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
2366 ; GFX10PLUS-NEXT: ; return to shader part epilog
2367 %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2368 %cast = bitcast <6 x i16> %result to <3 x i32>
; Unsigned saturating add of two <8 x i16> vectors passed in v registers; the
; result is bitcast to <4 x float> for the return.
; Expected code per check prefix:
;  - GFX6 checks: each element is shifted left by 16, rhs is clamped with
;    min(~lhs, rhs), the two are added, and pairs of results are repacked with
;    v_lshrrev_b32 / v_alignbit_b32.
;  - GFX8 checks: v_add_u16 with clamp for the low halves, the SDWA form
;    (WORD_1 selects) for the high halves, combined with v_or_b32.
;  - GFX9 and GFX10PLUS checks: one v_pk_add_u16 with clamp per 32-bit register.
2372 define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
2373 ; GFX6-LABEL: v_uaddsat_v8i16:
2375 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2376 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2377 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
2378 ; GFX6-NEXT: v_not_b32_e32 v16, v0
2379 ; GFX6-NEXT: v_min_u32_e32 v8, v16, v8
2380 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2381 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8
2382 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9
2383 ; GFX6-NEXT: v_not_b32_e32 v9, v1
2384 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
2385 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2386 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8
2387 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10
2388 ; GFX6-NEXT: v_not_b32_e32 v9, v2
2389 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
2390 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2391 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8
2392 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11
2393 ; GFX6-NEXT: v_not_b32_e32 v9, v3
2394 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
2395 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2396 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8
2397 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12
2398 ; GFX6-NEXT: v_not_b32_e32 v9, v4
2399 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
2400 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2401 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8
2402 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13
2403 ; GFX6-NEXT: v_not_b32_e32 v9, v5
2404 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
2405 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2406 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8
2407 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14
2408 ; GFX6-NEXT: v_not_b32_e32 v9, v6
2409 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
2410 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
2411 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8
2412 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15
2413 ; GFX6-NEXT: v_not_b32_e32 v9, v7
2414 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
2415 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8
2416 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2417 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2418 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2419 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2420 ; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
2421 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16
2422 ; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16
2423 ; GFX6-NEXT: v_alignbit_b32 v3, v7, v6, 16
2424 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2426 ; GFX8-LABEL: v_uaddsat_v8i16:
2428 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2429 ; GFX8-NEXT: v_add_u16_e64 v8, v0, v4 clamp
2430 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2431 ; GFX8-NEXT: v_add_u16_e64 v4, v1, v5 clamp
2432 ; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2433 ; GFX8-NEXT: v_add_u16_e64 v5, v2, v6 clamp
2434 ; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2435 ; GFX8-NEXT: v_add_u16_e64 v6, v3, v7 clamp
2436 ; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2437 ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
2438 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
2439 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
2440 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
2441 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2443 ; GFX9-LABEL: v_uaddsat_v8i16:
2445 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2446 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 clamp
2447 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v5 clamp
2448 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 clamp
2449 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 clamp
2450 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2452 ; GFX10PLUS-LABEL: v_uaddsat_v8i16:
2453 ; GFX10PLUS: ; %bb.0:
2454 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2455 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, v0, v4 clamp
2456 ; GFX10PLUS-NEXT: v_pk_add_u16 v1, v1, v5 clamp
2457 ; GFX10PLUS-NEXT: v_pk_add_u16 v2, v2, v6 clamp
2458 ; GFX10PLUS-NEXT: v_pk_add_u16 v3, v3, v7 clamp
2459 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2460 %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2461 %cast = bitcast <8 x i16> %result to <4 x float>
2462 ret <4 x float> %cast
; amdgpu_ps variant of the <8 x i16> unsigned saturating add with both operands
; marked inreg; the expected output reads them from s registers and returns the
; <4 x i32> bitcast of the result in s0-s3.
; Expected code per check prefix:
;  - GFX6 checks: scalar shift / s_not / s_min_u32 / s_add_i32 per element, then
;    packing through v_alignbit_b32 and v_readfirstlane_b32 back to s registers.
;  - GFX8 checks: high halves are extracted with s_lshr_b32, copied to v
;    registers and added with clamped v_add_u16 (SDWA for the high halves).
;  - GFX9 checks: rhs is copied to v registers before v_pk_add_u16 with clamp.
;  - GFX10PLUS checks: v_pk_add_u16 with clamp takes both s operands directly.
2465 define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
2466 ; GFX6-LABEL: s_uaddsat_v8i16:
2468 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16
2469 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16
2470 ; GFX6-NEXT: s_not_b32 s16, s0
2471 ; GFX6-NEXT: s_min_u32 s8, s16, s8
2472 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16
2473 ; GFX6-NEXT: s_add_i32 s0, s0, s8
2474 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16
2475 ; GFX6-NEXT: s_not_b32 s9, s1
2476 ; GFX6-NEXT: s_min_u32 s8, s9, s8
2477 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16
2478 ; GFX6-NEXT: s_add_i32 s1, s1, s8
2479 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16
2480 ; GFX6-NEXT: s_not_b32 s9, s2
2481 ; GFX6-NEXT: s_min_u32 s8, s9, s8
2482 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2483 ; GFX6-NEXT: s_add_i32 s2, s2, s8
2484 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16
2485 ; GFX6-NEXT: s_not_b32 s9, s3
2486 ; GFX6-NEXT: s_min_u32 s8, s9, s8
2487 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16
2488 ; GFX6-NEXT: s_add_i32 s3, s3, s8
2489 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16
2490 ; GFX6-NEXT: s_not_b32 s9, s4
2491 ; GFX6-NEXT: s_min_u32 s8, s9, s8
2492 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16
2493 ; GFX6-NEXT: s_add_i32 s4, s4, s8
2494 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16
2495 ; GFX6-NEXT: s_not_b32 s9, s5
2496 ; GFX6-NEXT: s_min_u32 s8, s9, s8
2497 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16
2498 ; GFX6-NEXT: s_add_i32 s5, s5, s8
2499 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16
2500 ; GFX6-NEXT: s_not_b32 s9, s6
2501 ; GFX6-NEXT: s_min_u32 s8, s9, s8
2502 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16
2503 ; GFX6-NEXT: s_add_i32 s6, s6, s8
2504 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16
2505 ; GFX6-NEXT: s_not_b32 s9, s7
2506 ; GFX6-NEXT: s_min_u32 s8, s9, s8
2507 ; GFX6-NEXT: s_add_i32 s7, s7, s8
2508 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16
2509 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16
2510 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16
2511 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16
2512 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
2513 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
2514 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2515 ; GFX6-NEXT: v_mov_b32_e32 v3, s6
2516 ; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
2517 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
2518 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
2519 ; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16
2520 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2521 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1
2522 ; GFX6-NEXT: v_readfirstlane_b32 s2, v2
2523 ; GFX6-NEXT: v_readfirstlane_b32 s3, v3
2524 ; GFX6-NEXT: ; return to shader part epilog
2526 ; GFX8-LABEL: s_uaddsat_v8i16:
2528 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16
2529 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16
2530 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16
2531 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16
2532 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16
2533 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16
2534 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16
2535 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16
2536 ; GFX8-NEXT: v_mov_b32_e32 v1, s12
2537 ; GFX8-NEXT: v_mov_b32_e32 v2, s8
2538 ; GFX8-NEXT: v_mov_b32_e32 v3, s13
2539 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
2540 ; GFX8-NEXT: v_mov_b32_e32 v5, s14
2541 ; GFX8-NEXT: v_mov_b32_e32 v6, s10
2542 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2543 ; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2544 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
2545 ; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2546 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
2547 ; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2548 ; GFX8-NEXT: v_mov_b32_e32 v6, s7
2549 ; GFX8-NEXT: v_mov_b32_e32 v7, s15
2550 ; GFX8-NEXT: v_mov_b32_e32 v8, s11
2551 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
2552 ; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
2553 ; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp
2554 ; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp
2555 ; GFX8-NEXT: v_add_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2556 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2557 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
2558 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
2559 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v7
2560 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2561 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
2562 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
2563 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
2564 ; GFX8-NEXT: ; return to shader part epilog
2566 ; GFX9-LABEL: s_uaddsat_v8i16:
2568 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2569 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2570 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
2571 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
2572 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
2573 ; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp
2574 ; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp
2575 ; GFX9-NEXT: v_pk_add_u16 v3, s3, v3 clamp
2576 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2577 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2578 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
2579 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
2580 ; GFX9-NEXT: ; return to shader part epilog
2582 ; GFX10PLUS-LABEL: s_uaddsat_v8i16:
2583 ; GFX10PLUS: ; %bb.0:
2584 ; GFX10PLUS-NEXT: v_pk_add_u16 v0, s0, s4 clamp
2585 ; GFX10PLUS-NEXT: v_pk_add_u16 v1, s1, s5 clamp
2586 ; GFX10PLUS-NEXT: v_pk_add_u16 v2, s2, s6 clamp
2587 ; GFX10PLUS-NEXT: v_pk_add_u16 v3, s3, s7 clamp
2588 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
2589 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
2590 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
2591 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
2592 ; GFX10PLUS-NEXT: ; return to shader part epilog
2593 %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2594 %cast = bitcast <8 x i16> %result to <4 x i32>
; Unsigned saturating add on the non-power-of-two width i48, operands in v
; registers.
; Expected code per check prefix:
;  - GFX6 checks: the high words are masked with 0xffff, a 64-bit add with carry
;    is done, and overflow is detected by comparing the high result against its
;    0xffff-masked copy before selecting -1.
;  - GFX8, GFX9 and GFX10PLUS checks: both operands are shifted left by 16 as
;    64-bit values, added with carry, replaced by -1 when the carry is set, and
;    shifted right by 16 again.
2598 define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
2599 ; GFX6-LABEL: v_uaddsat_i48:
2601 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2602 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2603 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
2604 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2605 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
2606 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2607 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
2608 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2609 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2610 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2611 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2612 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2613 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
2614 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2616 ; GFX8-LABEL: v_uaddsat_i48:
2618 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2619 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2620 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
2621 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
2622 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
2623 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2624 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2625 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2626 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2628 ; GFX9-LABEL: v_uaddsat_i48:
2630 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2631 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2632 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
2633 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
2634 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2635 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2636 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2637 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2638 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2640 ; GFX10PLUS-LABEL: v_uaddsat_i48:
2641 ; GFX10PLUS: ; %bb.0:
2642 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2643 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2644 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
2645 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
2646 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2647 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2648 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2649 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2650 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2651 %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
; amdgpu_ps variant of the i48 unsigned saturating add with both operands
; inreg; the expected output works entirely on s registers.
; Expected code per check prefix:
;  - GFX6 checks: the carry of the low add is saved with s_cselect_b32 and
;    re-established with s_cmp_lg_u32 around the 0xffff masking, then overflow
;    is detected by comparing the high result with its masked copy.
;  - GFX8, GFX9 and GFX10PLUS checks: identical sequences that shift both
;    operands left by 16, do s_add_u32 / s_addc_u32, select -1 with
;    s_cselect_b64, and shift right by 16.
2655 define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
2656 ; GFX6-LABEL: s_uaddsat_i48:
2658 ; GFX6-NEXT: s_add_u32 s0, s0, s2
2659 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0
2660 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2661 ; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
2662 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0
2663 ; GFX6-NEXT: s_addc_u32 s2, s1, s3
2664 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
2665 ; GFX6-NEXT: s_cmp_lg_u32 s2, s1
2666 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0
2667 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16
2668 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
2669 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16
2670 ; GFX6-NEXT: s_or_b32 s0, s0, s3
2671 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0
2672 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2673 ; GFX6-NEXT: ; return to shader part epilog
2675 ; GFX8-LABEL: s_uaddsat_i48:
2677 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2678 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
2679 ; GFX8-NEXT: s_add_u32 s0, s0, s2
2680 ; GFX8-NEXT: s_addc_u32 s1, s1, s3
2681 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2682 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
2683 ; GFX8-NEXT: ; return to shader part epilog
2685 ; GFX9-LABEL: s_uaddsat_i48:
2687 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2688 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
2689 ; GFX9-NEXT: s_add_u32 s0, s0, s2
2690 ; GFX9-NEXT: s_addc_u32 s1, s1, s3
2691 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2692 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
2693 ; GFX9-NEXT: ; return to shader part epilog
2695 ; GFX10PLUS-LABEL: s_uaddsat_i48:
2696 ; GFX10PLUS: ; %bb.0:
2697 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2698 ; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
2699 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2
2700 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
2701 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2702 ; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
2703 ; GFX10PLUS-NEXT: ; return to shader part epilog
2704 %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
; Mixed-operand i48 unsigned saturating add: lhs is inreg (read from s0/s1 in
; the expected output), rhs is a regular argument in v0/v1. The i48 result is
; zero-extended to i64 and bitcast to <2 x float> for the return.
; Expected code per check prefix:
;  - GFX6 checks: mask/add/compare expansion, with a final v_and_b32 of the high
;    word with 0xffff.
;  - GFX8 and GFX9 checks: shift-left-by-16 expansion; s1 is copied into v2
;    before the add-with-carry.
;  - GFX10PLUS checks: same expansion, but s1 is used directly as an operand of
;    v_add_co_ci_u32.
2708 define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
2709 ; GFX6-LABEL: uaddsat_i48_sv:
2711 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2712 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2713 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2714 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2715 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
2716 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
2717 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2718 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2719 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2720 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2721 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2722 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
2723 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2724 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2725 ; GFX6-NEXT: ; return to shader part epilog
2727 ; GFX8-LABEL: uaddsat_i48_sv:
2729 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2730 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2731 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2732 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2733 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
2734 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2735 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2736 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2737 ; GFX8-NEXT: ; return to shader part epilog
2739 ; GFX9-LABEL: uaddsat_i48_sv:
2741 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2742 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2743 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2744 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
2745 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
2746 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2747 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2748 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2749 ; GFX9-NEXT: ; return to shader part epilog
2751 ; GFX10PLUS-LABEL: uaddsat_i48_sv:
2752 ; GFX10PLUS: ; %bb.0:
2753 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2754 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2755 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
2756 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2757 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2758 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2759 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2760 ; GFX10PLUS-NEXT: ; return to shader part epilog
2761 %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
2762 %ext.result = zext i48 %result to i64
2763 %cast = bitcast i64 %ext.result to <2 x float>
2764 ret <2 x float> %cast
; Mirror of uaddsat_i48_sv with the operand kinds swapped: lhs is a regular
; argument in v0/v1 and rhs is inreg (read from s0/s1 in the expected output).
; The i48 result is zero-extended to i64 and bitcast to <2 x float>.
; The expected sequences match the _sv test except for operand order: the
; GFX6/GFX8/GFX9 add-with-carry takes (v1, v2) and the GFX10PLUS low add takes
; (v0, s0).
2767 define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
2768 ; GFX6-LABEL: uaddsat_i48_vs:
2770 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
2771 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2772 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2773 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2774 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
2775 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
2776 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2777 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2778 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2779 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
2780 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
2781 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
2782 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2783 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2784 ; GFX6-NEXT: ; return to shader part epilog
2786 ; GFX8-LABEL: uaddsat_i48_vs:
2788 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2789 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2790 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2791 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2792 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
2793 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2794 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2795 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2796 ; GFX8-NEXT: ; return to shader part epilog
2798 ; GFX9-LABEL: uaddsat_i48_vs:
2800 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2801 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2802 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2803 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
2804 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
2805 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2806 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2807 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2808 ; GFX9-NEXT: ; return to shader part epilog
2810 ; GFX10PLUS-LABEL: uaddsat_i48_vs:
2811 ; GFX10PLUS: ; %bb.0:
2812 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
2813 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
2814 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
2815 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2816 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2817 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2818 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
2819 ; GFX10PLUS-NEXT: ; return to shader part epilog
2820 %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
2821 %ext.result = zext i48 %result to i64
2822 %cast = bitcast i64 %ext.result to <2 x float>
2823 ret <2 x float> %cast
; Unsigned saturating add of two i64 values held in v register pairs.
; Every check prefix expects the same shape: a 32-bit add, a 32-bit
; add-with-carry, and two v_cndmask_b32 that replace both halves with -1 when
; the final carry is set. Only the add mnemonics and the carry register name
; (vcc vs vcc_lo) differ between targets.
2826 define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
2827 ; GFX6-LABEL: v_uaddsat_i64:
2829 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2830 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2831 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
2832 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2833 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2834 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2836 ; GFX8-LABEL: v_uaddsat_i64:
2838 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2839 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
2840 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
2841 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2842 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2843 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2845 ; GFX9-LABEL: v_uaddsat_i64:
2847 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2848 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
2849 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2850 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2851 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2852 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2854 ; GFX10PLUS-LABEL: v_uaddsat_i64:
2855 ; GFX10PLUS: ; %bb.0:
2856 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2857 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
2858 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2859 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2860 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2861 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
2862 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
; amdgpu_ps variant of the i64 unsigned saturating add with both operands
; inreg. All four check prefixes expect the same three scalar instructions:
; s_add_u32, s_addc_u32, then s_cselect_b64 choosing -1 over the sum.
2866 define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
2867 ; GFX6-LABEL: s_uaddsat_i64:
2869 ; GFX6-NEXT: s_add_u32 s0, s0, s2
2870 ; GFX6-NEXT: s_addc_u32 s1, s1, s3
2871 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2872 ; GFX6-NEXT: ; return to shader part epilog
2874 ; GFX8-LABEL: s_uaddsat_i64:
2876 ; GFX8-NEXT: s_add_u32 s0, s0, s2
2877 ; GFX8-NEXT: s_addc_u32 s1, s1, s3
2878 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2879 ; GFX8-NEXT: ; return to shader part epilog
2881 ; GFX9-LABEL: s_uaddsat_i64:
2883 ; GFX9-NEXT: s_add_u32 s0, s0, s2
2884 ; GFX9-NEXT: s_addc_u32 s1, s1, s3
2885 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2886 ; GFX9-NEXT: ; return to shader part epilog
2888 ; GFX10PLUS-LABEL: s_uaddsat_i64:
2889 ; GFX10PLUS: ; %bb.0:
2890 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2
2891 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
2892 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
2893 ; GFX10PLUS-NEXT: ; return to shader part epilog
2894 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
; Mixed-operand i64 unsigned saturating add: lhs is inreg (read from s0/s1 in
; the expected output), rhs is in v0/v1; the result is bitcast to <2 x float>.
; The GFX6, GFX8 and GFX9 checks copy s1 into v2 before the add-with-carry,
; while the GFX10PLUS checks use s1 directly as an operand.
2898 define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
2899 ; GFX6-LABEL: uaddsat_i64_sv:
2901 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2902 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2903 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
2904 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2905 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2906 ; GFX6-NEXT: ; return to shader part epilog
2908 ; GFX8-LABEL: uaddsat_i64_sv:
2910 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2911 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2912 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
2913 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2914 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2915 ; GFX8-NEXT: ; return to shader part epilog
2917 ; GFX9-LABEL: uaddsat_i64_sv:
2919 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2920 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
2921 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
2922 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2923 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2924 ; GFX9-NEXT: ; return to shader part epilog
2926 ; GFX10PLUS-LABEL: uaddsat_i64_sv:
2927 ; GFX10PLUS: ; %bb.0:
2928 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
2929 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2930 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2931 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2932 ; GFX10PLUS-NEXT: ; return to shader part epilog
2933 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
2934 %cast = bitcast i64 %result to <2 x float>
2935 ret <2 x float> %cast
; Mirror of uaddsat_i64_sv with the operand kinds swapped: lhs is in v0/v1 and
; rhs is inreg (read from s0/s1 in the expected output); the result is bitcast
; to <2 x float>.
; Compared with the _sv test, the GFX6/GFX8/GFX9 add-with-carry takes (v1, v2)
; and the GFX10PLUS low add takes (v0, s0).
2938 define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
2939 ; GFX6-LABEL: uaddsat_i64_vs:
2941 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
2942 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2943 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
2944 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2945 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2946 ; GFX6-NEXT: ; return to shader part epilog
2948 ; GFX8-LABEL: uaddsat_i64_vs:
2950 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2951 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2952 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
2953 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2954 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2955 ; GFX8-NEXT: ; return to shader part epilog
2957 ; GFX9-LABEL: uaddsat_i64_vs:
2959 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2960 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
2961 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
2962 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2963 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2964 ; GFX9-NEXT: ; return to shader part epilog
2966 ; GFX10PLUS-LABEL: uaddsat_i64_vs:
2967 ; GFX10PLUS: ; %bb.0:
2968 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
2969 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2970 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2971 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2972 ; GFX10PLUS-NEXT: ; return to shader part epilog
2973 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
2974 %cast = bitcast i64 %result to <2 x float>
2975 ret <2 x float> %cast
; Unsigned saturating add of two <2 x i64> vectors held in v registers.
; The GFX6, GFX8 and GFX9 checks handle the two elements one after the other,
; reusing vcc for each add / add-with-carry / select group.
; This function has separate GFX10 and GFX11 check blocks instead of a shared
; GFX10PLUS block: both interleave the two elements, keeping the first carry in
; vcc_lo and the second in a scalar register, which is s4 in the GFX10 checks
; and s0 in the GFX11 checks.
2978 define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
2979 ; GFX6-LABEL: v_uaddsat_v2i64:
2981 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2982 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
2983 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
2984 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2985 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2986 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
2987 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
2988 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
2989 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
2990 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2992 ; GFX8-LABEL: v_uaddsat_v2i64:
2994 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2995 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
2996 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
2997 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2998 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
2999 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
3000 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
3001 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3002 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3003 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3005 ; GFX9-LABEL: v_uaddsat_v2i64:
3007 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3008 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
3009 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
3010 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3011 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3012 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
3013 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
3014 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3015 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3016 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3018 ; GFX10-LABEL: v_uaddsat_v2i64:
3020 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3021 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
3022 ; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v6
3023 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
3024 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4
3025 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3026 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3027 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4
3028 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4
3029 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3031 ; GFX11-LABEL: v_uaddsat_v2i64:
3033 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3034 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
3035 ; GFX11-NEXT: v_add_co_u32 v2, s0, v2, v6
3036 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
3037 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v3, v7, s0
3038 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3039 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3040 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, s0
3041 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, s0
3042 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3043 %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
3044 ret <2 x i64> %result
; amdgpu_ps variant of the <2 x i64> unsigned saturating add with both operands
; inreg. All four check prefixes expect the same code: two independent
; s_add_u32 / s_addc_u32 / s_cselect_b64 groups, one per element, each selecting
; -1 over the 64-bit sum.
3047 define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
3048 ; GFX6-LABEL: s_uaddsat_v2i64:
3050 ; GFX6-NEXT: s_add_u32 s0, s0, s4
3051 ; GFX6-NEXT: s_addc_u32 s1, s1, s5
3052 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3053 ; GFX6-NEXT: s_add_u32 s2, s2, s6
3054 ; GFX6-NEXT: s_addc_u32 s3, s3, s7
3055 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3056 ; GFX6-NEXT: ; return to shader part epilog
3058 ; GFX8-LABEL: s_uaddsat_v2i64:
3060 ; GFX8-NEXT: s_add_u32 s0, s0, s4
3061 ; GFX8-NEXT: s_addc_u32 s1, s1, s5
3062 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3063 ; GFX8-NEXT: s_add_u32 s2, s2, s6
3064 ; GFX8-NEXT: s_addc_u32 s3, s3, s7
3065 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3066 ; GFX8-NEXT: ; return to shader part epilog
3068 ; GFX9-LABEL: s_uaddsat_v2i64:
3070 ; GFX9-NEXT: s_add_u32 s0, s0, s4
3071 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
3072 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3073 ; GFX9-NEXT: s_add_u32 s2, s2, s6
3074 ; GFX9-NEXT: s_addc_u32 s3, s3, s7
3075 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3076 ; GFX9-NEXT: ; return to shader part epilog
3078 ; GFX10PLUS-LABEL: s_uaddsat_v2i64:
3079 ; GFX10PLUS: ; %bb.0:
3080 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4
3081 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
3082 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3083 ; GFX10PLUS-NEXT: s_add_u32 s2, s2, s6
3084 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7
3085 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3086 ; GFX10PLUS-NEXT: ; return to shader part epilog
3087 %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
3088 ret <2 x i64> %result
; amdgpu_ps i128 unsigned saturating add with both operands inreg.
; All four check prefixes expect the same code: a four-instruction carry chain
; (s_add_u32 followed by three s_addc_u32) over s0-s3, then two s_cselect_b64
; that replace both 64-bit halves with -1.
3091 define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
3092 ; GFX6-LABEL: s_uaddsat_i128:
3094 ; GFX6-NEXT: s_add_u32 s0, s0, s4
3095 ; GFX6-NEXT: s_addc_u32 s1, s1, s5
3096 ; GFX6-NEXT: s_addc_u32 s2, s2, s6
3097 ; GFX6-NEXT: s_addc_u32 s3, s3, s7
3098 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3099 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3100 ; GFX6-NEXT: ; return to shader part epilog
3102 ; GFX8-LABEL: s_uaddsat_i128:
3104 ; GFX8-NEXT: s_add_u32 s0, s0, s4
3105 ; GFX8-NEXT: s_addc_u32 s1, s1, s5
3106 ; GFX8-NEXT: s_addc_u32 s2, s2, s6
3107 ; GFX8-NEXT: s_addc_u32 s3, s3, s7
3108 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3109 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3110 ; GFX8-NEXT: ; return to shader part epilog
3112 ; GFX9-LABEL: s_uaddsat_i128:
3114 ; GFX9-NEXT: s_add_u32 s0, s0, s4
3115 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
3116 ; GFX9-NEXT: s_addc_u32 s2, s2, s6
3117 ; GFX9-NEXT: s_addc_u32 s3, s3, s7
3118 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3119 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3120 ; GFX9-NEXT: ; return to shader part epilog
3122 ; GFX10PLUS-LABEL: s_uaddsat_i128:
3123 ; GFX10PLUS: ; %bb.0:
3124 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4
3125 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
3126 ; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s6
3127 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7
3128 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3129 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3130 ; GFX10PLUS-NEXT: ; return to shader part epilog
3131 %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
; i128 unsigned saturating add where %lhs is marked inreg and %rhs is a plain
; argument. The i128 result is bitcast to <4 x float>, which is the return type
; of this amdgpu_ps function.
; In the expected output below, the tahiti/fiji/gfx900 sequences copy s1-s3
; into v4/v5 before the carry-chain adds, while the shared gfx1010/gfx1100
; sequence uses s0-s3 directly as add operands. Every sequence ends with one
; v_cndmask against -1 per result dword, keyed on the final carry register.
3135 define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
3136 ; GFX6-LABEL: uaddsat_i128_sv:
3138 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
3139 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
3140 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
3141 ; GFX6-NEXT: v_mov_b32_e32 v4, s2
3142 ; GFX6-NEXT: v_mov_b32_e32 v5, s3
3143 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
3144 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
3145 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3146 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3147 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3148 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3149 ; GFX6-NEXT: ; return to shader part epilog
3151 ; GFX8-LABEL: uaddsat_i128_sv:
3153 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
3154 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
3155 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
3156 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
3157 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
3158 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
3159 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
3160 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3161 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3162 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3163 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3164 ; GFX8-NEXT: ; return to shader part epilog
3166 ; GFX9-LABEL: uaddsat_i128_sv:
3168 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
3169 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
3170 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
3171 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
3172 ; GFX9-NEXT: v_mov_b32_e32 v5, s3
3173 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
3174 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
3175 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3176 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3177 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3178 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3179 ; GFX9-NEXT: ; return to shader part epilog
3181 ; GFX10PLUS-LABEL: uaddsat_i128_sv:
3182 ; GFX10PLUS: ; %bb.0:
3183 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
3184 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3185 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3186 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3187 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3188 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3189 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
3190 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
3191 ; GFX10PLUS-NEXT: ; return to shader part epilog
3192 %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
3193 %cast = bitcast i128 %result to <4 x float>
3194 ret <4 x float> %cast
; i128 unsigned saturating add where %lhs is a plain argument and %rhs is
; marked inreg (the operand roles are swapped relative to @uaddsat_i128_sv).
; The i128 result is bitcast to <4 x float>, which is the return type of this
; amdgpu_ps function.
; In the expected output below, the tahiti/fiji/gfx900 sequences copy s1-s3
; into v4/v5 and pass them as the second source of the carry-chain adds; the
; shared gfx1010/gfx1100 sequence uses s0-s3 directly as add operands.
3197 define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
3198 ; GFX6-LABEL: uaddsat_i128_vs:
3200 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
3201 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
3202 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
3203 ; GFX6-NEXT: v_mov_b32_e32 v4, s2
3204 ; GFX6-NEXT: v_mov_b32_e32 v5, s3
3205 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
3206 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
3207 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3208 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3209 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3210 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3211 ; GFX6-NEXT: ; return to shader part epilog
3213 ; GFX8-LABEL: uaddsat_i128_vs:
3215 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
3216 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
3217 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
3218 ; GFX8-NEXT: v_mov_b32_e32 v4, s2
3219 ; GFX8-NEXT: v_mov_b32_e32 v5, s3
3220 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
3221 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
3222 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3223 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3224 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3225 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3226 ; GFX8-NEXT: ; return to shader part epilog
3228 ; GFX9-LABEL: uaddsat_i128_vs:
3230 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
3231 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
3232 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
3233 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
3234 ; GFX9-NEXT: v_mov_b32_e32 v5, s3
3235 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
3236 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
3237 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3238 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3239 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3240 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3241 ; GFX9-NEXT: ; return to shader part epilog
3243 ; GFX10PLUS-LABEL: uaddsat_i128_vs:
3244 ; GFX10PLUS: ; %bb.0:
3245 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
3246 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3247 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3248 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3249 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3250 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3251 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
3252 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
3253 ; GFX10PLUS-NEXT: ; return to shader part epilog
3254 %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
3255 %cast = bitcast i128 %result to <4 x float>
3256 ret <4 x float> %cast
; <2 x i128> unsigned saturating add with both operands passed as ordinary
; (non-inreg) arguments, returned directly as <2 x i128>.
; The expected output treats the two i128 elements independently: v0-v3 are
; added to v8-v11 and v4-v7 to v12-v15 through a carry chain, and each result
; dword then goes through a v_cndmask against -1 keyed on that chain's carry.
; gfx1010 and gfx1100 use their own check prefixes for this function instead of
; the shared one; the two sequences differ only in the SGPR that holds the
; second element's carry (s4 versus s0).
3259 define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
3260 ; GFX6-LABEL: v_uaddsat_v2i128:
3262 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3263 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8
3264 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
3265 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc
3266 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
3267 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3268 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3269 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3270 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3271 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v12
3272 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
3273 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc
3274 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
3275 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
3276 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
3277 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
3278 ; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, -1, vcc
3279 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3281 ; GFX8-LABEL: v_uaddsat_v2i128:
3283 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3284 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
3285 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
3286 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc
3287 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
3288 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3289 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3290 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3291 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3292 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v12
3293 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
3294 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc
3295 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
3296 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
3297 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
3298 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
3299 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, -1, vcc
3300 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3302 ; GFX9-LABEL: v_uaddsat_v2i128:
3304 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3305 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
3306 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v9, vcc
3307 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v10, vcc
3308 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v11, vcc
3309 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
3310 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
3311 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
3312 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
3313 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v12
3314 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v13, vcc
3315 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc
3316 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
3317 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
3318 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
3319 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
3320 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, -1, vcc
3321 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3323 ; GFX10-LABEL: v_uaddsat_v2i128:
3325 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3326 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
3327 ; GFX10-NEXT: v_add_co_u32 v4, s4, v4, v12
3328 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3329 ; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, v5, v13, s4
3330 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3331 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, v6, v14, s4
3332 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3333 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v15, s4
3334 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3335 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3336 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
3337 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
3338 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, -1, s4
3339 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, -1, s4
3340 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, -1, s4
3341 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, -1, s4
3342 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3344 ; GFX11-LABEL: v_uaddsat_v2i128:
3346 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3347 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
3348 ; GFX11-NEXT: v_add_co_u32 v4, s0, v4, v12
3349 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3350 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v5, v13, s0
3351 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3352 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v6, v14, s0
3353 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3354 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v7, v15, s0
3355 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3356 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3357 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
3358 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
3359 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0
3360 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, -1, s0
3361 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, -1, s0
3362 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, -1, s0
3363 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3364 %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3365 ret <2 x i128> %result
; <2 x i128> unsigned saturating add in an amdgpu_ps function with both
; operands marked inreg, returned directly as <2 x i128>.
; The expected output is the same scalar instruction sequence for every check
; prefix: per i128 element, an s_add_u32 followed by three s_addc_u32
; (s0-s3 with s8-s11, then s4-s7 with s12-s15), and then two s_cselect_b64
; against -1, one per 64-bit half of the element.
3368 define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
3369 ; GFX6-LABEL: s_uaddsat_v2i128:
3371 ; GFX6-NEXT: s_add_u32 s0, s0, s8
3372 ; GFX6-NEXT: s_addc_u32 s1, s1, s9
3373 ; GFX6-NEXT: s_addc_u32 s2, s2, s10
3374 ; GFX6-NEXT: s_addc_u32 s3, s3, s11
3375 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3376 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3377 ; GFX6-NEXT: s_add_u32 s4, s4, s12
3378 ; GFX6-NEXT: s_addc_u32 s5, s5, s13
3379 ; GFX6-NEXT: s_addc_u32 s6, s6, s14
3380 ; GFX6-NEXT: s_addc_u32 s7, s7, s15
3381 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
3382 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
3383 ; GFX6-NEXT: ; return to shader part epilog
3385 ; GFX8-LABEL: s_uaddsat_v2i128:
3387 ; GFX8-NEXT: s_add_u32 s0, s0, s8
3388 ; GFX8-NEXT: s_addc_u32 s1, s1, s9
3389 ; GFX8-NEXT: s_addc_u32 s2, s2, s10
3390 ; GFX8-NEXT: s_addc_u32 s3, s3, s11
3391 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3392 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3393 ; GFX8-NEXT: s_add_u32 s4, s4, s12
3394 ; GFX8-NEXT: s_addc_u32 s5, s5, s13
3395 ; GFX8-NEXT: s_addc_u32 s6, s6, s14
3396 ; GFX8-NEXT: s_addc_u32 s7, s7, s15
3397 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
3398 ; GFX8-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
3399 ; GFX8-NEXT: ; return to shader part epilog
3401 ; GFX9-LABEL: s_uaddsat_v2i128:
3403 ; GFX9-NEXT: s_add_u32 s0, s0, s8
3404 ; GFX9-NEXT: s_addc_u32 s1, s1, s9
3405 ; GFX9-NEXT: s_addc_u32 s2, s2, s10
3406 ; GFX9-NEXT: s_addc_u32 s3, s3, s11
3407 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3408 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3409 ; GFX9-NEXT: s_add_u32 s4, s4, s12
3410 ; GFX9-NEXT: s_addc_u32 s5, s5, s13
3411 ; GFX9-NEXT: s_addc_u32 s6, s6, s14
3412 ; GFX9-NEXT: s_addc_u32 s7, s7, s15
3413 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
3414 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
3415 ; GFX9-NEXT: ; return to shader part epilog
3417 ; GFX10PLUS-LABEL: s_uaddsat_v2i128:
3418 ; GFX10PLUS: ; %bb.0:
3419 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s8
3420 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s9
3421 ; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s10
3422 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s11
3423 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
3424 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
3425 ; GFX10PLUS-NEXT: s_add_u32 s4, s4, s12
3426 ; GFX10PLUS-NEXT: s_addc_u32 s5, s5, s13
3427 ; GFX10PLUS-NEXT: s_addc_u32 s6, s6, s14
3428 ; GFX10PLUS-NEXT: s_addc_u32 s7, s7, s15
3429 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
3430 ; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
3431 ; GFX10PLUS-NEXT: ; return to shader part epilog
3432 %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3433 ret <2 x i128> %result
; Declarations of the llvm.uadd.sat intrinsic overloads, listed in order of
; increasing element width (i7/i8, i16, i24, i32, i48, i64, i128) with the
; scalar form first and its vector forms after it. Every declaration carries
; attribute group #0, which is defined on the last line.
3436 declare i7 @llvm.uadd.sat.i7(i7, i7) #0
3437 declare i8 @llvm.uadd.sat.i8(i8, i8) #0
3438 declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) #0
3439 declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) #0
3441 declare i16 @llvm.uadd.sat.i16(i16, i16) #0
3442 declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
3443 declare <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
3444 declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
3445 declare <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16>, <5 x i16>) #0
3446 declare <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16>, <6 x i16>) #0
3447 declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) #0
3449 declare i24 @llvm.uadd.sat.i24(i24, i24) #0
3451 declare i32 @llvm.uadd.sat.i32(i32, i32) #0
3452 declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
3453 declare <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32>, <3 x i32>) #0
3454 declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) #0
3455 declare <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32>, <5 x i32>) #0
3456 declare <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32>, <16 x i32>) #0
3458 declare i48 @llvm.uadd.sat.i48(i48, i48) #0
3460 declare i64 @llvm.uadd.sat.i64(i64, i64) #0
3461 declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) #0
3463 declare i128 @llvm.uadd.sat.i128(i128, i128) #0
3464 declare <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128>, <2 x i128>) #0
3466 attributes #0 = { nounwind readnone speculatable willreturn }