1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
; Signed saturating add of two i8 values through @llvm.sadd.sat.i8.
; Expected code per the assertions below:
;  - tahiti run: v_bfe_i32 on the low 8 bits of each operand, a 32-bit add,
;    then v_med3_i32 against the constants 0xff80 and 0x7f.
;  - fiji run: one SDWA 16-bit add with sext() byte operands, followed by
;    v_min_i16 with 0x7f and v_max_i16 with 0xff80.
;  - gfx900, gfx1010 and gfx1100 runs: both operands shifted left by 8, a
;    16-bit add with the clamp modifier, then an arithmetic shift right by 8.
;    The true16 gfx1100 run uses the v0.l / v0.h register halves.
9 define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
10 ; GFX6-LABEL: v_saddsat_i8:
12 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
14 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
15 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
16 ; GFX6-NEXT: s_movk_i32 s4, 0xff80
17 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f
18 ; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1
19 ; GFX6-NEXT: s_setpc_b64 s[30:31]
21 ; GFX8-LABEL: v_saddsat_i8:
23 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24 ; GFX8-NEXT: v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
25 ; GFX8-NEXT: v_min_i16_e32 v0, 0x7f, v0
26 ; GFX8-NEXT: v_max_i16_e32 v0, 0xff80, v0
27 ; GFX8-NEXT: s_setpc_b64 s[30:31]
29 ; GFX9-LABEL: v_saddsat_i8:
31 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
33 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
34 ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
35 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
36 ; GFX9-NEXT: s_setpc_b64 s[30:31]
38 ; GFX10-LABEL: v_saddsat_i8:
40 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
42 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
43 ; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
44 ; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
45 ; GFX10-NEXT: s_setpc_b64 s[30:31]
47 ; GFX11-TRUE16-LABEL: v_saddsat_i8:
48 ; GFX11-TRUE16: ; %bb.0:
49 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
51 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
52 ; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v0.h clamp
53 ; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v0.l
54 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
56 ; GFX11-FAKE16-LABEL: v_saddsat_i8:
57 ; GFX11-FAKE16: ; %bb.0:
58 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
60 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0
61 ; GFX11-FAKE16-NEXT: v_add_nc_i16 v0, v0, v1 clamp
62 ; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0
63 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
64 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
; Signed saturating add of two i16 values through @llvm.sadd.sat.i16.
; Expected code per the assertions below:
;  - tahiti run: v_bfe_i32 on the low 16 bits of each operand, a 32-bit add,
;    then v_med3_i32 against the constants 0x8000 and 0x7fff.
;  - fiji run: a 16-bit add plus two compares whose masks are xor'ed and used
;    by v_cndmask to pick between the sum and a value built from the sum's
;    sign (ashr by 15, xor with 0xffff8000).
;  - gfx900, gfx1010 and gfx1100 runs: a single 16-bit add with clamp; the
;    true16 gfx1100 run first copies v1.l into v0.h.
68 define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
69 ; GFX6-LABEL: v_saddsat_i16:
71 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
73 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
74 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
75 ; GFX6-NEXT: s_movk_i32 s4, 0x8000
76 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x7fff
77 ; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1
78 ; GFX6-NEXT: s_setpc_b64 s[30:31]
80 ; GFX8-LABEL: v_saddsat_i16:
82 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1
84 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1
85 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
86 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
87 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
88 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
89 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
90 ; GFX8-NEXT: s_setpc_b64 s[30:31]
92 ; GFX9-LABEL: v_saddsat_i16:
94 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95 ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
96 ; GFX9-NEXT: s_setpc_b64 s[30:31]
98 ; GFX10-LABEL: v_saddsat_i16:
100 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101 ; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
102 ; GFX10-NEXT: s_setpc_b64 s[30:31]
104 ; GFX11-TRUE16-LABEL: v_saddsat_i16:
105 ; GFX11-TRUE16: ; %bb.0:
106 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
108 ; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v0.h clamp
109 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
111 ; GFX11-FAKE16-LABEL: v_saddsat_i16:
112 ; GFX11-FAKE16: ; %bb.0:
113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114 ; GFX11-FAKE16-NEXT: v_add_nc_i16 v0, v0, v1 clamp
115 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
116 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
; Signed saturating add of two i32 values through @llvm.sadd.sat.i32.
; Expected code per the assertions below:
;  - tahiti and fiji runs: a 32-bit add plus two compares whose masks are
;    xor'ed and used by v_cndmask to pick between the sum and a value built
;    from the sum's sign (ashr by 31, xor with 0x80000000).
;  - gfx900 run: a single v_add_i32 with clamp.
;  - gfx1010 and gfx1100 runs: one shared block expecting v_add_nc_i32 with
;    clamp.
120 define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
121 ; GFX6-LABEL: v_saddsat_i32:
123 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
125 ; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1
126 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
127 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1
128 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
129 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
130 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
131 ; GFX6-NEXT: s_setpc_b64 s[30:31]
133 ; GFX8-LABEL: v_saddsat_i32:
135 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
137 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1
138 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
139 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1
140 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
141 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
142 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
143 ; GFX8-NEXT: s_setpc_b64 s[30:31]
145 ; GFX9-LABEL: v_saddsat_i32:
147 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
148 ; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp
149 ; GFX9-NEXT: s_setpc_b64 s[30:31]
151 ; GFX10PLUS-LABEL: v_saddsat_i32:
152 ; GFX10PLUS: ; %bb.0:
153 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v1 clamp
155 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
156 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
; Signed saturating add of two <2 x i16> vectors through
; @llvm.sadd.sat.v2i16; the vector result is returned directly.
; Expected code per the assertions below:
;  - tahiti run: two separate 32-bit adds, each clamped with v_med3_i32
;    against 0x8000 and 0x7fff, then shift/and/or instructions on the results.
;  - fiji run: the upper halves are extracted with 16-bit right shifts, each
;    element goes through the add + compare/xor/v_cndmask sequence, and the
;    halves are recombined with an SDWA v_or_b32.
;  - gfx900, gfx1010 and gfx1100 runs: one v_pk_add_i16 with clamp.
160 define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
161 ; GFX6-LABEL: v_saddsat_v2i16:
163 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
165 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
166 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
167 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
168 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
169 ; GFX6-NEXT: s_movk_i32 s4, 0x8000
170 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff
171 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
172 ; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3
173 ; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3
174 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
175 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
176 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
177 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
178 ; GFX6-NEXT: s_setpc_b64 s[30:31]
180 ; GFX8-LABEL: v_saddsat_v2i16:
182 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
184 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
185 ; GFX8-NEXT: v_add_u16_e32 v4, v3, v2
186 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3
187 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
188 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4
189 ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
190 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
191 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
192 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1
193 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1
194 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
195 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
196 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
197 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
198 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
199 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
200 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
201 ; GFX8-NEXT: s_setpc_b64 s[30:31]
203 ; GFX9-LABEL: v_saddsat_v2i16:
205 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
206 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
207 ; GFX9-NEXT: s_setpc_b64 s[30:31]
209 ; GFX10PLUS-LABEL: v_saddsat_v2i16:
210 ; GFX10PLUS: ; %bb.0:
211 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v1 clamp
213 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
214 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
215 ret <2 x i16> %result
; Signed saturating add of two <3 x i16> vectors through
; @llvm.sadd.sat.v3i16; the vector result is returned directly.
; Expected code per the assertions below:
;  - tahiti run: three separate 32-bit adds, each clamped with v_med3_i32
;    against 0x8000 and 0x7fff, followed by shift/and/or/alignbit
;    instructions on the results.
;  - fiji run: three 16-bit adds, each followed by the compare/xor/v_cndmask
;    sequence, with an SDWA v_or_b32 writing v0.
;  - gfx900, gfx1010 and gfx1100 runs: two v_pk_add_i16 with clamp (the two
;    instructions appear in opposite order in the gfx900 block and the shared
;    gfx1010/gfx1100 block).
218 define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
219 ; GFX6-LABEL: v_saddsat_v3i16:
221 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
223 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
224 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
225 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
226 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
227 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
228 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
229 ; GFX6-NEXT: s_movk_i32 s4, 0x8000
230 ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff
231 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
232 ; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4
233 ; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4
234 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
235 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
236 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
237 ; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4
238 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
239 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
240 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
241 ; GFX6-NEXT: s_setpc_b64 s[30:31]
243 ; GFX8-LABEL: v_saddsat_v3i16:
245 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
247 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
248 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4
249 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
250 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
251 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
252 ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4
253 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
254 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
255 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3
256 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3
257 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
258 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
259 ; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
260 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
261 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
262 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2
263 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2
264 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
265 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
266 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
267 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
268 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
269 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
270 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
271 ; GFX8-NEXT: s_setpc_b64 s[30:31]
273 ; GFX9-LABEL: v_saddsat_v3i16:
275 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276 ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp
277 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp
278 ; GFX9-NEXT: s_setpc_b64 s[30:31]
280 ; GFX10PLUS-LABEL: v_saddsat_v3i16:
281 ; GFX10PLUS: ; %bb.0:
282 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp
284 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp
285 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
286 %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
287 ret <3 x i16> %result
; Signed saturating add of two <4 x i16> vectors through
; @llvm.sadd.sat.v4i16. Unlike the other vector tests here, the result is
; bitcast to <2 x float> and returned with that type.
; Expected code per the assertions below:
;  - tahiti run: four separate 32-bit adds, each clamped with v_med3_i32
;    against 0x8000 and 0x7fff, then shift/and/or instructions writing v0
;    and v1.
;  - fiji run: four 16-bit adds, each followed by the compare/xor/v_cndmask
;    sequence, with two SDWA v_or_b32 instructions writing v0 and v1.
;  - gfx900, gfx1010 and gfx1100 runs: two v_pk_add_i16 with clamp.
290 define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
291 ; GFX6-LABEL: v_saddsat_v4i16:
293 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
295 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
296 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
297 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
298 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
299 ; GFX6-NEXT: s_movk_i32 s4, 0x8000
300 ; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff
301 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
302 ; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5
303 ; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5
304 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
305 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
306 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
307 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
308 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
309 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
310 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
311 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7
312 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
313 ; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5
314 ; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5
315 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
316 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
317 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
318 ; GFX6-NEXT: s_setpc_b64 s[30:31]
320 ; GFX8-LABEL: v_saddsat_v4i16:
322 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
324 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
325 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4
326 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
327 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
328 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
329 ; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4
330 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
331 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
332 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2
333 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2
334 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
335 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
336 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
337 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
338 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
339 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
340 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
341 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
342 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
343 ; GFX8-NEXT: v_add_u16_e32 v5, v4, v2
344 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4
345 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
346 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5
347 ; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
348 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
349 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
350 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3
351 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3
352 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
353 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
354 ; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
355 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
356 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
357 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
358 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
359 ; GFX8-NEXT: s_setpc_b64 s[30:31]
361 ; GFX9-LABEL: v_saddsat_v4i16:
363 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp
365 ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp
366 ; GFX9-NEXT: s_setpc_b64 s[30:31]
368 ; GFX10PLUS-LABEL: v_saddsat_v4i16:
369 ; GFX10PLUS: ; %bb.0:
370 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371 ; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp
372 ; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp
373 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
374 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
375 %cast = bitcast <4 x i16> %result to <2 x float>
376 ret <2 x float> %cast
; Signed saturating add of two <2 x i32> vectors through
; @llvm.sadd.sat.v2i32; the vector result is returned directly.
; Expected code per the assertions below:
;  - tahiti and fiji runs: for each of the two elements, a 32-bit add plus
;    two compares whose masks are xor'ed and used by v_cndmask to pick between
;    the sum and a value built from the sum's sign (ashr by 31, xor with
;    0x80000000).
;  - gfx900 run: two v_add_i32 with clamp.
;  - gfx1010 and gfx1100 runs: one shared block expecting two v_add_nc_i32
;    with clamp.
379 define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
380 ; GFX6-LABEL: v_saddsat_v2i32:
382 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
384 ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2
385 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
386 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2
387 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
388 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
389 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
390 ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3
391 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
392 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
393 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2
394 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
395 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
396 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
397 ; GFX6-NEXT: s_setpc_b64 s[30:31]
399 ; GFX8-LABEL: v_saddsat_v2i32:
401 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
403 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2
404 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
405 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2
406 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
407 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
408 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
409 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3
410 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
411 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
412 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2
413 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
414 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
415 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
416 ; GFX8-NEXT: s_setpc_b64 s[30:31]
418 ; GFX9-LABEL: v_saddsat_v2i32:
420 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421 ; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
422 ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
423 ; GFX9-NEXT: s_setpc_b64 s[30:31]
425 ; GFX10PLUS-LABEL: v_saddsat_v2i32:
426 ; GFX10PLUS: ; %bb.0:
427 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v2 clamp
429 ; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v3 clamp
430 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
431 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
432 ret <2 x i32> %result
; Signed saturating add of two i64 values through @llvm.sadd.sat.i64.
; No run in this test expects a clamped add for this width. Every block
; below checks an expanded sequence: an add followed by an add-with-carry for
; the high half, two 64-bit compares whose masks are xor'ed, and v_cndmask
; selects for the low and high result registers.
;  - tahiti, fiji and gfx900 runs: 64-bit masks (vcc, s[4:5], s_xor_b64).
;  - gfx1010 and gfx1100 runs: 32-bit masks (vcc_lo, s_xor_b32). They are
;    checked in separate blocks: gfx1010 keeps one compare mask in s4 and
;    ends with two v_cndmask_b32, while gfx1100 uses s0 and a single
;    v_dual_cndmask_b32.
435 define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
436 ; GFX6-LABEL: v_saddsat_i64:
438 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
440 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
441 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
442 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
443 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
444 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
445 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
446 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
447 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
448 ; GFX6-NEXT: s_setpc_b64 s[30:31]
450 ; GFX8-LABEL: v_saddsat_i64:
452 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
454 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
455 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
456 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
457 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
458 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
459 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
460 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
461 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
462 ; GFX8-NEXT: s_setpc_b64 s[30:31]
464 ; GFX9-LABEL: v_saddsat_i64:
466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
468 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
469 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
470 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
471 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
472 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
473 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
474 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
475 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
476 ; GFX9-NEXT: s_setpc_b64 s[30:31]
478 ; GFX10-LABEL: v_saddsat_i64:
480 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
481 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
482 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
483 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
484 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
485 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
486 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
487 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
488 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
489 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
490 ; GFX10-NEXT: s_setpc_b64 s[30:31]
492 ; GFX11-LABEL: v_saddsat_i64:
494 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495 ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
496 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
497 ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
498 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
499 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
500 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
501 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
502 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
503 ; GFX11-NEXT: s_setpc_b64 s[30:31]
504 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
; Declarations of the llvm.sadd.sat overloads called by the test functions in
; this file: scalar i8, i16, i32 and i64, plus the <2 x i16>, <3 x i16>,
; <4 x i16> and <2 x i32> vector forms. Every declaration references attribute
; group #0.
508 declare i8 @llvm.sadd.sat.i8(i8, i8) #0
509 declare i16 @llvm.sadd.sat.i16(i16, i16) #0
510 declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
511 declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
512 declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
513 declare i32 @llvm.sadd.sat.i32(i32, i32) #0
514 declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
515 declare i64 @llvm.sadd.sat.i64(i64, i64) #0