1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
; Signed saturating add of two i8 arguments through @llvm.sadd.sat.i8.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script rather than editing them by hand.
; What the recorded sequences show, per check prefix:
;  - GFX6 sign-extends both operands with v_bfe_i32 (width 8), adds in 32 bits
;    and clamps with v_min_i32 0x7f followed by v_max_i32 0xffffff80.
;  - GFX8 uses one SDWA add with sext BYTE_0 selects, then a 16-bit min/max
;    against 0x7f and 0xff80.
;  - GFX9 and GFX10 shift both operands left by 8, perform a 16-bit add with
;    the clamp modifier, and shift the result arithmetically right by 8.
7 define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
8 ; GFX6-LABEL: v_saddsat_i8:
10 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
12 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
13 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
14 ; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0
15 ; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0
16 ; GFX6-NEXT: s_setpc_b64 s[30:31]
18 ; GFX8-LABEL: v_saddsat_i8:
20 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21 ; GFX8-NEXT: v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
22 ; GFX8-NEXT: v_min_i16_e32 v0, 0x7f, v0
23 ; GFX8-NEXT: v_max_i16_e32 v0, 0xff80, v0
24 ; GFX8-NEXT: s_setpc_b64 s[30:31]
26 ; GFX9-LABEL: v_saddsat_i8:
28 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
30 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
31 ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
32 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
33 ; GFX9-NEXT: s_setpc_b64 s[30:31]
35 ; GFX10-LABEL: v_saddsat_i8:
37 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
39 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
40 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
41 ; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
42 ; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
43 ; GFX10-NEXT: s_setpc_b64 s[30:31]
44 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
; Signed saturating add of two i16 arguments through @llvm.sadd.sat.i16.
; The check lines are autogenerated; regenerate them instead of hand-editing.
; What the recorded sequences show, per check prefix:
;  - GFX6 sign-extends both operands with v_bfe_i32 (width 16), adds in
;    32 bits and clamps with v_min_i32 0x7fff / v_max_i32 0xffff8000.
;  - GFX8 has no clamped add in its sequence. It compares rhs against 0, adds,
;    compares the sum against lhs, XORs the two conditions and uses the result
;    to select between the plain sum and (sum ashr 15) xor 0xffff8000.
;  - GFX9 uses a single v_add_i16 with the clamp modifier.
;  - GFX10 uses a single v_add_nc_i16 with the clamp modifier.
48 define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
49 ; GFX6-LABEL: v_saddsat_i16:
51 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
53 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
54 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
55 ; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
56 ; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
57 ; GFX6-NEXT: s_setpc_b64 s[30:31]
59 ; GFX8-LABEL: v_saddsat_i16:
61 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1
63 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1
64 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
65 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
66 ; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
67 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
68 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
69 ; GFX8-NEXT: s_setpc_b64 s[30:31]
71 ; GFX9-LABEL: v_saddsat_i16:
73 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74 ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
75 ; GFX9-NEXT: s_setpc_b64 s[30:31]
77 ; GFX10-LABEL: v_saddsat_i16:
79 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
81 ; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
82 ; GFX10-NEXT: s_setpc_b64 s[30:31]
83 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
; Signed saturating add of two i32 arguments through @llvm.sadd.sat.i32.
; The check lines are autogenerated; regenerate them instead of hand-editing.
; What the recorded sequences show, per check prefix:
;  - GFX6 and GFX8 use the same shape and differ only in the add mnemonic
;    (v_add_i32 vs. v_add_u32). They compare rhs against 0, add, compare the
;    sum against lhs, XOR the two conditions and select between the plain sum
;    and (sum ashr 31) xor 0x80000000.
;  - GFX9 uses a single v_add_i32 with the clamp modifier.
;  - GFX10 uses a single v_add_nc_i32 with the clamp modifier.
87 define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
88 ; GFX6-LABEL: v_saddsat_i32:
90 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
92 ; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1
93 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
94 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1
95 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
96 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
97 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
98 ; GFX6-NEXT: s_setpc_b64 s[30:31]
100 ; GFX8-LABEL: v_saddsat_i32:
102 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
104 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1
105 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
106 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1
107 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
108 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
109 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
110 ; GFX8-NEXT: s_setpc_b64 s[30:31]
112 ; GFX9-LABEL: v_saddsat_i32:
114 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115 ; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp
116 ; GFX9-NEXT: s_setpc_b64 s[30:31]
118 ; GFX10-LABEL: v_saddsat_i32:
120 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
122 ; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp
123 ; GFX10-NEXT: s_setpc_b64 s[30:31]
124 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
; Signed saturating add of two <2 x i16> arguments through
; @llvm.sadd.sat.v2i16; the intrinsic result is returned unchanged.
; The check lines are autogenerated; regenerate them instead of hand-editing.
; What the recorded sequences show, per check prefix:
;  - GFX6 handles each element separately in 32 bits (v_bfe_i32 sign-extend,
;    add, min against 0x7fff, max against the s_movk_i32 0x8000 constant),
;    packs both results into v0 and then shifts v0 right by 16 into v1.
;  - GFX8 extracts the high halves with 16-bit right shifts, runs the
;    compare / xor / select overflow sequence once per element, and merges
;    the two results with an SDWA v_or_b32.
;  - GFX9 and GFX10 each use a single v_pk_add_i16 with the clamp modifier.
128 define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
129 ; GFX6-LABEL: v_saddsat_v2i16:
131 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
133 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
134 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
135 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
136 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
137 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff
138 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
139 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
140 ; GFX6-NEXT: s_movk_i32 s5, 0x8000
141 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
142 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
143 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
144 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
145 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
146 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
147 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
148 ; GFX6-NEXT: s_setpc_b64 s[30:31]
150 ; GFX8-LABEL: v_saddsat_v2i16:
152 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
154 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
155 ; GFX8-NEXT: v_add_u16_e32 v4, v3, v2
156 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
157 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3
158 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4
159 ; GFX8-NEXT: s_movk_i32 s6, 0x8000
160 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2
161 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
162 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
163 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1
164 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1
165 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
166 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
167 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0
168 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
169 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
170 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
171 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
172 ; GFX8-NEXT: s_setpc_b64 s[30:31]
174 ; GFX9-LABEL: v_saddsat_v2i16:
176 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
178 ; GFX9-NEXT: s_setpc_b64 s[30:31]
180 ; GFX10-LABEL: v_saddsat_v2i16:
182 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
184 ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp
185 ; GFX10-NEXT: s_setpc_b64 s[30:31]
186 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
187 ret <2 x i16> %result
; Signed saturating add of two <3 x i16> arguments through
; @llvm.sadd.sat.v3i16; the intrinsic result is returned unchanged.
; The check lines are autogenerated; regenerate them instead of hand-editing.
; What the recorded sequences show, per check prefix:
;  - GFX6 sign-extends all six 16-bit inputs with v_bfe_i32 and performs three
;    independent 32-bit add / min / max clamps before repacking the results.
;  - GFX8 runs the compare / xor / select overflow sequence three times, once
;    on the shifted-out high halves of v0 and v2, once on v1 and v3, and once
;    on v0 and v2 themselves; the first and last results are merged into v0
;    with an SDWA v_or_b32.
;  - GFX9 and GFX10 each use two v_pk_add_i16 instructions with the clamp
;    modifier (v0 with v2, and v1 with v3); the two targets emit them in
;    opposite order.
190 define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
191 ; GFX6-LABEL: v_saddsat_v3i16:
193 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
195 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
196 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
197 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
198 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
199 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
200 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
201 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff
202 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
203 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
204 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
205 ; GFX6-NEXT: s_movk_i32 s5, 0x8000
206 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
207 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
208 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
209 ; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
210 ; GFX6-NEXT: v_max_i32_e32 v3, s5, v2
211 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
212 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
213 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
214 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
215 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
216 ; GFX6-NEXT: s_setpc_b64 s[30:31]
218 ; GFX8-LABEL: v_saddsat_v3i16:
220 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
221 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
222 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
223 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4
224 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
225 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
226 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
227 ; GFX8-NEXT: s_movk_i32 s6, 0x8000
228 ; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4
229 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
230 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
231 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3
232 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3
233 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
234 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
235 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1
236 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
237 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
238 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2
239 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2
240 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
241 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
242 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0
243 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
244 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
245 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
246 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
247 ; GFX8-NEXT: s_setpc_b64 s[30:31]
249 ; GFX9-LABEL: v_saddsat_v3i16:
251 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252 ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp
253 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp
254 ; GFX9-NEXT: s_setpc_b64 s[30:31]
256 ; GFX10-LABEL: v_saddsat_v3i16:
258 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
260 ; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp
261 ; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp
262 ; GFX10-NEXT: s_setpc_b64 s[30:31]
263 %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
264 ret <3 x i16> %result
; Signed saturating add of two <4 x i16> arguments through
; @llvm.sadd.sat.v4i16. Unlike the other functions in this file, the function
; is declared to return <2 x float>, and the intrinsic result is bitcast to
; that type before being returned.
; The check lines are autogenerated; regenerate them instead of hand-editing.
; What the recorded sequences show, per check prefix:
;  - GFX6 processes the four elements in two pairs, each pair going through
;    v_bfe_i32 sign-extension, 32-bit add, min against 0x7fff and max against
;    the s_movk_i32 0x8000 constant, and packs each pair into one register
;    (v0, then v1) using the 0xffff mask held in s6.
;  - GFX8 runs the compare / xor / select overflow sequence four times (high
;    and low halves of v0 with v2, then of v1 with v3) and merges each pair
;    with an SDWA v_or_b32.
;  - GFX9 and GFX10 each use two v_pk_add_i16 instructions with the clamp
;    modifier (v0 with v2, and v1 with v3).
267 define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
268 ; GFX6-LABEL: v_saddsat_v4i16:
270 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
272 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
273 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
274 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
275 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
276 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff
277 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
278 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
279 ; GFX6-NEXT: s_movk_i32 s5, 0x8000
280 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
281 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
282 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
283 ; GFX6-NEXT: s_mov_b32 s6, 0xffff
284 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
285 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
286 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
287 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
288 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
289 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0
290 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
291 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
292 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7
293 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
294 ; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
295 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
296 ; GFX6-NEXT: v_max_i32_e32 v2, s5, v2
297 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
298 ; GFX6-NEXT: v_and_b32_e32 v2, s6, v2
299 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
300 ; GFX6-NEXT: s_setpc_b64 s[30:31]
302 ; GFX8-LABEL: v_saddsat_v4i16:
304 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
306 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
307 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4
308 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
309 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
310 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
311 ; GFX8-NEXT: s_movk_i32 s6, 0x8000
312 ; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4
313 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
314 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
315 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2
316 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2
317 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
318 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
319 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0
320 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
321 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
322 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
323 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
324 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
325 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
326 ; GFX8-NEXT: v_add_u16_e32 v5, v4, v2
327 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
328 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4
329 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5
330 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2
331 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
332 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
333 ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3
334 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3
335 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
336 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
337 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1
338 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
339 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
340 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
341 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
342 ; GFX8-NEXT: s_setpc_b64 s[30:31]
344 ; GFX9-LABEL: v_saddsat_v4i16:
346 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347 ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp
348 ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp
349 ; GFX9-NEXT: s_setpc_b64 s[30:31]
351 ; GFX10-LABEL: v_saddsat_v4i16:
353 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
355 ; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp
356 ; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp
357 ; GFX10-NEXT: s_setpc_b64 s[30:31]
358 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
; Reinterpret the 64-bit vector result as <2 x float> to match the declared
; return type. NOTE(review): the reason for returning a float vector here is
; not stated in this file; confirm before changing the signature.
359 %cast = bitcast <4 x i16> %result to <2 x float>
360 ret <2 x float> %cast
; Signed saturating add of two <2 x i32> arguments through
; @llvm.sadd.sat.v2i32; the intrinsic result is returned unchanged.
; The check lines are autogenerated; regenerate them instead of hand-editing.
; What the recorded sequences show, per check prefix:
;  - GFX6 and GFX8 run the compare / xor / select overflow sequence once per
;    element (v0 with v2, then v1 with v3). Both elements share the constant
;    materialised once by "s_brev_b32 s6, 1". The two targets differ only in
;    the add mnemonic (v_add_i32 vs. v_add_u32).
;  - GFX9 uses two v_add_i32 instructions with the clamp modifier.
;  - GFX10 uses two v_add_nc_i32 instructions with the clamp modifier.
363 define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
364 ; GFX6-LABEL: v_saddsat_v2i32:
366 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
368 ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2
369 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
370 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2
371 ; GFX6-NEXT: s_brev_b32 s6, 1
372 ; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0
373 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
374 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
375 ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3
376 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
377 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
378 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2
379 ; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1
380 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
381 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
382 ; GFX6-NEXT: s_setpc_b64 s[30:31]
384 ; GFX8-LABEL: v_saddsat_v2i32:
386 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
388 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2
389 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
390 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2
391 ; GFX8-NEXT: s_brev_b32 s6, 1
392 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0
393 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
394 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
395 ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3
396 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
397 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
398 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2
399 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1
400 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
401 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
402 ; GFX8-NEXT: s_setpc_b64 s[30:31]
404 ; GFX9-LABEL: v_saddsat_v2i32:
406 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407 ; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
408 ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
409 ; GFX9-NEXT: s_setpc_b64 s[30:31]
411 ; GFX10-LABEL: v_saddsat_v2i32:
413 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
415 ; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp
416 ; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
417 ; GFX10-NEXT: s_setpc_b64 s[30:31]
418 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
419 ret <2 x i32> %result
; Signed saturating add of two i64 arguments through @llvm.sadd.sat.i64.
; The check lines are autogenerated; regenerate them instead of hand-editing.
; None of the four recorded sequences contains an add with the clamp modifier.
; Every target uses an add plus add-with-carry pair for the 64-bit sum, a
; 64-bit compare of the sum against lhs, a 64-bit compare of rhs against 0,
; an XOR of the two conditions, and two v_cndmask selects. The selects choose
; between the sum and a saturated value built from (high half ashr 31) for the
; low word and that same value xor 0x80000000 for the high word.
; The GFX10 sequence uses 32-bit condition registers (vcc_lo, s4, s_xor_b32)
; where the other three use 64-bit ones (vcc, s[4:5], s_xor_b64).
422 define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
423 ; GFX6-LABEL: v_saddsat_i64:
425 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
427 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
428 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
429 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
430 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
431 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
432 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
433 ; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
434 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
435 ; GFX6-NEXT: s_setpc_b64 s[30:31]
437 ; GFX8-LABEL: v_saddsat_i64:
439 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
440 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
441 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
442 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
443 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
444 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
445 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
446 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
447 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
448 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
449 ; GFX8-NEXT: s_setpc_b64 s[30:31]
451 ; GFX9-LABEL: v_saddsat_i64:
453 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
454 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
455 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
456 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
457 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
458 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
459 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
460 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
461 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
462 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
463 ; GFX9-NEXT: s_setpc_b64 s[30:31]
465 ; GFX10-LABEL: v_saddsat_i64:
467 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
468 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
469 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
470 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
471 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
472 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
473 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
474 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
475 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
476 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
477 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
478 ; GFX10-NEXT: s_setpc_b64 s[30:31]
479 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
; Declarations of the llvm.sadd.sat intrinsic overloads called by the test
; functions in this file, one per operand type (i8, i16, <2 x i16>,
; <3 x i16>, <4 x i16>, i32, <2 x i32>, i64). Each takes two operands of the
; same type and returns that type. All of them reference attribute group #0,
; whose definition is not visible in this part of the file.
483 declare i8 @llvm.sadd.sat.i8(i8, i8) #0
484 declare i16 @llvm.sadd.sat.i16(i16, i16) #0
485 declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
486 declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
487 declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
488 declare i32 @llvm.sadd.sat.i32(i32, i32) #0
489 declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
490 declare i64 @llvm.sadd.sat.i64(i64, i64) #0