1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
; Baseline case: subtract two <2 x i16> operands that arrive in v0 and v1.
; The gfx900, gfx1010 and gfx1100 runs expect a single v_pk_sub_i16.
; The fiji run expects the low half from v_sub_u16, the high half from an
; SDWA v_sub_u16 selecting WORD_1, and a v_or_b32 that merges both halves.
7 define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) {
8 ; GFX9-LABEL: v_sub_v2i16:
10 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
12 ; GFX9-NEXT: s_setpc_b64 s[30:31]
14 ; GFX8-LABEL: v_sub_v2i16:
16 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1
18 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
19 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
20 ; GFX8-NEXT: s_setpc_b64 s[30:31]
22 ; GFX10-LABEL: v_sub_v2i16:
24 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1
26 ; GFX10-NEXT: s_setpc_b64 s[30:31]
28 ; GFX11-LABEL: v_sub_v2i16:
30 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1
32 ; GFX11-NEXT: s_setpc_b64 s[30:31]
33 %sub = sub <2 x i16> %a, %b
; Subtract whose left operand is an fneg of <2 x half> %a, bitcast to
; <2 x i16>.
; The gfx900, gfx1010 and gfx1100 runs expect the fneg to be folded into the
; neg_lo:[1,0] neg_hi:[1,0] modifiers of v_pk_sub_i16. The fiji run expects an
; explicit v_xor_b32 of v0 with 0x80008000 ahead of the split 16-bit subtract.
; NOTE(review): the folded form puts neg modifiers on an integer packed op;
; confirm against the ISA documentation that this is the intended selection.
37 define <2 x i16> @v_sub_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
38 ; GFX9-LABEL: v_sub_v2i16_fneg_lhs:
40 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
42 ; GFX9-NEXT: s_setpc_b64 s[30:31]
44 ; GFX8-LABEL: v_sub_v2i16_fneg_lhs:
46 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
48 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1
49 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
50 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
51 ; GFX8-NEXT: s_setpc_b64 s[30:31]
53 ; GFX10-LABEL: v_sub_v2i16_fneg_lhs:
55 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
57 ; GFX10-NEXT: s_setpc_b64 s[30:31]
59 ; GFX11-LABEL: v_sub_v2i16_fneg_lhs:
61 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
63 ; GFX11-NEXT: s_setpc_b64 s[30:31]
64 %neg.a = fneg <2 x half> %a
65 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
66 %sub = sub <2 x i16> %cast.neg.a, %b
; Subtract whose right operand is an fneg of <2 x half> %b, bitcast to
; <2 x i16>.
; The gfx900, gfx1010 and gfx1100 runs expect the fneg to be folded into the
; neg_lo:[0,1] neg_hi:[0,1] modifiers of v_pk_sub_i16. The fiji run expects an
; explicit v_xor_b32 of v1 with 0x80008000 ahead of the split 16-bit subtract.
; NOTE(review): the folded form puts neg modifiers on an integer packed op;
; confirm against the ISA documentation that this is the intended selection.
70 define <2 x i16> @v_sub_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
71 ; GFX9-LABEL: v_sub_v2i16_fneg_rhs:
73 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
75 ; GFX9-NEXT: s_setpc_b64 s[30:31]
77 ; GFX8-LABEL: v_sub_v2i16_fneg_rhs:
79 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
81 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1
82 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
83 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
84 ; GFX8-NEXT: s_setpc_b64 s[30:31]
86 ; GFX10-LABEL: v_sub_v2i16_fneg_rhs:
88 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
90 ; GFX10-NEXT: s_setpc_b64 s[30:31]
92 ; GFX11-LABEL: v_sub_v2i16_fneg_rhs:
94 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
96 ; GFX11-NEXT: s_setpc_b64 s[30:31]
97 %neg.b = fneg <2 x half> %b
98 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
99 %sub = sub <2 x i16> %a, %cast.neg.b
; Subtract where both operands are fneg'd <2 x half> values bitcast to
; <2 x i16>.
; The gfx900, gfx1010 and gfx1100 runs expect both negations to be folded into
; neg_lo:[1,1] neg_hi:[1,1] on v_pk_sub_i16. The fiji run expects one
; v_xor_b32 with 0x80008000 per operand ahead of the split 16-bit subtract.
; NOTE(review): the folded form puts neg modifiers on an integer packed op;
; confirm against the ISA documentation that this is the intended selection.
103 define <2 x i16> @v_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
104 ; GFX9-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
106 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
108 ; GFX9-NEXT: s_setpc_b64 s[30:31]
110 ; GFX8-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
112 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
114 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
115 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v1
116 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
117 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
118 ; GFX8-NEXT: s_setpc_b64 s[30:31]
120 ; GFX10-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
122 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
124 ; GFX10-NEXT: s_setpc_b64 s[30:31]
126 ; GFX11-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
128 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
130 ; GFX11-NEXT: s_setpc_b64 s[30:31]
131 %neg.a = fneg <2 x half> %a
132 %neg.b = fneg <2 x half> %b
133 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
134 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
135 %sub = sub <2 x i16> %cast.neg.a, %cast.neg.b
; Subtract the constant splat <-64, -64> from a <2 x i16> in v0.
; The gfx900 run expects the packed constant 0xffc0ffc0 to be materialised in
; v1 before v_pk_sub_i16. The gfx1010 and gfx1100 runs expect the literal
; 0xffc0 used directly with op_sel_hi:[1,0]. The fiji run expects a
; v_subrev_u16 with 0xffc0 for the low half and an SDWA subtract of a VGPR
; holding 0xffffffc0 for the high half.
139 define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
140 ; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_splat:
142 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0
144 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
145 ; GFX9-NEXT: s_setpc_b64 s[30:31]
147 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat:
149 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffffc0
151 ; GFX8-NEXT: v_subrev_u16_e32 v1, 0xffc0, v0
152 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
153 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
154 ; GFX8-NEXT: s_setpc_b64 s[30:31]
156 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat:
158 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0]
160 ; GFX10-NEXT: s_setpc_b64 s[30:31]
162 ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_splat:
164 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0]
166 ; GFX11-NEXT: s_setpc_b64 s[30:31]
167 %sub = sub <2 x i16> %a, <i16 -64, i16 -64>
; Subtract the constant <-64, 4> (negative value in the low element) from a
; <2 x i16> in v0.
; The packed constant appears in the checks as 0x4ffc0. The gfx900 run expects
; it to be moved into v1 first; the gfx1010 and gfx1100 runs expect it as a
; literal operand of v_pk_sub_i16. The fiji run expects v_subrev_u16 with
; 0xffc0 for the low half and an SDWA subtract of a VGPR holding 4 for the
; high half.
171 define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
172 ; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_lo:
174 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
175 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4ffc0
176 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
177 ; GFX9-NEXT: s_setpc_b64 s[30:31]
179 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_lo:
181 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182 ; GFX8-NEXT: v_mov_b32_e32 v2, 4
183 ; GFX8-NEXT: v_subrev_u16_e32 v1, 0xffc0, v0
184 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
185 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
186 ; GFX8-NEXT: s_setpc_b64 s[30:31]
188 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_lo:
190 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x4ffc0
192 ; GFX10-NEXT: s_setpc_b64 s[30:31]
194 ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_lo:
196 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x4ffc0
198 ; GFX11-NEXT: s_setpc_b64 s[30:31]
199 %sub = sub <2 x i16> %a, <i16 -64, i16 4>
; Subtract the constant <4, -64> (negative value in the high element) from a
; <2 x i16> in v0.
; The packed constant appears in the checks as 0xffc00004. The gfx900 run
; expects it to be moved into v1 first; the gfx1010 and gfx1100 runs expect it
; as a literal operand of v_pk_sub_i16. The fiji run expects v_subrev_u16 with
; 4 for the low half and an SDWA subtract of a VGPR holding 0xffffffc0 for the
; high half.
203 define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
204 ; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_hi:
206 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc00004
208 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
209 ; GFX9-NEXT: s_setpc_b64 s[30:31]
211 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_hi:
213 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0
215 ; GFX8-NEXT: v_subrev_u16_e32 v2, 4, v0
216 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
217 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
218 ; GFX8-NEXT: s_setpc_b64 s[30:31]
220 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_hi:
222 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
223 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xffc00004
224 ; GFX10-NEXT: s_setpc_b64 s[30:31]
226 ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_hi:
228 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xffc00004
230 ; GFX11-NEXT: s_setpc_b64 s[30:31]
231 %sub = sub <2 x i16> %a, <i16 4, i16 -64>
; Scalar variant (amdgpu_ps, inreg operand in s0): subtract the splat
; <-64, -64> and bitcast the <2 x i16> result to i32.
; The gfx900, gfx1010 and gfx1100 runs expect the high half to be extracted
; with s_lshr_b32, one s_sub_i32 per half (the low-half subtract uses the full
; packed literal 0xffc0ffc0), and s_pack_ll_b32_b16 to repack. The fiji run
; expects the low half to be masked with 0xffff and the result rebuilt with
; s_lshl_b32 / s_and_b32 / s_or_b32.
235 define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
236 ; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_splat:
238 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
239 ; GFX9-NEXT: s_sub_i32 s0, s0, 0xffc0ffc0
240 ; GFX9-NEXT: s_sub_i32 s1, s1, 0xffc0
241 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
242 ; GFX9-NEXT: ; return to shader part epilog
244 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat:
246 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
247 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
248 ; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0
249 ; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0
250 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
251 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
252 ; GFX8-NEXT: s_or_b32 s0, s1, s0
253 ; GFX8-NEXT: ; return to shader part epilog
255 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat:
257 ; GFX10-NEXT: s_lshr_b32 s1, s0, 16
258 ; GFX10-NEXT: s_sub_i32 s0, s0, 0xffc0ffc0
259 ; GFX10-NEXT: s_sub_i32 s1, s1, 0xffc0
260 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
261 ; GFX10-NEXT: ; return to shader part epilog
263 ; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_splat:
265 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16
266 ; GFX11-NEXT: s_sub_i32 s0, s0, 0xffc0ffc0
267 ; GFX11-NEXT: s_sub_i32 s1, s1, 0xffc0
268 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
269 ; GFX11-NEXT: ; return to shader part epilog
270 %sub = sub <2 x i16> %a, <i16 -64, i16 -64>
271 %cast = bitcast <2 x i16> %sub to i32
; Scalar variant (amdgpu_ps, inreg operand in s0): subtract <-64, 4> and
; bitcast the <2 x i16> result to i32.
; The gfx900, gfx1010 and gfx1100 runs expect s_lshr_b32 to extract the high
; half, a low-half s_sub_i32 with the packed literal 0x4ffc0, a high-half
; s_sub_i32 with 4, and s_pack_ll_b32_b16 to repack. The fiji run expects the
; low half to be masked with 0xffff, the subtracts to use 0xffc0 and 4, and
; the result rebuilt with s_lshl_b32 / s_and_b32 / s_or_b32.
275 define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
276 ; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_lo:
278 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
279 ; GFX9-NEXT: s_sub_i32 s0, s0, 0x4ffc0
280 ; GFX9-NEXT: s_sub_i32 s1, s1, 4
281 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
282 ; GFX9-NEXT: ; return to shader part epilog
284 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo:
286 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
287 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
288 ; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0
289 ; GFX8-NEXT: s_sub_i32 s1, s1, 4
290 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
291 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
292 ; GFX8-NEXT: s_or_b32 s0, s1, s0
293 ; GFX8-NEXT: ; return to shader part epilog
295 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo:
297 ; GFX10-NEXT: s_lshr_b32 s1, s0, 16
298 ; GFX10-NEXT: s_sub_i32 s0, s0, 0x4ffc0
299 ; GFX10-NEXT: s_sub_i32 s1, s1, 4
300 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
301 ; GFX10-NEXT: ; return to shader part epilog
303 ; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_lo:
305 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16
306 ; GFX11-NEXT: s_sub_i32 s0, s0, 0x4ffc0
307 ; GFX11-NEXT: s_sub_i32 s1, s1, 4
308 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
309 ; GFX11-NEXT: ; return to shader part epilog
310 %sub = sub <2 x i16> %a, <i16 -64, i16 4>
311 %cast = bitcast <2 x i16> %sub to i32
; Scalar variant (amdgpu_ps, inreg operand in s0): subtract <4, -64> and
; bitcast the <2 x i16> result to i32.
; The gfx900, gfx1010 and gfx1100 runs expect s_lshr_b32 to extract the high
; half, a low-half s_sub_i32 with the packed literal 0xffc00004, a high-half
; s_sub_i32 with 0xffc0, and s_pack_ll_b32_b16 to repack. The fiji run expects
; the low half to be masked with 0xffff, the subtracts to use 4 and 0xffc0,
; and the result rebuilt with s_lshl_b32 / s_and_b32 / s_or_b32.
315 define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
316 ; GFX9-LABEL: s_sub_v2i16_neg_inline_imm_hi:
318 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
319 ; GFX9-NEXT: s_sub_i32 s0, s0, 0xffc00004
320 ; GFX9-NEXT: s_sub_i32 s1, s1, 0xffc0
321 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
322 ; GFX9-NEXT: ; return to shader part epilog
324 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi:
326 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
327 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
328 ; GFX8-NEXT: s_sub_i32 s0, s0, 4
329 ; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0
330 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
331 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
332 ; GFX8-NEXT: s_or_b32 s0, s1, s0
333 ; GFX8-NEXT: ; return to shader part epilog
335 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi:
337 ; GFX10-NEXT: s_lshr_b32 s1, s0, 16
338 ; GFX10-NEXT: s_sub_i32 s0, s0, 0xffc00004
339 ; GFX10-NEXT: s_sub_i32 s1, s1, 0xffc0
340 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
341 ; GFX10-NEXT: ; return to shader part epilog
343 ; GFX11-LABEL: s_sub_v2i16_neg_inline_imm_hi:
345 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16
346 ; GFX11-NEXT: s_sub_i32 s0, s0, 0xffc00004
347 ; GFX11-NEXT: s_sub_i32 s1, s1, 0xffc0
348 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
349 ; GFX11-NEXT: ; return to shader part epilog
350 %sub = sub <2 x i16> %a, <i16 4, i16 -64>
351 %cast = bitcast <2 x i16> %sub to i32
; Scalar baseline (amdgpu_ps, inreg operands in s0 and s1): subtract two
; <2 x i16> values and bitcast the result to i32.
; The gfx900, gfx1010 and gfx1100 runs expect both high halves to be extracted
; with s_lshr_b32, one s_sub_i32 per half, and s_pack_ll_b32_b16 to repack.
; The fiji run expects both low halves to be masked with 0xffff as well, and
; the result rebuilt with s_lshl_b32 / s_and_b32 / s_or_b32.
355 define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
356 ; GFX9-LABEL: s_sub_v2i16:
358 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
359 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
360 ; GFX9-NEXT: s_sub_i32 s0, s0, s1
361 ; GFX9-NEXT: s_sub_i32 s1, s2, s3
362 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
363 ; GFX9-NEXT: ; return to shader part epilog
365 ; GFX8-LABEL: s_sub_v2i16:
367 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
368 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
369 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
370 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
371 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
372 ; GFX8-NEXT: s_sub_i32 s1, s2, s3
373 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
374 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
375 ; GFX8-NEXT: s_or_b32 s0, s1, s0
376 ; GFX8-NEXT: ; return to shader part epilog
378 ; GFX10-LABEL: s_sub_v2i16:
380 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
381 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
382 ; GFX10-NEXT: s_sub_i32 s0, s0, s1
383 ; GFX10-NEXT: s_sub_i32 s1, s2, s3
384 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
385 ; GFX10-NEXT: ; return to shader part epilog
387 ; GFX11-LABEL: s_sub_v2i16:
389 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
390 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
391 ; GFX11-NEXT: s_sub_i32 s0, s0, s1
392 ; GFX11-NEXT: s_sub_i32 s1, s2, s3
393 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
394 ; GFX11-NEXT: ; return to shader part epilog
395 %sub = sub <2 x i16> %a, %b
396 %cast = bitcast <2 x i16> %sub to i32
; Scalar variant (amdgpu_ps, inreg operands): the left operand is an fneg of
; <2 x half> %a bitcast to <2 x i16>; the <2 x i16> difference is bitcast to
; i32.
; Every run expects the fneg as an explicit s_xor_b32 of s0 with 0x80008000,
; followed by the same split / subtract / repack sequence as the plain scalar
; subtract: s_pack_ll_b32_b16 on gfx900, gfx1010 and gfx1100, and
; mask / shift / or on fiji.
400 define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) {
401 ; GFX9-LABEL: s_sub_v2i16_fneg_lhs:
403 ; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000
404 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
405 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
406 ; GFX9-NEXT: s_sub_i32 s0, s0, s1
407 ; GFX9-NEXT: s_sub_i32 s1, s2, s3
408 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
409 ; GFX9-NEXT: ; return to shader part epilog
411 ; GFX8-LABEL: s_sub_v2i16_fneg_lhs:
413 ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
414 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
415 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
416 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
417 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
418 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
419 ; GFX8-NEXT: s_sub_i32 s1, s2, s3
420 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
421 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
422 ; GFX8-NEXT: s_or_b32 s0, s1, s0
423 ; GFX8-NEXT: ; return to shader part epilog
425 ; GFX10-LABEL: s_sub_v2i16_fneg_lhs:
427 ; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000
428 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
429 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
430 ; GFX10-NEXT: s_sub_i32 s0, s0, s1
431 ; GFX10-NEXT: s_sub_i32 s1, s2, s3
432 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
433 ; GFX10-NEXT: ; return to shader part epilog
435 ; GFX11-LABEL: s_sub_v2i16_fneg_lhs:
437 ; GFX11-NEXT: s_xor_b32 s0, s0, 0x80008000
438 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
439 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
440 ; GFX11-NEXT: s_sub_i32 s0, s0, s1
441 ; GFX11-NEXT: s_sub_i32 s1, s2, s3
442 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
443 ; GFX11-NEXT: ; return to shader part epilog
444 %neg.a = fneg <2 x half> %a
445 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
446 %sub = sub <2 x i16> %cast.neg.a, %b
447 %cast = bitcast <2 x i16> %sub to i32
; Scalar variant (amdgpu_ps, inreg operands): the right operand is an fneg of
; <2 x half> %b bitcast to <2 x i16>; the <2 x i16> difference is bitcast to
; i32.
; Every run expects the fneg as an explicit s_xor_b32 of s1 with 0x80008000,
; followed by the same split / subtract / repack sequence as the plain scalar
; subtract: s_pack_ll_b32_b16 on gfx900, gfx1010 and gfx1100, and
; mask / shift / or on fiji.
451 define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) {
452 ; GFX9-LABEL: s_sub_v2i16_fneg_rhs:
454 ; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000
455 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
456 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
457 ; GFX9-NEXT: s_sub_i32 s0, s0, s1
458 ; GFX9-NEXT: s_sub_i32 s1, s2, s3
459 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
460 ; GFX9-NEXT: ; return to shader part epilog
462 ; GFX8-LABEL: s_sub_v2i16_fneg_rhs:
464 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
465 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
466 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
467 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
468 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
469 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
470 ; GFX8-NEXT: s_sub_i32 s1, s2, s3
471 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
472 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
473 ; GFX8-NEXT: s_or_b32 s0, s1, s0
474 ; GFX8-NEXT: ; return to shader part epilog
476 ; GFX10-LABEL: s_sub_v2i16_fneg_rhs:
478 ; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000
479 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
480 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
481 ; GFX10-NEXT: s_sub_i32 s0, s0, s1
482 ; GFX10-NEXT: s_sub_i32 s1, s2, s3
483 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
484 ; GFX10-NEXT: ; return to shader part epilog
486 ; GFX11-LABEL: s_sub_v2i16_fneg_rhs:
488 ; GFX11-NEXT: s_xor_b32 s1, s1, 0x80008000
489 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
490 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
491 ; GFX11-NEXT: s_sub_i32 s0, s0, s1
492 ; GFX11-NEXT: s_sub_i32 s1, s2, s3
493 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
494 ; GFX11-NEXT: ; return to shader part epilog
495 %neg.b = fneg <2 x half> %b
496 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
497 %sub = sub <2 x i16> %a, %cast.neg.b
498 %cast = bitcast <2 x i16> %sub to i32
; Scalar variant (amdgpu_ps, inreg operands): both operands are fneg'd
; <2 x half> values bitcast to <2 x i16>; the <2 x i16> difference is bitcast
; to i32.
; Every run expects one s_xor_b32 with 0x80008000 per operand (s0 and s1),
; followed by the same split / subtract / repack sequence as the plain scalar
; subtract: s_pack_ll_b32_b16 on gfx900, gfx1010 and gfx1100, and
; mask / shift / or on fiji.
502 define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
503 ; GFX9-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
505 ; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000
506 ; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000
507 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
508 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
509 ; GFX9-NEXT: s_sub_i32 s0, s0, s1
510 ; GFX9-NEXT: s_sub_i32 s1, s2, s3
511 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
512 ; GFX9-NEXT: ; return to shader part epilog
514 ; GFX8-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
516 ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
517 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
518 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
519 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
520 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
521 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
522 ; GFX8-NEXT: s_sub_i32 s0, s0, s1
523 ; GFX8-NEXT: s_sub_i32 s1, s2, s3
524 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
525 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
526 ; GFX8-NEXT: s_or_b32 s0, s1, s0
527 ; GFX8-NEXT: ; return to shader part epilog
529 ; GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
531 ; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000
532 ; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000
533 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
534 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
535 ; GFX10-NEXT: s_sub_i32 s0, s0, s1
536 ; GFX10-NEXT: s_sub_i32 s1, s2, s3
537 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
538 ; GFX10-NEXT: ; return to shader part epilog
540 ; GFX11-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
542 ; GFX11-NEXT: s_xor_b32 s0, s0, 0x80008000
543 ; GFX11-NEXT: s_xor_b32 s1, s1, 0x80008000
544 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16
545 ; GFX11-NEXT: s_lshr_b32 s3, s1, 16
546 ; GFX11-NEXT: s_sub_i32 s0, s0, s1
547 ; GFX11-NEXT: s_sub_i32 s1, s2, s3
548 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
549 ; GFX11-NEXT: ; return to shader part epilog
550 %neg.a = fneg <2 x half> %a
551 %neg.b = fneg <2 x half> %b
552 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
553 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
554 %sub = sub <2 x i16> %cast.neg.a, %cast.neg.b
555 %cast = bitcast <2 x i16> %sub to i32