1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
; Baseline case: integer add of two <2 x i16> arguments with the default
; calling convention.
; Expected code per check prefix:
;  - GFX7 checks expect two independent 32-bit v_add_i32 (v0+v2, v1+v3).
;  - GFX8 checks expect a 16-bit add for the low half, an SDWA 16-bit add
;    writing WORD_1 for the high half, and a v_or_b32 to merge them.
;  - GFX9 and GFX10 checks expect a single packed v_pk_add_u16.
; The GFX10 prefix is used by both the gfx1010 and the gfx1100 RUN lines.
8 define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
9 ; GFX7-LABEL: v_add_v2i16:
11 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
13 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
14 ; GFX7-NEXT: s_setpc_b64 s[30:31]
16 ; GFX9-LABEL: v_add_v2i16:
18 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
20 ; GFX9-NEXT: s_setpc_b64 s[30:31]
22 ; GFX8-LABEL: v_add_v2i16:
24 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
26 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
27 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
28 ; GFX8-NEXT: s_setpc_b64 s[30:31]
30 ; GFX10-LABEL: v_add_v2i16:
32 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
34 ; GFX10-NEXT: s_setpc_b64 s[30:31]
35 %add = add <2 x i16> %a, %b
; Left operand is a <2 x half> that is fneg'ed, bitcast to <2 x i16>, and then
; fed to an integer add with %b.
; Expected code per check prefix:
;  - GFX7 checks pack v0/v1 into one register, xor it with 0x80008000, split
;    the high half back out with a shift, then do two v_add_i32.
;  - GFX8 checks xor v0 with 0x80008000 and then use the 16-bit add, SDWA add
;    and v_or_b32 sequence.
;  - GFX9 and GFX10 checks expect no separate xor; the v_pk_add_u16 carries
;    neg_lo:[1,0] neg_hi:[1,0] instead.
; NOTE(review): whether neg_lo/neg_hi on an integer packed add has the same
; effect as the sign-bit xor is not shown here; confirm against the ISA docs.
39 define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
40 ; GFX7-LABEL: v_add_v2i16_fneg_lhs:
42 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
44 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
45 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
46 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
47 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
48 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
49 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
50 ; GFX7-NEXT: s_setpc_b64 s[30:31]
52 ; GFX9-LABEL: v_add_v2i16_fneg_lhs:
54 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
56 ; GFX9-NEXT: s_setpc_b64 s[30:31]
58 ; GFX8-LABEL: v_add_v2i16_fneg_lhs:
60 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
62 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
63 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
64 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
65 ; GFX8-NEXT: s_setpc_b64 s[30:31]
67 ; GFX10-LABEL: v_add_v2i16_fneg_lhs:
69 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
71 ; GFX10-NEXT: s_setpc_b64 s[30:31]
72 %neg.a = fneg <2 x half> %a
73 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
74 %add = add <2 x i16> %cast.neg.a, %b
; Right operand is a <2 x half> that is fneg'ed, bitcast to <2 x i16>, and
; then added to the integer vector %a.
; Expected code per check prefix:
;  - GFX7 checks pack v2/v3 into one register, xor it with 0x80008000, split
;    the high half back out with a shift, then do two v_add_i32.
;  - GFX8 checks xor v1 with 0x80008000 and then use the 16-bit add, SDWA add
;    and v_or_b32 sequence.
;  - GFX9 and GFX10 checks expect no separate xor; the v_pk_add_u16 carries
;    neg_lo:[0,1] neg_hi:[0,1] instead.
; NOTE(review): whether neg_lo/neg_hi on an integer packed add has the same
; effect as the sign-bit xor is not shown here; confirm against the ISA docs.
78 define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
79 ; GFX7-LABEL: v_add_v2i16_fneg_rhs:
81 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
83 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
84 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
85 ; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
86 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
87 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
88 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
89 ; GFX7-NEXT: s_setpc_b64 s[30:31]
91 ; GFX9-LABEL: v_add_v2i16_fneg_rhs:
93 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
95 ; GFX9-NEXT: s_setpc_b64 s[30:31]
97 ; GFX8-LABEL: v_add_v2i16_fneg_rhs:
99 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
101 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
102 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
103 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
104 ; GFX8-NEXT: s_setpc_b64 s[30:31]
106 ; GFX10-LABEL: v_add_v2i16_fneg_rhs:
108 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
110 ; GFX10-NEXT: s_setpc_b64 s[30:31]
111 %neg.b = fneg <2 x half> %b
112 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
113 %add = add <2 x i16> %a, %cast.neg.b
; Both operands are <2 x half> values that are fneg'ed, bitcast to <2 x i16>,
; and then added as integers.
; Expected code per check prefix:
;  - GFX7 checks pack each operand into one register, xor both with
;    0x80008000, split the high halves back out, then do two v_add_i32.
;  - GFX8 checks xor v0 and v1 with 0x80008000 and then use the 16-bit add,
;    SDWA add and v_or_b32 sequence.
;  - GFX9 and GFX10 checks expect no separate xor; the v_pk_add_u16 carries
;    neg_lo:[1,1] neg_hi:[1,1] instead.
; NOTE(review): whether neg_lo/neg_hi on an integer packed add has the same
; effect as the sign-bit xor is not shown here; confirm against the ISA docs.
117 define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
118 ; GFX7-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
120 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
122 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
123 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
124 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
125 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
126 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
127 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
128 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
129 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
130 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
131 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
132 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3
133 ; GFX7-NEXT: s_setpc_b64 s[30:31]
135 ; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
137 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
139 ; GFX9-NEXT: s_setpc_b64 s[30:31]
141 ; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
143 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144 ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
145 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
146 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
147 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
148 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
149 ; GFX8-NEXT: s_setpc_b64 s[30:31]
151 ; GFX10-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
153 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
155 ; GFX10-NEXT: s_setpc_b64 s[30:31]
156 %neg.a = fneg <2 x half> %a
157 %neg.b = fneg <2 x half> %b
158 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
159 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
160 %add = add <2 x i16> %cast.neg.a, %cast.neg.b
; Adds the constant splat <i16 -64, i16 -64> to a <2 x i16> argument.
; Expected handling of the constant per check prefix:
;  - GFX7 checks load 0xffc0 into s4 with s_movk_i32 and add it to both
;    32-bit halves.
;  - GFX8 checks use the literal 0xffc0 for the low-half add and a v_mov_b32
;    of 0xffffffc0 as the SDWA operand for the high half.
;  - GFX9 checks materialize 0xffc0ffc0 in v1, then v_pk_add_u16.
;  - GFX10 checks use the 16-bit literal 0xffc0 directly in v_pk_add_u16 with
;    op_sel_hi:[0,1].
164 define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
165 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat:
167 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168 ; GFX7-NEXT: s_movk_i32 s4, 0xffc0
169 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0
170 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s4, v1
171 ; GFX7-NEXT: s_setpc_b64 s[30:31]
173 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
175 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0
177 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
178 ; GFX9-NEXT: s_setpc_b64 s[30:31]
180 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
182 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffffc0
184 ; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0
185 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
186 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
187 ; GFX8-NEXT: s_setpc_b64 s[30:31]
189 ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_splat:
191 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192 ; GFX10-NEXT: v_pk_add_u16 v0, 0xffc0, v0 op_sel_hi:[0,1]
193 ; GFX10-NEXT: s_setpc_b64 s[30:31]
194 %add = add <2 x i16> %a, <i16 -64, i16 -64>
; Adds the constant <i16 -64, i16 4> to a <2 x i16> argument, i.e. -64 in
; element 0 and 4 in element 1.
; Expected handling of the constant per check prefix:
;  - GFX7 checks add 0xffffffc0 to v0 and 4 to v1 as two 32-bit adds.
;  - GFX8 checks use the literal 0xffc0 for the low-half add and a v_mov_b32
;    of 4 as the SDWA operand for the high half.
;  - GFX9 checks materialize the packed value 0x4ffc0 in v1, then
;    v_pk_add_u16.
;  - GFX10 checks use 0x4ffc0 directly as a literal operand of v_pk_add_u16.
198 define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
199 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo:
201 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
203 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1
204 ; GFX7-NEXT: s_setpc_b64 s[30:31]
206 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
208 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4ffc0
210 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
211 ; GFX9-NEXT: s_setpc_b64 s[30:31]
213 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo:
215 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216 ; GFX8-NEXT: v_mov_b32_e32 v2, 4
217 ; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0
218 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
219 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
220 ; GFX8-NEXT: s_setpc_b64 s[30:31]
222 ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_lo:
224 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225 ; GFX10-NEXT: v_pk_add_u16 v0, 0x4ffc0, v0
226 ; GFX10-NEXT: s_setpc_b64 s[30:31]
227 %add = add <2 x i16> %a, <i16 -64, i16 4>
; Adds the constant <i16 4, i16 -64> to a <2 x i16> argument, i.e. 4 in
; element 0 and -64 in element 1.
; Expected handling of the constant per check prefix:
;  - GFX7 checks add 4 to v0 and 0xffffffc0 to v1 as two 32-bit adds.
;  - GFX8 checks use the constant 4 for the low-half add and a v_mov_b32 of
;    0xffffffc0 as the SDWA operand for the high half.
;  - GFX9 checks materialize the packed value 0xffc00004 in v1, then
;    v_pk_add_u16.
;  - GFX10 checks use 0xffc00004 directly as a literal operand of
;    v_pk_add_u16.
231 define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
232 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi:
234 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0
236 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1
237 ; GFX7-NEXT: s_setpc_b64 s[30:31]
239 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
241 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc00004
243 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
244 ; GFX9-NEXT: s_setpc_b64 s[30:31]
246 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi:
248 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0
250 ; GFX8-NEXT: v_add_u16_e32 v2, 4, v0
251 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
252 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
253 ; GFX8-NEXT: s_setpc_b64 s[30:31]
255 ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_hi:
257 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258 ; GFX10-NEXT: v_pk_add_u16 v0, 0xffc00004, v0
259 ; GFX10-NEXT: s_setpc_b64 s[30:31]
260 %add = add <2 x i16> %a, <i16 4, i16 -64>
; amdgpu_ps variant with an inreg <2 x i16> argument: adds the splat
; <i16 -64, i16 -64> and returns the result bitcast to i32. All check
; sequences use scalar (s_*) instructions only.
; Expected code per check prefix:
;  - GFX7 checks emit s_sub_i32 by 64 on s0 and s1, mask each to 16 bits and
;    merge them with shift and or.
;  - GFX8 checks split s0 with s_lshr_b32 / s_and_b32, add 0xffc0 to each
;    half, then repack with shift, mask and or.
;  - GFX9 and GFX10 checks extract the high half with s_lshr_b32, add
;    0xffc0ffc0 to s0 and 0xffc0 to the high half, then repack with
;    s_pack_ll_b32_b16.
264 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
265 ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_splat:
267 ; GFX7-NEXT: s_sub_i32 s1, s1, 64
268 ; GFX7-NEXT: s_sub_i32 s0, s0, 64
269 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
270 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
271 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
272 ; GFX7-NEXT: s_or_b32 s0, s0, s1
273 ; GFX7-NEXT: ; return to shader part epilog
275 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat:
277 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
278 ; GFX9-NEXT: s_add_i32 s0, s0, 0xffc0ffc0
279 ; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0
280 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
281 ; GFX9-NEXT: ; return to shader part epilog
283 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
285 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
286 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
287 ; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
288 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0
289 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
290 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
291 ; GFX8-NEXT: s_or_b32 s0, s1, s0
292 ; GFX8-NEXT: ; return to shader part epilog
294 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat:
296 ; GFX10-NEXT: s_lshr_b32 s1, s0, 16
297 ; GFX10-NEXT: s_add_i32 s0, s0, 0xffc0ffc0
298 ; GFX10-NEXT: s_add_i32 s1, s1, 0xffc0
299 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
300 ; GFX10-NEXT: ; return to shader part epilog
301 %add = add <2 x i16> %a, <i16 -64, i16 -64>
302 %cast = bitcast <2 x i16> %add to i32
; amdgpu_ps variant with an inreg <2 x i16> argument: adds <i16 -64, i16 4>
; (-64 in element 0, 4 in element 1) and returns the result bitcast to i32.
; Expected code per check prefix:
;  - GFX7 checks emit s_add_i32 of 4 on s1 and s_sub_i32 of 64 on s0, then
;    mask each to 16 bits and merge them with shift and or.
;  - GFX8 checks split s0 with s_lshr_b32 / s_and_b32, add 0xffc0 to the low
;    half and 4 to the high half, then repack with shift, mask and or.
;  - GFX9 and GFX10 checks extract the high half with s_lshr_b32, add 0x4ffc0
;    to s0 and 4 to the high half, then repack with s_pack_ll_b32_b16.
306 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
307 ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_lo:
309 ; GFX7-NEXT: s_add_i32 s1, s1, 4
310 ; GFX7-NEXT: s_sub_i32 s0, s0, 64
311 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
312 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
313 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
314 ; GFX7-NEXT: s_or_b32 s0, s0, s1
315 ; GFX7-NEXT: ; return to shader part epilog
317 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo:
319 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
320 ; GFX9-NEXT: s_add_i32 s0, s0, 0x4ffc0
321 ; GFX9-NEXT: s_add_i32 s1, s1, 4
322 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
323 ; GFX9-NEXT: ; return to shader part epilog
325 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
327 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
328 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
329 ; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
330 ; GFX8-NEXT: s_add_i32 s1, s1, 4
331 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
332 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
333 ; GFX8-NEXT: s_or_b32 s0, s1, s0
334 ; GFX8-NEXT: ; return to shader part epilog
336 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo:
338 ; GFX10-NEXT: s_lshr_b32 s1, s0, 16
339 ; GFX10-NEXT: s_add_i32 s0, s0, 0x4ffc0
340 ; GFX10-NEXT: s_add_i32 s1, s1, 4
341 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
342 ; GFX10-NEXT: ; return to shader part epilog
343 %add = add <2 x i16> %a, <i16 -64, i16 4>
344 %cast = bitcast <2 x i16> %add to i32
; amdgpu_ps variant with an inreg <2 x i16> argument: adds <i16 4, i16 -64>
; (4 in element 0, -64 in element 1) and returns the result bitcast to i32.
; Expected code per check prefix:
;  - GFX7 checks emit s_sub_i32 of 64 on s1 and s_add_i32 of 4 on s0, then
;    mask each to 16 bits and merge them with shift and or.
;  - GFX8 checks split s0 with s_lshr_b32 / s_and_b32, add 4 to the low half
;    and 0xffc0 to the high half, then repack with shift, mask and or.
;  - GFX9 and GFX10 checks extract the high half with s_lshr_b32, add
;    0xffc00004 to s0 and 0xffc0 to the high half, then repack with
;    s_pack_ll_b32_b16.
348 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
349 ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_hi:
351 ; GFX7-NEXT: s_sub_i32 s1, s1, 64
352 ; GFX7-NEXT: s_add_i32 s0, s0, 4
353 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
354 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
355 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
356 ; GFX7-NEXT: s_or_b32 s0, s0, s1
357 ; GFX7-NEXT: ; return to shader part epilog
359 ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi:
361 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
362 ; GFX9-NEXT: s_add_i32 s0, s0, 0xffc00004
363 ; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0
364 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
365 ; GFX9-NEXT: ; return to shader part epilog
367 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
369 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
370 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
371 ; GFX8-NEXT: s_add_i32 s0, s0, 4
372 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0
373 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
374 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
375 ; GFX8-NEXT: s_or_b32 s0, s1, s0
376 ; GFX8-NEXT: ; return to shader part epilog
378 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi:
380 ; GFX10-NEXT: s_lshr_b32 s1, s0, 16
381 ; GFX10-NEXT: s_add_i32 s0, s0, 0xffc00004
382 ; GFX10-NEXT: s_add_i32 s1, s1, 0xffc0
383 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
384 ; GFX10-NEXT: ; return to shader part epilog
385 %add = add <2 x i16> %a, <i16 4, i16 -64>
386 %cast = bitcast <2 x i16> %add to i32
; amdgpu_ps baseline: adds two inreg <2 x i16> arguments and returns the
; result bitcast to i32. No check sequence uses a packed add.
; Expected code per check prefix:
;  - GFX7 checks add s1+s3 and s0+s2, mask each sum to 16 bits and merge them
;    with shift and or.
;  - GFX8 checks split s0 and s1 into halves with s_lshr_b32 / s_and_b32, do
;    two s_add_i32, then repack with shift, mask and or.
;  - GFX9 and GFX10 checks extract both high halves with s_lshr_b32, do two
;    s_add_i32, then repack with s_pack_ll_b32_b16.
390 define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
391 ; GFX7-LABEL: s_add_v2i16:
393 ; GFX7-NEXT: s_add_i32 s1, s1, s3
394 ; GFX7-NEXT: s_add_i32 s0, s0, s2
395 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
396 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
397 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
398 ; GFX7-NEXT: s_or_b32 s0, s0, s1
399 ; GFX7-NEXT: ; return to shader part epilog
401 ; GFX9-LABEL: s_add_v2i16:
403 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
404 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
405 ; GFX9-NEXT: s_add_i32 s0, s0, s1
406 ; GFX9-NEXT: s_add_i32 s2, s2, s3
407 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
408 ; GFX9-NEXT: ; return to shader part epilog
410 ; GFX8-LABEL: s_add_v2i16:
412 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
413 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
414 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
415 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
416 ; GFX8-NEXT: s_add_i32 s0, s0, s1
417 ; GFX8-NEXT: s_add_i32 s2, s2, s3
418 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16
419 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
420 ; GFX8-NEXT: s_or_b32 s0, s1, s0
421 ; GFX8-NEXT: ; return to shader part epilog
423 ; GFX10-LABEL: s_add_v2i16:
425 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
426 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
427 ; GFX10-NEXT: s_add_i32 s0, s0, s1
428 ; GFX10-NEXT: s_add_i32 s2, s2, s3
429 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
430 ; GFX10-NEXT: ; return to shader part epilog
431 %add = add <2 x i16> %a, %b
432 %cast = bitcast <2 x i16> %add to i32
; amdgpu_ps variant: the inreg <2 x half> %a is fneg'ed, bitcast to <2 x i16>,
; added to the inreg <2 x i16> %b, and the sum is returned bitcast to i32.
; Every check prefix expects the fneg as an explicit s_xor_b32 with
; 0x80008000; none expects it folded into another instruction.
;  - GFX7 checks first pack s0/s1 into one register, xor it, split the high
;    half back out, then add, mask and repack.
;  - GFX8 checks xor s0, split both operands into halves, add, and repack
;    with shift, mask and or.
;  - GFX9 and GFX10 checks xor s0, extract the high halves, add, and repack
;    with s_pack_ll_b32_b16. The GFX10 sequence lists the two s_lshr_b32 in
;    the opposite order from the GFX9 sequence.
436 define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) {
437 ; GFX7-LABEL: s_add_v2i16_fneg_lhs:
439 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
440 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
441 ; GFX7-NEXT: s_or_b32 s0, s1, s0
442 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000
443 ; GFX7-NEXT: s_lshr_b32 s1, s0, 16
444 ; GFX7-NEXT: s_add_i32 s1, s1, s3
445 ; GFX7-NEXT: s_add_i32 s0, s0, s2
446 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
447 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
448 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
449 ; GFX7-NEXT: s_or_b32 s0, s0, s1
450 ; GFX7-NEXT: ; return to shader part epilog
452 ; GFX9-LABEL: s_add_v2i16_fneg_lhs:
454 ; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000
455 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
456 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
457 ; GFX9-NEXT: s_add_i32 s0, s0, s1
458 ; GFX9-NEXT: s_add_i32 s2, s2, s3
459 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
460 ; GFX9-NEXT: ; return to shader part epilog
462 ; GFX8-LABEL: s_add_v2i16_fneg_lhs:
464 ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
465 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
466 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
467 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
468 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
469 ; GFX8-NEXT: s_add_i32 s0, s0, s1
470 ; GFX8-NEXT: s_add_i32 s2, s2, s3
471 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16
472 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
473 ; GFX8-NEXT: s_or_b32 s0, s1, s0
474 ; GFX8-NEXT: ; return to shader part epilog
476 ; GFX10-LABEL: s_add_v2i16_fneg_lhs:
478 ; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000
479 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
480 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
481 ; GFX10-NEXT: s_add_i32 s0, s0, s1
482 ; GFX10-NEXT: s_add_i32 s2, s2, s3
483 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
484 ; GFX10-NEXT: ; return to shader part epilog
485 %neg.a = fneg <2 x half> %a
486 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
487 %add = add <2 x i16> %cast.neg.a, %b
488 %cast = bitcast <2 x i16> %add to i32
; amdgpu_ps variant: the inreg <2 x half> %b is fneg'ed, bitcast to <2 x i16>,
; added to the inreg <2 x i16> %a, and the sum is returned bitcast to i32.
; Every check prefix expects the fneg as an explicit s_xor_b32 with
; 0x80008000; none expects it folded into another instruction.
;  - GFX7 checks first pack s2/s3 into one register, xor it, split the high
;    half back out, then add, mask and repack.
;  - GFX8 checks xor s1, split both operands into halves, add, and repack
;    with shift, mask and or.
;  - GFX9 and GFX10 checks xor s1, extract the high halves, add, and repack
;    with s_pack_ll_b32_b16.
492 define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) {
493 ; GFX7-LABEL: s_add_v2i16_fneg_rhs:
495 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16
496 ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
497 ; GFX7-NEXT: s_or_b32 s2, s3, s2
498 ; GFX7-NEXT: s_xor_b32 s2, s2, 0x80008000
499 ; GFX7-NEXT: s_lshr_b32 s3, s2, 16
500 ; GFX7-NEXT: s_add_i32 s1, s1, s3
501 ; GFX7-NEXT: s_add_i32 s0, s0, s2
502 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
503 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
504 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
505 ; GFX7-NEXT: s_or_b32 s0, s0, s1
506 ; GFX7-NEXT: ; return to shader part epilog
508 ; GFX9-LABEL: s_add_v2i16_fneg_rhs:
510 ; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000
511 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
512 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
513 ; GFX9-NEXT: s_add_i32 s0, s0, s1
514 ; GFX9-NEXT: s_add_i32 s2, s2, s3
515 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
516 ; GFX9-NEXT: ; return to shader part epilog
518 ; GFX8-LABEL: s_add_v2i16_fneg_rhs:
520 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
521 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
522 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
523 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
524 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
525 ; GFX8-NEXT: s_add_i32 s0, s0, s1
526 ; GFX8-NEXT: s_add_i32 s2, s2, s3
527 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16
528 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
529 ; GFX8-NEXT: s_or_b32 s0, s1, s0
530 ; GFX8-NEXT: ; return to shader part epilog
532 ; GFX10-LABEL: s_add_v2i16_fneg_rhs:
534 ; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000
535 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
536 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
537 ; GFX10-NEXT: s_add_i32 s0, s0, s1
538 ; GFX10-NEXT: s_add_i32 s2, s2, s3
539 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
540 ; GFX10-NEXT: ; return to shader part epilog
541 %neg.b = fneg <2 x half> %b
542 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
543 %add = add <2 x i16> %a, %cast.neg.b
544 %cast = bitcast <2 x i16> %add to i32
; amdgpu_ps variant: both inreg <2 x half> arguments are fneg'ed, bitcast to
; <2 x i16>, added as integers, and the sum is bitcast to i32.
; Every check prefix expects two explicit s_xor_b32 with 0x80008000 (one per
; operand); none expects the fneg folded into another instruction.
;  - GFX7 checks first pack s0/s1 and s2/s3 into one register each, xor both,
;    split the high halves back out, then add, mask and repack.
;  - GFX8 checks xor s0 and s1, split both into halves, add, and repack with
;    shift, mask and or.
;  - GFX9 and GFX10 checks xor s0 and s1, extract the high halves, add, and
;    repack with s_pack_ll_b32_b16.
548 define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
549 ; GFX7-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
551 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
552 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
553 ; GFX7-NEXT: s_or_b32 s0, s1, s0
554 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16
555 ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
556 ; GFX7-NEXT: s_or_b32 s1, s1, s2
557 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000
558 ; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000
559 ; GFX7-NEXT: s_lshr_b32 s2, s0, 16
560 ; GFX7-NEXT: s_lshr_b32 s3, s1, 16
561 ; GFX7-NEXT: s_add_i32 s2, s2, s3
562 ; GFX7-NEXT: s_add_i32 s0, s0, s1
563 ; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
564 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
565 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
566 ; GFX7-NEXT: s_or_b32 s0, s0, s1
567 ; GFX7-NEXT: ; return to shader part epilog
569 ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
571 ; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000
572 ; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000
573 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
574 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16
575 ; GFX9-NEXT: s_add_i32 s0, s0, s1
576 ; GFX9-NEXT: s_add_i32 s2, s2, s3
577 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
578 ; GFX9-NEXT: ; return to shader part epilog
580 ; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
582 ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
583 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
584 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
585 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
586 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
587 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
588 ; GFX8-NEXT: s_add_i32 s0, s0, s1
589 ; GFX8-NEXT: s_add_i32 s2, s2, s3
590 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16
591 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
592 ; GFX8-NEXT: s_or_b32 s0, s1, s0
593 ; GFX8-NEXT: ; return to shader part epilog
595 ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
597 ; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000
598 ; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000
599 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16
600 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16
601 ; GFX10-NEXT: s_add_i32 s0, s0, s1
602 ; GFX10-NEXT: s_add_i32 s2, s2, s3
603 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
604 ; GFX10-NEXT: ; return to shader part epilog
605 %neg.a = fneg <2 x half> %a
606 %neg.b = fneg <2 x half> %b
607 %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
608 %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
609 %add = add <2 x i16> %cast.neg.a, %cast.neg.b
610 %cast = bitcast <2 x i16> %add to i32