1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; v_test_add_v2i16: each workitem loads one <2 x i16> from %in0 and one from
; %in1 (both indexed by the workitem id, both volatile), adds them, and stores
; the sum. The checks expect a single v_pk_add_u16 on gfx900/gfx1010/gfx1100
; and, on tonga, a v_add_u16 for the low half plus a v_add_u16_sdwa for the
; high half, merged with v_or_b32.
7 ; FIXME: Need to handle non-uniform case for function below (load without gep).
8 ; FIXME: The v_or_b32 in the tonga (VI) output should be unnecessary.
9 define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
10 ; VI-LABEL: v_test_add_v2i16:
12 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
13 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
14 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
15 ; VI-NEXT: s_waitcnt lgkmcnt(0)
16 ; VI-NEXT: v_mov_b32_e32 v1, s7
17 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
18 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
19 ; VI-NEXT: v_mov_b32_e32 v3, s1
20 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
21 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
22 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
23 ; VI-NEXT: s_waitcnt vmcnt(0)
24 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
25 ; VI-NEXT: s_waitcnt vmcnt(0)
26 ; VI-NEXT: v_mov_b32_e32 v0, s4
27 ; VI-NEXT: v_mov_b32_e32 v1, s5
28 ; VI-NEXT: v_add_u16_e32 v3, v4, v2
29 ; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
30 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
31 ; VI-NEXT: flat_store_dword v[0:1], v2
34 ; GFX9-LABEL: v_test_add_v2i16:
36 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
37 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
38 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
39 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
41 ; GFX9-NEXT: s_waitcnt vmcnt(0)
42 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
43 ; GFX9-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
45 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
46 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
49 ; GFX10-LABEL: v_test_add_v2i16:
51 ; GFX10-NEXT: s_clause 0x1
52 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
53 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
55 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
57 ; GFX10-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
61 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2
62 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
63 ; GFX10-NEXT: s_endpgm
65 ; GFX11-LABEL: v_test_add_v2i16:
67 ; GFX11-NEXT: s_clause 0x1
68 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
69 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
70 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
71 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
72 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
73 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
74 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
76 ; GFX11-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
78 ; GFX11-NEXT: s_waitcnt vmcnt(0)
79 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
80 ; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
82 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
83 ; GFX11-NEXT: s_endpgm
84 %tid = call i32 @llvm.amdgcn.workitem.id.x()
85 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
86 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
87 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
88 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
89 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
90 %add = add <2 x i16> %a, %b
; %gep.out is computed above but not used: the store writes through %out
; itself rather than the per-workitem address.
91 store <2 x i16> %add, ptr addrspace(1) %out
; s_test_add_v2i16: uniform variant. Both <2 x i16> operands come from plain
; (non-volatile) loads through the addrspace(4) pointers %in0 and %in1, with no
; workitem-id indexing. The checks expect s_load for both operands; tonga then
; adds the halves with scalar ALU ops, while gfx900/gfx1010/gfx1100 use one
; v_pk_add_u16. gfx900 first copies one operand into a VGPR, whereas
; gfx1010/gfx1100 pass both SGPRs to v_pk_add_u16 directly.
95 define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
96 ; VI-LABEL: s_test_add_v2i16:
98 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
99 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
100 ; VI-NEXT: s_waitcnt lgkmcnt(0)
101 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
102 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
103 ; VI-NEXT: v_mov_b32_e32 v0, s4
104 ; VI-NEXT: v_mov_b32_e32 v1, s5
105 ; VI-NEXT: s_waitcnt lgkmcnt(0)
106 ; VI-NEXT: s_lshr_b32 s1, s2, 16
107 ; VI-NEXT: s_lshr_b32 s3, s0, 16
108 ; VI-NEXT: s_add_i32 s2, s2, s0
109 ; VI-NEXT: s_add_i32 s1, s1, s3
110 ; VI-NEXT: s_and_b32 s0, s2, 0xffff
111 ; VI-NEXT: s_lshl_b32 s1, s1, 16
112 ; VI-NEXT: s_or_b32 s0, s0, s1
113 ; VI-NEXT: v_mov_b32_e32 v2, s0
114 ; VI-NEXT: flat_store_dword v[0:1], v2
117 ; GFX9-LABEL: s_test_add_v2i16:
119 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
120 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
121 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
122 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
124 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
125 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
126 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
127 ; GFX9-NEXT: v_pk_add_u16 v1, s3, v1
128 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
129 ; GFX9-NEXT: s_endpgm
131 ; GFX10-LABEL: s_test_add_v2i16:
133 ; GFX10-NEXT: s_clause 0x1
134 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
135 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
136 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
137 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
138 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0
139 ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0
140 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3
142 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
143 ; GFX10-NEXT: s_endpgm
145 ; GFX11-LABEL: s_test_add_v2i16:
147 ; GFX11-NEXT: s_clause 0x1
148 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
149 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
150 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
151 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
153 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
154 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
155 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s0
156 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
157 ; GFX11-NEXT: s_nop 0
158 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
159 ; GFX11-NEXT: s_endpgm
; Neither load is volatile and neither address depends on the workitem id.
160 %a = load <2 x i16>, ptr addrspace(4) %in0
161 %b = load <2 x i16>, ptr addrspace(4) %in1
162 %add = add <2 x i16> %a, %b
163 store <2 x i16> %add, ptr addrspace(1) %out
; s_test_add_self_v2i16: adds a uniformly loaded <2 x i16> to itself. The
; packed targets (gfx900/gfx1010/gfx1100) are expected to use the same SGPR for
; both v_pk_add_u16 sources; tonga splits the value into halves and doubles
; each one with s_add_i32 before recombining them.
167 define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
168 ; VI-LABEL: s_test_add_self_v2i16:
170 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
171 ; VI-NEXT: s_waitcnt lgkmcnt(0)
172 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
173 ; VI-NEXT: v_mov_b32_e32 v0, s0
174 ; VI-NEXT: v_mov_b32_e32 v1, s1
175 ; VI-NEXT: s_waitcnt lgkmcnt(0)
176 ; VI-NEXT: s_lshr_b32 s0, s2, 16
177 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
178 ; VI-NEXT: s_add_i32 s1, s1, s1
179 ; VI-NEXT: s_add_i32 s0, s0, s0
180 ; VI-NEXT: s_lshl_b32 s0, s0, 16
181 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
182 ; VI-NEXT: s_or_b32 s0, s1, s0
183 ; VI-NEXT: v_mov_b32_e32 v2, s0
184 ; VI-NEXT: flat_store_dword v[0:1], v2
187 ; GFX9-LABEL: s_test_add_self_v2i16:
189 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
190 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
191 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
192 ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
193 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX9-NEXT: v_pk_add_u16 v1, s0, s0
195 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
196 ; GFX9-NEXT: s_endpgm
198 ; GFX10-LABEL: s_test_add_self_v2i16:
200 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
201 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
202 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
204 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
205 ; GFX10-NEXT: v_pk_add_u16 v1, s0, s0
206 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
207 ; GFX10-NEXT: s_endpgm
209 ; GFX11-LABEL: s_test_add_self_v2i16:
211 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
212 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
213 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
215 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
216 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
217 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
218 ; GFX11-NEXT: s_nop 0
219 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
220 ; GFX11-NEXT: s_endpgm
221 %a = load <2 x i16>, ptr addrspace(4) %in0
; Both add operands are the same value %a.
222 %add = add <2 x i16> %a, %a
223 store <2 x i16> %add, ptr addrspace(1) %out
; s_test_add_v2i16_kernarg: both <2 x i16> operands are passed directly as
; kernel arguments, so the only memory read in the expected output is the
; kernarg s_load. gfx900 is expected to copy one operand into a VGPR before
; v_pk_add_u16; gfx1010/gfx1100 use both SGPRs directly.
227 ; FIXME: VI should not scalarize arg access.
228 define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
229 ; VI-LABEL: s_test_add_v2i16_kernarg:
231 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
232 ; VI-NEXT: s_waitcnt lgkmcnt(0)
233 ; VI-NEXT: s_lshr_b32 s4, s2, 16
234 ; VI-NEXT: s_lshr_b32 s5, s3, 16
235 ; VI-NEXT: s_add_i32 s2, s2, s3
236 ; VI-NEXT: s_add_i32 s4, s4, s5
237 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
238 ; VI-NEXT: s_lshl_b32 s3, s4, 16
239 ; VI-NEXT: s_or_b32 s2, s2, s3
240 ; VI-NEXT: v_mov_b32_e32 v0, s0
241 ; VI-NEXT: v_mov_b32_e32 v1, s1
242 ; VI-NEXT: v_mov_b32_e32 v2, s2
243 ; VI-NEXT: flat_store_dword v[0:1], v2
246 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
248 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
249 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
250 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
252 ; GFX9-NEXT: v_pk_add_u16 v1, s6, v1
253 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
254 ; GFX9-NEXT: s_endpgm
256 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
258 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
259 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
260 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX10-NEXT: v_pk_add_u16 v1, s6, s7
262 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
263 ; GFX10-NEXT: s_endpgm
265 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
267 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
268 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
269 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
270 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3
271 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
272 ; GFX11-NEXT: s_nop 0
273 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
274 ; GFX11-NEXT: s_endpgm
; No loads in the IR: %a and %b are the kernel arguments themselves.
275 %add = add <2 x i16> %a, %b
276 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_constant: adds the constant <123, 456> (0x7b, 0x1c8) to a
; per-workitem volatile load. Packed into one dword the constant is 0x1c8007b,
; which gfx900 is expected to materialize in an SGPR with s_mov_b32 and which
; gfx1010/gfx1100 are expected to use as a literal operand of v_pk_add_u16.
; tonga adds each half separately and merges the halves with v_or_b32.
280 ; FIXME: Eliminate or with sdwa
281 define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
282 ; VI-LABEL: v_test_add_v2i16_constant:
284 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
285 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
286 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
287 ; VI-NEXT: s_waitcnt lgkmcnt(0)
288 ; VI-NEXT: v_mov_b32_e32 v1, s3
289 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
290 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
291 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
292 ; VI-NEXT: s_waitcnt vmcnt(0)
293 ; VI-NEXT: v_mov_b32_e32 v0, s0
294 ; VI-NEXT: v_mov_b32_e32 v1, s1
295 ; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
296 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
297 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
298 ; VI-NEXT: flat_store_dword v[0:1], v2
301 ; GFX9-LABEL: v_test_add_v2i16_constant:
303 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
304 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
305 ; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
306 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
307 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
308 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
309 ; GFX9-NEXT: s_waitcnt vmcnt(0)
310 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
311 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
312 ; GFX9-NEXT: s_endpgm
314 ; GFX10-LABEL: v_test_add_v2i16_constant:
316 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
317 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
318 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
319 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
320 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
321 ; GFX10-NEXT: s_waitcnt vmcnt(0)
322 ; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
323 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
324 ; GFX10-NEXT: s_endpgm
326 ; GFX11-LABEL: v_test_add_v2i16_constant:
328 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
329 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
330 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
331 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
332 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
333 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
334 ; GFX11-NEXT: s_waitcnt vmcnt(0)
335 ; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
336 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
337 ; GFX11-NEXT: s_nop 0
338 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
339 ; GFX11-NEXT: s_endpgm
340 %tid = call i32 @llvm.amdgcn.workitem.id.x()
341 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
342 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
343 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
344 %add = add <2 x i16> %a, <i16 123, i16 456>
; %gep.out is computed above but not used: the store writes through %out.
345 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_neg_constant: adds the negative constant <-845, -991> to a
; per-workitem volatile load. As 16-bit values these are 0xfcb3 and 0xfc21, so
; the packed dword is 0xfc21fcb3; gfx900 is expected to load it into an SGPR
; and gfx1010/gfx1100 to use it as a literal operand of v_pk_add_u16.
349 ; FIXME: Need to handle non-uniform case for function below (load without gep).
350 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
351 ; VI-LABEL: v_test_add_v2i16_neg_constant:
353 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
354 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
355 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
356 ; VI-NEXT: s_waitcnt lgkmcnt(0)
357 ; VI-NEXT: v_mov_b32_e32 v1, s3
358 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
359 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
360 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
361 ; VI-NEXT: s_waitcnt vmcnt(0)
362 ; VI-NEXT: v_mov_b32_e32 v0, s0
363 ; VI-NEXT: v_mov_b32_e32 v1, s1
364 ; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
365 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
366 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
367 ; VI-NEXT: flat_store_dword v[0:1], v2
370 ; GFX9-LABEL: v_test_add_v2i16_neg_constant:
372 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
373 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
374 ; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
375 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
376 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
377 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
378 ; GFX9-NEXT: s_waitcnt vmcnt(0)
379 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
380 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
381 ; GFX9-NEXT: s_endpgm
383 ; GFX10-LABEL: v_test_add_v2i16_neg_constant:
385 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
386 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
387 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
388 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
389 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
390 ; GFX10-NEXT: s_waitcnt vmcnt(0)
391 ; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
392 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
393 ; GFX10-NEXT: s_endpgm
395 ; GFX11-LABEL: v_test_add_v2i16_neg_constant:
397 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
398 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
399 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
400 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
401 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
403 ; GFX11-NEXT: s_waitcnt vmcnt(0)
404 ; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
405 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
406 ; GFX11-NEXT: s_nop 0
407 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
408 ; GFX11-NEXT: s_endpgm
409 %tid = call i32 @llvm.amdgcn.workitem.id.x()
410 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
411 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
412 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
413 %add = add <2 x i16> %a, <i16 -845, i16 -991>
; %gep.out is computed above but not used: the store writes through %out.
414 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_inline_neg1: adds <-1, -1> to a per-workitem volatile load.
; On gfx900/gfx1010/gfx1100 the checks expect the constant to appear as the
; operand -1 of v_pk_add_u16 with no separate literal or s_mov_b32. tonga still
; needs a v_mov_b32 of -1 for the SDWA add of the high half.
418 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
419 ; VI-LABEL: v_test_add_v2i16_inline_neg1:
421 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
422 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
423 ; VI-NEXT: v_mov_b32_e32 v3, -1
424 ; VI-NEXT: s_waitcnt lgkmcnt(0)
425 ; VI-NEXT: v_mov_b32_e32 v1, s3
426 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
427 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
428 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
429 ; VI-NEXT: s_waitcnt vmcnt(0)
430 ; VI-NEXT: v_mov_b32_e32 v0, s0
431 ; VI-NEXT: v_mov_b32_e32 v1, s1
432 ; VI-NEXT: v_add_u16_e32 v4, -1, v2
433 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
434 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
435 ; VI-NEXT: flat_store_dword v[0:1], v2
438 ; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
440 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
441 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
442 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
443 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
445 ; GFX9-NEXT: s_waitcnt vmcnt(0)
446 ; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
447 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
448 ; GFX9-NEXT: s_endpgm
450 ; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
452 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
453 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
454 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
455 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
456 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
457 ; GFX10-NEXT: s_waitcnt vmcnt(0)
458 ; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
459 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
460 ; GFX10-NEXT: s_endpgm
462 ; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
464 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
465 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
466 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
467 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
468 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
469 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
470 ; GFX11-NEXT: s_waitcnt vmcnt(0)
471 ; GFX11-NEXT: v_pk_add_u16 v0, v0, -1
472 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
473 ; GFX11-NEXT: s_nop 0
474 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
475 ; GFX11-NEXT: s_endpgm
476 %tid = call i32 @llvm.amdgcn.workitem.id.x()
477 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
478 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
479 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
480 %add = add <2 x i16> %a, <i16 -1, i16 -1>
; %gep.out is computed above but not used: the store writes through %out.
481 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_inline_lo_zero_hi: adds <32, 0>, so only the low element
; changes. gfx900/gfx1010/gfx1100 are expected to use 32 directly as the
; v_pk_add_u16 operand. tonga adds 32 to the low half and ORs the result with
; the high half, which it isolates with a 0xffff0000 mask.
485 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
486 ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
488 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
489 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
490 ; VI-NEXT: s_waitcnt lgkmcnt(0)
491 ; VI-NEXT: v_mov_b32_e32 v1, s3
492 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
493 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
494 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
495 ; VI-NEXT: s_waitcnt vmcnt(0)
496 ; VI-NEXT: v_mov_b32_e32 v0, s0
497 ; VI-NEXT: v_mov_b32_e32 v1, s1
498 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
499 ; VI-NEXT: v_add_u16_e32 v2, 32, v2
500 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
501 ; VI-NEXT: flat_store_dword v[0:1], v2
504 ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
506 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
507 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
508 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
509 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
510 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
511 ; GFX9-NEXT: s_waitcnt vmcnt(0)
512 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
513 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
514 ; GFX9-NEXT: s_endpgm
516 ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
518 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
519 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
520 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
521 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
522 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
523 ; GFX10-NEXT: s_waitcnt vmcnt(0)
524 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
525 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
526 ; GFX10-NEXT: s_endpgm
528 ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
530 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
531 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
532 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
533 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
534 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
536 ; GFX11-NEXT: s_waitcnt vmcnt(0)
537 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
538 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
539 ; GFX11-NEXT: s_nop 0
540 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
541 ; GFX11-NEXT: s_endpgm
542 %tid = call i32 @llvm.amdgcn.workitem.id.x()
543 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
544 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
545 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
546 %add = add <2 x i16> %a, <i16 32, i16 0>
; %gep.out is computed above but not used: the store writes through %out.
547 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_inline_fp_split: adds <0, 16256> to a per-workitem volatile
; load. 16256 is 0x3f80, so the packed 32-bit constant is 0x3f800000, which is
; the bit pattern of the float 1.0. gfx900/gfx1010/gfx1100 are expected to
; print the v_pk_add_u16 operand as 1.0. tonga adds 0x3f80 to the high half
; with SDWA and ORs the untouched low half back in.
551 ; The high element alone gives the packed constant the bit pattern of an fp inline constant.
552 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
553 ; VI-LABEL: v_test_add_v2i16_inline_fp_split:
555 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
556 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
557 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
558 ; VI-NEXT: s_waitcnt lgkmcnt(0)
559 ; VI-NEXT: v_mov_b32_e32 v1, s3
560 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
561 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
562 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
563 ; VI-NEXT: s_waitcnt vmcnt(0)
564 ; VI-NEXT: v_mov_b32_e32 v0, s0
565 ; VI-NEXT: v_mov_b32_e32 v1, s1
566 ; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
567 ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
568 ; VI-NEXT: flat_store_dword v[0:1], v2
571 ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
573 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
574 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
575 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
576 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
578 ; GFX9-NEXT: s_waitcnt vmcnt(0)
579 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
580 ; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
581 ; GFX9-NEXT: s_endpgm
583 ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
585 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
586 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
587 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
588 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
589 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
590 ; GFX10-NEXT: s_waitcnt vmcnt(0)
591 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
592 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
593 ; GFX10-NEXT: s_endpgm
595 ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
597 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
598 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
599 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
600 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
601 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
602 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
603 ; GFX11-NEXT: s_waitcnt vmcnt(0)
604 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0
605 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
606 ; GFX11-NEXT: s_nop 0
607 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
608 ; GFX11-NEXT: s_endpgm
609 %tid = call i32 @llvm.amdgcn.workitem.id.x()
610 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
611 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
612 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
613 %add = add <2 x i16> %a, <i16 0, i16 16256>
; %gep.out is computed above but not used: the store writes through %out.
614 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_zext_to_v2i32: adds two per-workitem volatile <2 x i16>
; loads and zero-extends the sum to <2 x i32> before storing 64 bits.
; gfx900/gfx1010/gfx1100 are expected to split the v_pk_add_u16 result with a
; 16-bit right shift and a 0xffff mask. tonga writes each 16-bit sum into its
; own register (the high half via SDWA with dst_sel:DWORD).
618 ; FIXME: Need to handle non-uniform case for function below (load without gep).
619 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
620 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
622 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
623 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
624 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
625 ; VI-NEXT: s_waitcnt lgkmcnt(0)
626 ; VI-NEXT: v_mov_b32_e32 v1, s7
627 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
628 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
629 ; VI-NEXT: v_mov_b32_e32 v3, s1
630 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
631 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
632 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
633 ; VI-NEXT: s_waitcnt vmcnt(0)
634 ; VI-NEXT: flat_load_dword v3, v[2:3] glc
635 ; VI-NEXT: s_waitcnt vmcnt(0)
636 ; VI-NEXT: v_mov_b32_e32 v0, s4
637 ; VI-NEXT: v_mov_b32_e32 v1, s5
638 ; VI-NEXT: v_add_u16_e32 v2, v4, v3
639 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
640 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
643 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32:
645 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
646 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
647 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
648 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
649 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
650 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
651 ; GFX9-NEXT: s_waitcnt vmcnt(0)
652 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
653 ; GFX9-NEXT: s_waitcnt vmcnt(0)
654 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
655 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
656 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
657 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
658 ; GFX9-NEXT: s_endpgm
660 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32:
662 ; GFX10-NEXT: s_clause 0x1
663 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
664 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
665 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
666 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
667 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
668 ; GFX10-NEXT: s_waitcnt vmcnt(0)
669 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc
670 ; GFX10-NEXT: s_waitcnt vmcnt(0)
671 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
672 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
673 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
674 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
675 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
676 ; GFX10-NEXT: s_endpgm
678 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
680 ; GFX11-NEXT: s_clause 0x1
681 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
682 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
683 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
684 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
685 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
686 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
687 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
688 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
689 ; GFX11-NEXT: s_waitcnt vmcnt(0)
690 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
691 ; GFX11-NEXT: s_waitcnt vmcnt(0)
692 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
693 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
694 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
695 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
696 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
697 ; GFX11-NEXT: s_nop 0
698 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
699 ; GFX11-NEXT: s_endpgm
700 %tid = call i32 @llvm.amdgcn.workitem.id.x()
701 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
702 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
703 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
704 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
705 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; The add is done at 16 bits per element; the extension happens afterwards.
706 %add = add <2 x i16> %a, %b
707 %ext = zext <2 x i16> %add to <2 x i32>
; %gep.out is computed above but not used: the store writes through %out.
708 store <2 x i32> %ext, ptr addrspace(1) %out
; v_test_add_v2i16_zext_to_v2i64: adds two per-workitem volatile <2 x i16>
; loads and zero-extends the sum to <2 x i64>, so 128 bits are stored. In every
; expected output the upper dword of each 64-bit element comes from a register
; set to 0 (one v_mov_b32 of 0 plus a register copy).
712 ; FIXME: Need to handle non-uniform case for function below (load without gep).
713 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
714 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
716 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
717 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
718 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
719 ; VI-NEXT: s_waitcnt lgkmcnt(0)
720 ; VI-NEXT: v_mov_b32_e32 v1, s7
721 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
722 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
723 ; VI-NEXT: v_mov_b32_e32 v3, s1
724 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
725 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
726 ; VI-NEXT: flat_load_dword v6, v[0:1] glc
727 ; VI-NEXT: s_waitcnt vmcnt(0)
728 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
729 ; VI-NEXT: s_waitcnt vmcnt(0)
730 ; VI-NEXT: v_mov_b32_e32 v1, 0
731 ; VI-NEXT: v_mov_b32_e32 v4, s4
732 ; VI-NEXT: v_mov_b32_e32 v5, s5
733 ; VI-NEXT: v_mov_b32_e32 v3, v1
734 ; VI-NEXT: v_add_u16_e32 v0, v6, v2
735 ; VI-NEXT: v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
736 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
739 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64:
741 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
742 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
743 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
744 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
745 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
746 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
747 ; GFX9-NEXT: s_waitcnt vmcnt(0)
748 ; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc
749 ; GFX9-NEXT: s_waitcnt vmcnt(0)
750 ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3
751 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16
752 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
753 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
754 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5]
755 ; GFX9-NEXT: s_endpgm
757 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64:
759 ; GFX10-NEXT: s_clause 0x1
760 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
761 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
762 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
763 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
764 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
765 ; GFX10-NEXT: s_waitcnt vmcnt(0)
766 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc
767 ; GFX10-NEXT: s_waitcnt vmcnt(0)
768 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
769 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
770 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16
771 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
772 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
773 ; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5]
774 ; GFX10-NEXT: s_endpgm
776 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
778 ; GFX11-NEXT: s_clause 0x1
779 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
780 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
781 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
782 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
783 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
784 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
785 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
786 ; GFX11-NEXT: s_waitcnt vmcnt(0)
787 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
788 ; GFX11-NEXT: s_waitcnt vmcnt(0)
789 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
790 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
791 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
792 ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
793 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
794 ; GFX11-NEXT: global_store_b128 v1, v[0:3], s[4:5]
795 ; GFX11-NEXT: s_nop 0
796 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
797 ; GFX11-NEXT: s_endpgm
798 %tid = call i32 @llvm.amdgcn.workitem.id.x()
799 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
800 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
801 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
802 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
803 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; The add is done at 16 bits per element; the extension happens afterwards.
804 %add = add <2 x i16> %a, %b
805 %ext = zext <2 x i16> %add to <2 x i64>
; %gep.out is computed above but not used: the store writes through %out.
806 store <2 x i64> %ext, ptr addrspace(1) %out
810 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel test: adds two <2 x i16> values loaded through %tid-indexed GEPs and
; sign-extends the sum to <2 x i32>. Both loads are volatile.
; Expected tonga output: two 16-bit adds (one in SDWA form for the high half),
; each followed by v_bfe_i32. Expected gfx900/gfx1010/gfx1100 output: a single
; v_pk_add_u16, then v_ashrrev_i32 by 16 for the high lane and v_bfe_i32 for
; the low lane.
; The assertion lines in the body are autogenerated (see the NOTE on the first
; line of this file); regenerate them instead of editing them by hand.
811 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
812 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
814 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
815 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
816 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
817 ; VI-NEXT: s_waitcnt lgkmcnt(0)
818 ; VI-NEXT: v_mov_b32_e32 v1, s7
819 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
820 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
821 ; VI-NEXT: v_mov_b32_e32 v3, s1
822 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
823 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
824 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
825 ; VI-NEXT: s_waitcnt vmcnt(0)
826 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
827 ; VI-NEXT: s_waitcnt vmcnt(0)
828 ; VI-NEXT: v_mov_b32_e32 v0, s4
829 ; VI-NEXT: v_mov_b32_e32 v1, s5
830 ; VI-NEXT: v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
831 ; VI-NEXT: v_add_u16_e32 v2, v4, v2
832 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
833 ; VI-NEXT: v_bfe_i32 v3, v3, 0, 16
834 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
837 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32:
839 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
840 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
841 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
842 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
843 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
844 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
845 ; GFX9-NEXT: s_waitcnt vmcnt(0)
846 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
847 ; GFX9-NEXT: s_waitcnt vmcnt(0)
848 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
849 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
850 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
851 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
852 ; GFX9-NEXT: s_endpgm
854 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32:
856 ; GFX10-NEXT: s_clause 0x1
857 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
858 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
859 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
860 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
861 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
862 ; GFX10-NEXT: s_waitcnt vmcnt(0)
863 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc
864 ; GFX10-NEXT: s_waitcnt vmcnt(0)
865 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
866 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
867 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0
868 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
869 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
870 ; GFX10-NEXT: s_endpgm
872 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32:
874 ; GFX11-NEXT: s_clause 0x1
875 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
876 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
877 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
878 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
879 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
880 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
881 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
882 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
883 ; GFX11-NEXT: s_waitcnt vmcnt(0)
884 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
885 ; GFX11-NEXT: s_waitcnt vmcnt(0)
886 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
887 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
888 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0
889 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
890 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
891 ; GFX11-NEXT: s_nop 0
892 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
893 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: each pointer argument is indexed by the workitem id.
894 %tid = call i32 @llvm.amdgcn.workitem.id.x()
895 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
896 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
897 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; Volatile loads: the expected output above waits after each load separately.
898 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
899 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
900 %add = add <2 x i16> %a, %b
901 %ext = sext <2 x i16> %add to <2 x i32>
; NOTE(review): the store addresses %out directly, leaving %gep.out unused.
; Switching it to %gep.out would change codegen and require regenerating the
; assertions above; confirm which form is intended.
902 store <2 x i32> %ext, ptr addrspace(1) %out
906 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel test: adds two <2 x i16> values loaded through %tid-indexed GEPs and
; sign-extends the sum to <2 x i64>. The loads here are not volatile, and the
; expected output issues both loads before a single s_waitcnt vmcnt(0)
; (grouped under s_clause 0x1 in the gfx1010/gfx1100 output).
; Sign extension in the expected output is done in two steps: v_bfe_i32 widens
; each 16-bit lane to 32 bits, then v_ashrrev_i32 by 31 produces the high dword
; of each 64-bit element.
; The assertion lines in the body are autogenerated (see the NOTE on the first
; line of this file); regenerate them instead of editing them by hand.
907 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
908 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
910 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
911 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
912 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
913 ; VI-NEXT: s_waitcnt lgkmcnt(0)
914 ; VI-NEXT: v_mov_b32_e32 v1, s7
915 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
916 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
917 ; VI-NEXT: v_mov_b32_e32 v3, s1
918 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
919 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
920 ; VI-NEXT: flat_load_dword v0, v[0:1]
921 ; VI-NEXT: flat_load_dword v1, v[2:3]
922 ; VI-NEXT: v_mov_b32_e32 v4, s4
923 ; VI-NEXT: v_mov_b32_e32 v5, s5
924 ; VI-NEXT: s_waitcnt vmcnt(0)
925 ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
926 ; VI-NEXT: v_add_u16_e32 v0, v0, v1
927 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
928 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
929 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
930 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
931 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
934 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64:
936 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
937 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
938 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
939 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
940 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
941 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
942 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
943 ; GFX9-NEXT: s_waitcnt vmcnt(0)
944 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
945 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
946 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
947 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
948 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
949 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
950 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
951 ; GFX9-NEXT: s_endpgm
953 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64:
955 ; GFX10-NEXT: s_clause 0x1
956 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
957 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
958 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
959 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
960 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
961 ; GFX10-NEXT: s_clause 0x1
962 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
963 ; GFX10-NEXT: global_load_dword v2, v0, s[0:1]
964 ; GFX10-NEXT: s_waitcnt vmcnt(0)
965 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
966 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
967 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
968 ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16
969 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
970 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
971 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
972 ; GFX10-NEXT: s_endpgm
974 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64:
976 ; GFX11-NEXT: s_clause 0x1
977 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
978 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
979 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
980 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
981 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
982 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
983 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
984 ; GFX11-NEXT: s_clause 0x1
985 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
986 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
987 ; GFX11-NEXT: s_waitcnt vmcnt(0)
988 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
989 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
990 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
991 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
992 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
993 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
994 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
995 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
996 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
997 ; GFX11-NEXT: s_nop 0
998 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
999 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: each pointer argument is indexed by the workitem id.
1000 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1001 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
1002 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
1003 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; Plain (non-volatile) loads, unlike the sext-to-v2i32 kernel in this file.
1004 %a = load <2 x i16>, ptr addrspace(1) %gep.in0
1005 %b = load <2 x i16>, ptr addrspace(1) %gep.in1
1006 %add = add <2 x i16> %a, %b
1007 %ext = sext <2 x i16> %add to <2 x i64>
; NOTE(review): the store addresses %out directly, leaving %gep.out unused.
; Switching it to %gep.out would change codegen and require regenerating the
; assertions above; confirm which form is intended.
1008 store <2 x i64> %ext, ptr addrspace(1) %out
; Non-kernel function test: adds the constant vector <i16 -1, i16 0> to %x, so
; element 1 is added with zero.
; Expected tonga output: keep the high half via an AND with 0xffff0000, apply
; v_add_u16 with -1 to the low half, then recombine with v_or_b32.
; Expected gfx900/gfx1010/gfx1100 output: a single v_pk_sub_u16 with the
; constant 1 (the add of -1 is emitted as a packed subtract of 1).
1012 define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
1013 ; VI-LABEL: add_inline_imm_neg1_0:
1015 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
1017 ; VI-NEXT: v_add_u16_e32 v0, -1, v0
1018 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1019 ; VI-NEXT: s_setpc_b64 s[30:31]
1021 ; GFX9-LABEL: add_inline_imm_neg1_0:
1023 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1
1025 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1027 ; GFX10-LABEL: add_inline_imm_neg1_0:
1029 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1030 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1
1031 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1033 ; GFX11-LABEL: add_inline_imm_neg1_0:
1035 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1036 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1
1037 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1038 %y = add <2 x i16> %x, <i16 -1, i16 0>
; Non-kernel function test: adds the constant vector <i16 1, i16 0> to %x, so
; element 1 is added with zero.
; Expected tonga output: keep the high half via an AND with 0xffff0000, apply
; v_add_u16 with 1 to the low half, then recombine with v_or_b32.
; Expected gfx900/gfx1010/gfx1100 output: a single v_pk_add_u16 with the
; constant 1.
1042 define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
1043 ; VI-LABEL: add_inline_imm_1_0:
1045 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1046 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
1047 ; VI-NEXT: v_add_u16_e32 v0, 1, v0
1048 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1049 ; VI-NEXT: s_setpc_b64 s[30:31]
1051 ; GFX9-LABEL: add_inline_imm_1_0:
1053 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1054 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1
1055 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1057 ; GFX10-LABEL: add_inline_imm_1_0:
1059 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1
1061 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1063 ; GFX11-LABEL: add_inline_imm_1_0:
1065 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1066 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1
1067 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1068 %y = add <2 x i16> %x, <i16 1, i16 0>
; Declaration of the workitem-id intrinsic that the kernels in this file call
; to form their per-workitem GEP index.
1072 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Attribute groups: #0 is attached to the intrinsic declaration above; #1 is
; the group referenced by the amdgpu_kernel definitions in this file.
1074 attributes #0 = { nounwind readnone }
1075 attributes #1 = { nounwind }