1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7 ; FIXME: Need to handle non-uniform case for function below (load without gep).
8 ; FIXME: The v_or_b32 emitted for VI should be unnecessary.
; v_test_add_v2i16: volatile-loads one <2 x i16> from %in0 and one from %in1
; (both indexed by the workitem id), adds them, and stores the sum.
; The tonga checks expect a v_add_u16 for the low half, an SDWA v_add_u16 for
; the high half and a v_or_b32 to merge them; the gfx900, gfx1010 and gfx1100
; checks expect a single v_pk_add_u16.
9 define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
10 ; VI-LABEL: v_test_add_v2i16:
12 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
13 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
14 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
15 ; VI-NEXT: s_waitcnt lgkmcnt(0)
16 ; VI-NEXT: v_mov_b32_e32 v1, s7
17 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
18 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
19 ; VI-NEXT: v_mov_b32_e32 v3, s1
20 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
21 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
22 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
23 ; VI-NEXT: s_waitcnt vmcnt(0)
24 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
25 ; VI-NEXT: s_waitcnt vmcnt(0)
26 ; VI-NEXT: v_mov_b32_e32 v0, s4
27 ; VI-NEXT: v_mov_b32_e32 v1, s5
28 ; VI-NEXT: v_add_u16_e32 v3, v4, v2
29 ; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
30 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
31 ; VI-NEXT: flat_store_dword v[0:1], v2
34 ; GFX9-LABEL: v_test_add_v2i16:
36 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
37 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
38 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
39 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
41 ; GFX9-NEXT: s_waitcnt vmcnt(0)
42 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
43 ; GFX9-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
45 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
46 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
49 ; GFX10-LABEL: v_test_add_v2i16:
51 ; GFX10-NEXT: s_clause 0x1
52 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
53 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
55 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
57 ; GFX10-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
61 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2
62 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
63 ; GFX10-NEXT: s_endpgm
65 ; GFX11-LABEL: v_test_add_v2i16:
67 ; GFX11-NEXT: s_clause 0x1
68 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
69 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
70 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
71 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
72 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
76 ; GFX11-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
78 ; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
80 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
81 ; GFX11-NEXT: s_endpgm
82 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but not used; the store below writes
; through %out, which is why every expected store uses only the scalar base.
83 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
; Per-workitem operand addresses: base pointer plus %tid elements of <2 x i16>.
84 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
85 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
86 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
87 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; The vector add under test.
88 %add = add <2 x i16> %a, %b
89 store <2 x i16> %add, ptr addrspace(1) %out
; s_test_add_v2i16: loads both <2 x i16> operands through addrspace(4)
; pointers with no workitem-dependent indexing, adds them, and stores to %out.
; The checks expect scalar s_load instructions for the operands on all targets.
; tonga does the add with scalar shift/add/and/or; gfx900 copies one operand
; into a VGPR before v_pk_add_u16; gfx1010 and gfx1100 pass both SGPRs directly
; to v_pk_add_u16.
93 define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
94 ; VI-LABEL: s_test_add_v2i16:
96 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
97 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
98 ; VI-NEXT: s_waitcnt lgkmcnt(0)
99 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
100 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
101 ; VI-NEXT: v_mov_b32_e32 v0, s4
102 ; VI-NEXT: v_mov_b32_e32 v1, s5
103 ; VI-NEXT: s_waitcnt lgkmcnt(0)
104 ; VI-NEXT: s_lshr_b32 s1, s2, 16
105 ; VI-NEXT: s_lshr_b32 s3, s0, 16
106 ; VI-NEXT: s_add_i32 s2, s2, s0
107 ; VI-NEXT: s_add_i32 s1, s1, s3
108 ; VI-NEXT: s_and_b32 s0, s2, 0xffff
109 ; VI-NEXT: s_lshl_b32 s1, s1, 16
110 ; VI-NEXT: s_or_b32 s0, s0, s1
111 ; VI-NEXT: v_mov_b32_e32 v2, s0
112 ; VI-NEXT: flat_store_dword v[0:1], v2
115 ; GFX9-LABEL: s_test_add_v2i16:
117 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
118 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
119 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
120 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
121 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
122 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
123 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
124 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
125 ; GFX9-NEXT: v_pk_add_u16 v1, s1, v1
126 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
127 ; GFX9-NEXT: s_endpgm
129 ; GFX10-LABEL: s_test_add_v2i16:
131 ; GFX10-NEXT: s_clause 0x1
132 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
133 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
134 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
135 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
136 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
137 ; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0
138 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX10-NEXT: v_pk_add_u16 v1, s0, s1
140 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
141 ; GFX10-NEXT: s_endpgm
143 ; GFX11-LABEL: s_test_add_v2i16:
145 ; GFX11-NEXT: s_clause 0x1
146 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
147 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
148 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
149 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
151 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
152 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s0
154 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
155 ; GFX11-NEXT: s_nop 0
156 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
157 ; GFX11-NEXT: s_endpgm
; Both operands are read straight from the argument pointers (no workitem id).
158 %a = load <2 x i16>, ptr addrspace(4) %in0
159 %b = load <2 x i16>, ptr addrspace(4) %in1
; The vector add under test.
160 %add = add <2 x i16> %a, %b
161 store <2 x i16> %add, ptr addrspace(1) %out
; s_test_add_self_v2i16: loads one <2 x i16> through an addrspace(4) pointer
; and adds it to itself. The checks expect a single scalar load of the operand.
; tonga splits the halves and adds each to itself with scalar instructions;
; the gfx900, gfx1010 and gfx1100 checks expect v_pk_add_u16 with the same
; SGPR as both sources.
165 define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
166 ; VI-LABEL: s_test_add_self_v2i16:
168 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
169 ; VI-NEXT: s_waitcnt lgkmcnt(0)
170 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
171 ; VI-NEXT: v_mov_b32_e32 v0, s0
172 ; VI-NEXT: v_mov_b32_e32 v1, s1
173 ; VI-NEXT: s_waitcnt lgkmcnt(0)
174 ; VI-NEXT: s_lshr_b32 s0, s2, 16
175 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
176 ; VI-NEXT: s_add_i32 s1, s1, s1
177 ; VI-NEXT: s_add_i32 s0, s0, s0
178 ; VI-NEXT: s_lshl_b32 s0, s0, 16
179 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
180 ; VI-NEXT: s_or_b32 s0, s1, s0
181 ; VI-NEXT: v_mov_b32_e32 v2, s0
182 ; VI-NEXT: flat_store_dword v[0:1], v2
185 ; GFX9-LABEL: s_test_add_self_v2i16:
187 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
188 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
189 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
191 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
192 ; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
193 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
194 ; GFX9-NEXT: s_endpgm
196 ; GFX10-LABEL: s_test_add_self_v2i16:
198 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
199 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
202 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
204 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
205 ; GFX10-NEXT: s_endpgm
207 ; GFX11-LABEL: s_test_add_self_v2i16:
209 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
210 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
211 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
212 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
213 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
215 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
216 ; GFX11-NEXT: s_nop 0
217 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
218 ; GFX11-NEXT: s_endpgm
219 %a = load <2 x i16>, ptr addrspace(4) %in0
; Same value used as both operands of the add.
220 %add = add <2 x i16> %a, %a
221 store <2 x i16> %add, ptr addrspace(1) %out
225 ; FIXME: VI should not scalarize arg access.
; s_test_add_v2i16_kernarg: both <2 x i16> operands are passed directly as
; kernel arguments, so there are no loads in the IR. Every target fetches %out,
; %a and %b with one scalar load at offset 0x24 (s[0:1] = %out, s2 and s3 = the
; operands). tonga adds the halves with scalar instructions; gfx900 copies s3
; to a VGPR before v_pk_add_u16; gfx1010 and gfx1100 use s2 and s3 directly.
226 define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
227 ; VI-LABEL: s_test_add_v2i16_kernarg:
229 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
230 ; VI-NEXT: s_waitcnt lgkmcnt(0)
231 ; VI-NEXT: s_lshr_b32 s4, s2, 16
232 ; VI-NEXT: s_lshr_b32 s5, s3, 16
233 ; VI-NEXT: s_add_i32 s2, s2, s3
234 ; VI-NEXT: s_add_i32 s4, s4, s5
235 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
236 ; VI-NEXT: s_lshl_b32 s3, s4, 16
237 ; VI-NEXT: s_or_b32 s2, s2, s3
238 ; VI-NEXT: v_mov_b32_e32 v0, s0
239 ; VI-NEXT: v_mov_b32_e32 v1, s1
240 ; VI-NEXT: v_mov_b32_e32 v2, s2
241 ; VI-NEXT: flat_store_dword v[0:1], v2
244 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
246 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
247 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
248 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
250 ; GFX9-NEXT: v_pk_add_u16 v1, s2, v1
251 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
252 ; GFX9-NEXT: s_endpgm
254 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
256 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
257 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
258 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
259 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3
260 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
261 ; GFX10-NEXT: s_endpgm
263 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
265 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
266 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
267 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3
269 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
270 ; GFX11-NEXT: s_nop 0
271 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
272 ; GFX11-NEXT: s_endpgm
; Add the two by-value vector arguments.
273 %add = add <2 x i16> %a, %b
274 store <2 x i16> %add, ptr addrspace(1) %out
278 ; FIXME: Eliminate the v_or_b32 on VI by using SDWA.
; v_test_add_v2i16_constant: adds the constant vector <123, 456> to a
; per-workitem volatile load. 123 = 0x7b and 456 = 0x1c8, so the packed 32-bit
; literal in the checks is 0x1c8007b. tonga adds the halves separately (0x7b
; as a literal, 0x1c8 from a VGPR through SDWA); gfx900 materializes the packed
; literal in an SGPR first; gfx1010 and gfx1100 use it directly as the
; v_pk_add_u16 operand.
279 define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
280 ; VI-LABEL: v_test_add_v2i16_constant:
282 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
283 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
284 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
285 ; VI-NEXT: s_waitcnt lgkmcnt(0)
286 ; VI-NEXT: v_mov_b32_e32 v1, s3
287 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
288 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
289 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
290 ; VI-NEXT: s_waitcnt vmcnt(0)
291 ; VI-NEXT: v_mov_b32_e32 v0, s0
292 ; VI-NEXT: v_mov_b32_e32 v1, s1
293 ; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
294 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
295 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
296 ; VI-NEXT: flat_store_dword v[0:1], v2
299 ; GFX9-LABEL: v_test_add_v2i16_constant:
301 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
302 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
303 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
304 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
305 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
307 ; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
308 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
309 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
310 ; GFX9-NEXT: s_endpgm
312 ; GFX10-LABEL: v_test_add_v2i16_constant:
314 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
315 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
316 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
317 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
318 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
319 ; GFX10-NEXT: s_waitcnt vmcnt(0)
320 ; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
321 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
322 ; GFX10-NEXT: s_endpgm
324 ; GFX11-LABEL: v_test_add_v2i16_constant:
326 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
327 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
328 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
330 ; GFX11-NEXT: s_waitcnt vmcnt(0)
331 ; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
332 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
333 ; GFX11-NEXT: s_nop 0
334 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
335 ; GFX11-NEXT: s_endpgm
336 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but not used; the store writes through %out.
337 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
338 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
339 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
; Non-inline constant in both halves: low = 123, high = 456.
340 %add = add <2 x i16> %a, <i16 123, i16 456>
341 store <2 x i16> %add, ptr addrspace(1) %out
345 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; v_test_add_v2i16_neg_constant: adds the constant vector <-845, -991> to a
; per-workitem volatile load. As 16-bit values -845 = 0xfcb3 and -991 = 0xfc21,
; so the packed literal in the checks is 0xfc21fcb3. tonga uses 0xfcb3 as a
; literal for the low half and a VGPR holding 0xfffffc21 for the SDWA add of
; the high half; gfx900 moves the packed literal into an SGPR; gfx1010 and
; gfx1100 use it directly as the v_pk_add_u16 operand.
346 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
347 ; VI-LABEL: v_test_add_v2i16_neg_constant:
349 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
350 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
351 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
352 ; VI-NEXT: s_waitcnt lgkmcnt(0)
353 ; VI-NEXT: v_mov_b32_e32 v1, s3
354 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
355 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
356 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
357 ; VI-NEXT: s_waitcnt vmcnt(0)
358 ; VI-NEXT: v_mov_b32_e32 v0, s0
359 ; VI-NEXT: v_mov_b32_e32 v1, s1
360 ; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
361 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
362 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
363 ; VI-NEXT: flat_store_dword v[0:1], v2
366 ; GFX9-LABEL: v_test_add_v2i16_neg_constant:
368 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
369 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
370 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
371 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
372 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
373 ; GFX9-NEXT: s_waitcnt vmcnt(0)
374 ; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
375 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
376 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
377 ; GFX9-NEXT: s_endpgm
379 ; GFX10-LABEL: v_test_add_v2i16_neg_constant:
381 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
382 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
383 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
384 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
386 ; GFX10-NEXT: s_waitcnt vmcnt(0)
387 ; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
388 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
389 ; GFX10-NEXT: s_endpgm
391 ; GFX11-LABEL: v_test_add_v2i16_neg_constant:
393 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
394 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
395 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
396 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
397 ; GFX11-NEXT: s_waitcnt vmcnt(0)
398 ; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
399 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
400 ; GFX11-NEXT: s_nop 0
401 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
402 ; GFX11-NEXT: s_endpgm
403 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but not used; the store writes through %out.
404 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
405 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
406 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
; Negative non-inline constant in both halves: low = -845, high = -991.
407 %add = add <2 x i16> %a, <i16 -845, i16 -991>
408 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_inline_neg1: adds the splat constant <-1, -1> to a
; per-workitem volatile load. The gfx900, gfx1010 and gfx1100 checks expect the
; splat to appear as the operand -1 of v_pk_add_u16, with no literal or extra
; move. tonga uses -1 directly for the low half and a VGPR holding -1 for the
; SDWA add of the high half.
412 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
413 ; VI-LABEL: v_test_add_v2i16_inline_neg1:
415 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
416 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
417 ; VI-NEXT: v_mov_b32_e32 v3, -1
418 ; VI-NEXT: s_waitcnt lgkmcnt(0)
419 ; VI-NEXT: v_mov_b32_e32 v1, s3
420 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
421 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
422 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
423 ; VI-NEXT: s_waitcnt vmcnt(0)
424 ; VI-NEXT: v_mov_b32_e32 v0, s0
425 ; VI-NEXT: v_mov_b32_e32 v1, s1
426 ; VI-NEXT: v_add_u16_e32 v4, -1, v2
427 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
428 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
429 ; VI-NEXT: flat_store_dword v[0:1], v2
432 ; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
434 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
435 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
436 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
437 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
438 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
439 ; GFX9-NEXT: s_waitcnt vmcnt(0)
440 ; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
441 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
442 ; GFX9-NEXT: s_endpgm
444 ; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
446 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
447 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
448 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
449 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
450 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
451 ; GFX10-NEXT: s_waitcnt vmcnt(0)
452 ; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
453 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
454 ; GFX10-NEXT: s_endpgm
456 ; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
458 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
459 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
460 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
461 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
462 ; GFX11-NEXT: s_waitcnt vmcnt(0)
463 ; GFX11-NEXT: v_pk_add_u16 v0, v0, -1
464 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
465 ; GFX11-NEXT: s_nop 0
466 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
467 ; GFX11-NEXT: s_endpgm
468 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but not used; the store writes through %out.
469 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
470 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
471 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
; Splat of -1 in both halves.
472 %add = add <2 x i16> %a, <i16 -1, i16 -1>
473 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_inline_lo_zero_hi: adds <32, 0> to a per-workitem volatile
; load, i.e. only the low half changes. The gfx900, gfx1010 and gfx1100 checks
; expect v_pk_add_u16 with the operand 32. tonga adds 32 to the low half only
; and keeps the high half by masking with 0xffff0000 and or-ing it back in.
477 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
478 ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
480 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
481 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
482 ; VI-NEXT: s_waitcnt lgkmcnt(0)
483 ; VI-NEXT: v_mov_b32_e32 v1, s3
484 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
485 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
486 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
487 ; VI-NEXT: s_waitcnt vmcnt(0)
488 ; VI-NEXT: v_mov_b32_e32 v0, s0
489 ; VI-NEXT: v_mov_b32_e32 v1, s1
490 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
491 ; VI-NEXT: v_add_u16_e32 v2, 32, v2
492 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
493 ; VI-NEXT: flat_store_dword v[0:1], v2
496 ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
498 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
499 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
500 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
501 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
503 ; GFX9-NEXT: s_waitcnt vmcnt(0)
504 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
505 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
506 ; GFX9-NEXT: s_endpgm
508 ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
510 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
511 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
512 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
513 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
514 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
515 ; GFX10-NEXT: s_waitcnt vmcnt(0)
516 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
517 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
518 ; GFX10-NEXT: s_endpgm
520 ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
522 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
523 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
524 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
525 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
526 ; GFX11-NEXT: s_waitcnt vmcnt(0)
527 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
528 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
529 ; GFX11-NEXT: s_nop 0
530 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
531 ; GFX11-NEXT: s_endpgm
532 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but not used; the store writes through %out.
533 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
534 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
535 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
; Low half adds 32; high half adds 0.
536 %add = add <2 x i16> %a, <i16 32, i16 0>
537 store <2 x i16> %add, ptr addrspace(1) %out
541 ; The high element (16256 = 0x3f80) makes the packed constant 0x3f800000, the bit pattern of the fp inline constant 1.0.
; v_test_add_v2i16_inline_fp_split: adds <0, 16256> to a per-workitem volatile
; load. 16256 = 0x3f80, and the gfx900, gfx1010 and gfx1100 checks expect the
; packed constant to be printed as the operand 1.0 of v_pk_add_u16. tonga adds
; 0x3f80 to the high half through SDWA and merges the untouched low word with
; an SDWA v_or_b32.
542 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
543 ; VI-LABEL: v_test_add_v2i16_inline_fp_split:
545 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
546 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
547 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
548 ; VI-NEXT: s_waitcnt lgkmcnt(0)
549 ; VI-NEXT: v_mov_b32_e32 v1, s3
550 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
551 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
552 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
553 ; VI-NEXT: s_waitcnt vmcnt(0)
554 ; VI-NEXT: v_mov_b32_e32 v0, s0
555 ; VI-NEXT: v_mov_b32_e32 v1, s1
556 ; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
557 ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
558 ; VI-NEXT: flat_store_dword v[0:1], v2
561 ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
563 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
564 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
565 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
566 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
567 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
568 ; GFX9-NEXT: s_waitcnt vmcnt(0)
569 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
570 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
571 ; GFX9-NEXT: s_endpgm
573 ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
575 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
576 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
577 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
578 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
580 ; GFX10-NEXT: s_waitcnt vmcnt(0)
581 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
582 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
583 ; GFX10-NEXT: s_endpgm
585 ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
587 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
588 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
589 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
590 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
591 ; GFX11-NEXT: s_waitcnt vmcnt(0)
592 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0
593 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
594 ; GFX11-NEXT: s_nop 0
595 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
596 ; GFX11-NEXT: s_endpgm
597 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but not used; the store writes through %out.
598 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
599 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
600 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
; Low half adds 0; high half adds 16256 (0x3f80).
601 %add = add <2 x i16> %a, <i16 0, i16 16256>
602 store <2 x i16> %add, ptr addrspace(1) %out
606 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; v_test_add_v2i16_zext_to_v2i32: adds two per-workitem volatile <2 x i16>
; loads, zero-extends the sum to <2 x i32>, and stores 64 bits. tonga produces
; each 32-bit element directly from a 16-bit add (the high half through SDWA
; with dst_sel:DWORD). The gfx900, gfx1010 and gfx1100 checks expect
; v_pk_add_u16 followed by a shift right by 16 for the high element and a mask
; with 0xffff for the low element.
607 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
608 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
610 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
611 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
612 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
613 ; VI-NEXT: s_waitcnt lgkmcnt(0)
614 ; VI-NEXT: v_mov_b32_e32 v1, s7
615 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
616 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
617 ; VI-NEXT: v_mov_b32_e32 v3, s1
618 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
619 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
620 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
621 ; VI-NEXT: s_waitcnt vmcnt(0)
622 ; VI-NEXT: flat_load_dword v3, v[2:3] glc
623 ; VI-NEXT: s_waitcnt vmcnt(0)
624 ; VI-NEXT: v_mov_b32_e32 v0, s4
625 ; VI-NEXT: v_mov_b32_e32 v1, s5
626 ; VI-NEXT: v_add_u16_e32 v2, v4, v3
627 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
628 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
631 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32:
633 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
634 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
635 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
636 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
637 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
638 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
640 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
642 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
643 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
644 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
645 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
646 ; GFX9-NEXT: s_endpgm
648 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32:
650 ; GFX10-NEXT: s_clause 0x1
651 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
652 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
653 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
654 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
656 ; GFX10-NEXT: s_waitcnt vmcnt(0)
657 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
658 ; GFX10-NEXT: s_waitcnt vmcnt(0)
659 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
660 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
661 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
662 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
663 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
664 ; GFX10-NEXT: s_endpgm
666 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
668 ; GFX11-NEXT: s_clause 0x1
669 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
670 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
671 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
672 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
673 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
674 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
675 ; GFX11-NEXT: s_waitcnt vmcnt(0)
676 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
677 ; GFX11-NEXT: s_waitcnt vmcnt(0)
678 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
679 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
680 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
681 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
682 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
683 ; GFX11-NEXT: s_nop 0
684 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
685 ; GFX11-NEXT: s_endpgm
686 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out (indexed as <2 x i32>) is computed but not used; the
; store writes through %out.
687 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
688 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
689 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
690 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
691 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; 16-bit vector add, then widen each element with zero extension.
692 %add = add <2 x i16> %a, %b
693 %ext = zext <2 x i16> %add to <2 x i32>
694 store <2 x i32> %ext, ptr addrspace(1) %out
698 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; v_test_add_v2i16_zext_to_v2i64: adds two per-workitem volatile <2 x i16>
; loads, zero-extends the sum to <2 x i64>, and stores 128 bits. On every
; target the upper dword of each 64-bit element is a zero held in v1 and v3.
; tonga builds the low dwords from separate 16-bit adds; the gfx900, gfx1010
; and gfx1100 checks expect v_pk_add_u16, then v_alignbit_b32 with 0 and a
; shift of 16 for the high element and a mask with 0xffff for the low element.
699 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
700 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
702 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
703 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
704 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
705 ; VI-NEXT: s_waitcnt lgkmcnt(0)
706 ; VI-NEXT: v_mov_b32_e32 v1, s7
707 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
708 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
709 ; VI-NEXT: v_mov_b32_e32 v3, s1
710 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
711 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
712 ; VI-NEXT: flat_load_dword v6, v[0:1] glc
713 ; VI-NEXT: s_waitcnt vmcnt(0)
714 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
715 ; VI-NEXT: s_waitcnt vmcnt(0)
716 ; VI-NEXT: v_mov_b32_e32 v1, 0
717 ; VI-NEXT: v_mov_b32_e32 v4, s4
718 ; VI-NEXT: v_mov_b32_e32 v5, s5
719 ; VI-NEXT: v_mov_b32_e32 v3, v1
720 ; VI-NEXT: v_add_u16_e32 v0, v6, v2
721 ; VI-NEXT: v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
722 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
725 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64:
727 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
728 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
729 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
730 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
731 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
732 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
733 ; GFX9-NEXT: s_waitcnt vmcnt(0)
734 ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc
735 ; GFX9-NEXT: s_waitcnt vmcnt(0)
736 ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3
737 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16
738 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
739 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
740 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5]
741 ; GFX9-NEXT: s_endpgm
743 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64:
745 ; GFX10-NEXT: s_clause 0x1
746 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
747 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
748 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
749 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
750 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
751 ; GFX10-NEXT: s_waitcnt vmcnt(0)
752 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
753 ; GFX10-NEXT: s_waitcnt vmcnt(0)
754 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
755 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
756 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16
757 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
758 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
759 ; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5]
760 ; GFX10-NEXT: s_endpgm
762 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
764 ; GFX11-NEXT: s_clause 0x1
765 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
766 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
767 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
768 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
769 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
770 ; GFX11-NEXT: s_waitcnt vmcnt(0)
771 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
772 ; GFX11-NEXT: s_waitcnt vmcnt(0)
773 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
774 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
775 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
776 ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
777 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
778 ; GFX11-NEXT: global_store_b128 v1, v[0:3], s[4:5]
779 ; GFX11-NEXT: s_nop 0
780 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
781 ; GFX11-NEXT: s_endpgm
782 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out (indexed as <2 x i64>) is computed but not used; the
; store writes through %out.
783 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
784 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
785 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
786 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
787 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; 16-bit vector add, then widen each element to 64 bits with zero extension.
788 %add = add <2 x i16> %a, %b
789 %ext = zext <2 x i16> %add to <2 x i64>
790 store <2 x i64> %ext, ptr addrspace(1) %out
794 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: loads one <2 x i16> from %in0 and one from %in1 (both
; loads volatile, both addressed through a gep indexed by the workitem id),
; adds the two vectors, sign-extends the sum to <2 x i32> and stores it.
; Expected selection, per the autogenerated checks below:
;   VI:          v_add_u16 for the low half plus an SDWA v_add_u16 for the
;                high half, then v_bfe_i32 on each result.
;   GFX9/10/11:  a single v_pk_add_u16, then v_ashrrev_i32 by 16 for the high
;                half and v_bfe_i32 for the low half.
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them rather than editing them by hand.
795 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
796 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
798 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
799 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
800 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
801 ; VI-NEXT: s_waitcnt lgkmcnt(0)
802 ; VI-NEXT: v_mov_b32_e32 v1, s7
803 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
804 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
805 ; VI-NEXT: v_mov_b32_e32 v3, s1
806 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
807 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
808 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
809 ; VI-NEXT: s_waitcnt vmcnt(0)
810 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
811 ; VI-NEXT: s_waitcnt vmcnt(0)
812 ; VI-NEXT: v_mov_b32_e32 v0, s4
813 ; VI-NEXT: v_mov_b32_e32 v1, s5
814 ; VI-NEXT: v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
815 ; VI-NEXT: v_add_u16_e32 v2, v4, v2
816 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
817 ; VI-NEXT: v_bfe_i32 v3, v3, 0, 16
818 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
821 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32:
823 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
824 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
825 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
826 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
827 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
828 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
829 ; GFX9-NEXT: s_waitcnt vmcnt(0)
830 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
831 ; GFX9-NEXT: s_waitcnt vmcnt(0)
832 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
833 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
834 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
835 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
836 ; GFX9-NEXT: s_endpgm
838 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32:
840 ; GFX10-NEXT: s_clause 0x1
841 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
842 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
843 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
844 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
845 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
846 ; GFX10-NEXT: s_waitcnt vmcnt(0)
847 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
848 ; GFX10-NEXT: s_waitcnt vmcnt(0)
849 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
850 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
851 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0
852 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
853 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
854 ; GFX10-NEXT: s_endpgm
856 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32:
858 ; GFX11-NEXT: s_clause 0x1
859 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
860 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
861 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
862 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
863 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
864 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
865 ; GFX11-NEXT: s_waitcnt vmcnt(0)
866 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
867 ; GFX11-NEXT: s_waitcnt vmcnt(0)
868 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
869 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
870 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0
871 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
872 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
873 ; GFX11-NEXT: s_nop 0
874 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
875 ; GFX11-NEXT: s_endpgm
; Per-workitem index used by the three geps below.
876 %tid = call i32 @llvm.amdgcn.workitem.id.x()
877 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
878 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
879 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; Volatile loads: the checks above expect each load to be followed by its own
; s_waitcnt vmcnt(0).
880 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
881 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
882 %add = add <2 x i16> %a, %b
883 %ext = sext <2 x i16> %add to <2 x i32>
; NOTE(review): the store targets %out directly, so %gep.out is unused here.
; Presumably this is the uniform-address situation the FIXME above refers to;
; confirm before changing, since the check lines depend on it.
884 store <2 x i32> %ext, ptr addrspace(1) %out
888 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: loads one <2 x i16> from %in0 and one from %in1 (plain,
; non-volatile loads addressed through a gep indexed by the workitem id), adds
; the two vectors, sign-extends the sum to <2 x i64> and stores it.
; Expected selection, per the autogenerated checks below:
;   VI:          v_add_u16 plus an SDWA v_add_u16 for the high half, v_bfe_i32
;                on each result, then v_ashrrev_i32 by 31 to form the upper
;                dword of each 64-bit element.
;   GFX9/10/11:  a single v_pk_add_u16, v_lshrrev_b32 by 16 to extract the high
;                half, v_bfe_i32 on both halves, then v_ashrrev_i32 by 31 for
;                the upper dwords.
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them rather than editing them by hand.
889 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
890 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
892 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
893 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
894 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
895 ; VI-NEXT: s_waitcnt lgkmcnt(0)
896 ; VI-NEXT: v_mov_b32_e32 v1, s7
897 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
898 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
899 ; VI-NEXT: v_mov_b32_e32 v3, s1
900 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
901 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
902 ; VI-NEXT: flat_load_dword v0, v[0:1]
903 ; VI-NEXT: flat_load_dword v1, v[2:3]
904 ; VI-NEXT: v_mov_b32_e32 v4, s4
905 ; VI-NEXT: v_mov_b32_e32 v5, s5
906 ; VI-NEXT: s_waitcnt vmcnt(0)
907 ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
908 ; VI-NEXT: v_add_u16_e32 v0, v0, v1
909 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
910 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
911 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
912 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
913 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
916 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64:
918 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
919 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
920 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
921 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
922 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
923 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
924 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
925 ; GFX9-NEXT: s_waitcnt vmcnt(0)
926 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
927 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
928 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
929 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
930 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
931 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
932 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
933 ; GFX9-NEXT: s_endpgm
935 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64:
937 ; GFX10-NEXT: s_clause 0x1
938 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
939 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
940 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
941 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
942 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
943 ; GFX10-NEXT: s_clause 0x1
944 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
945 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
946 ; GFX10-NEXT: s_waitcnt vmcnt(0)
947 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
948 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
949 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
950 ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16
951 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
952 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
953 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
954 ; GFX10-NEXT: s_endpgm
956 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64:
958 ; GFX11-NEXT: s_clause 0x1
959 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
960 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
961 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
962 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
963 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
964 ; GFX11-NEXT: s_clause 0x1
965 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
966 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
967 ; GFX11-NEXT: s_waitcnt vmcnt(0)
968 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
969 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
970 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
971 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
972 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
973 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
974 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
975 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
976 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
977 ; GFX11-NEXT: s_nop 0
978 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
979 ; GFX11-NEXT: s_endpgm
; Per-workitem index used by the three geps below.
980 %tid = call i32 @llvm.amdgcn.workitem.id.x()
981 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
982 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
983 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; These loads are not volatile: the checks above expect both loads to be issued
; back to back (no glc) with a single s_waitcnt vmcnt(0) after the pair.
984 %a = load <2 x i16>, ptr addrspace(1) %gep.in0
985 %b = load <2 x i16>, ptr addrspace(1) %gep.in1
986 %add = add <2 x i16> %a, %b
987 %ext = sext <2 x i16> %add to <2 x i64>
; NOTE(review): the store targets %out directly, so %gep.out is unused here.
; Presumably this is the uniform-address situation the FIXME above refers to;
; confirm before changing, since the check lines depend on it.
988 store <2 x i64> %ext, ptr addrspace(1) %out
; Non-kernel function under test: adds the constant vector <i16 -1, i16 0> to
; %x, i.e. only the first element is changed.
; Expected selection, per the autogenerated checks below:
;   VI:          mask off the upper 16 bits (0xffff0000), v_add_u16 with -1 on
;                the low half, then OR the two pieces back together.
;   GFX9/10/11:  a single v_pk_sub_u16 v0, v0, 1 (the add of -1 is emitted as
;                a packed subtract of 1).
992 define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
993 ; VI-LABEL: add_inline_imm_neg1_0:
995 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
996 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
997 ; VI-NEXT: v_add_u16_e32 v0, -1, v0
998 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
999 ; VI-NEXT: s_setpc_b64 s[30:31]
1001 ; GFX9-LABEL: add_inline_imm_neg1_0:
1003 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1
1005 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1007 ; GFX10-LABEL: add_inline_imm_neg1_0:
1009 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1
1011 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1013 ; GFX11-LABEL: add_inline_imm_neg1_0:
1015 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1
1017 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1018 %y = add <2 x i16> %x, <i16 -1, i16 0>
; Non-kernel function under test: adds the constant vector <i16 1, i16 0> to
; %x, i.e. only the first element is changed.
; Expected selection, per the autogenerated checks below:
;   VI:          mask off the upper 16 bits (0xffff0000), v_add_u16 with 1 on
;                the low half, then OR the two pieces back together.
;   GFX9/10/11:  a single v_pk_add_u16 v0, v0, 1.
1022 define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
1023 ; VI-LABEL: add_inline_imm_1_0:
1025 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1026 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
1027 ; VI-NEXT: v_add_u16_e32 v0, 1, v0
1028 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1029 ; VI-NEXT: s_setpc_b64 s[30:31]
1031 ; GFX9-LABEL: add_inline_imm_1_0:
1033 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1034 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1
1035 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1037 ; GFX10-LABEL: add_inline_imm_1_0:
1039 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1
1041 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1043 ; GFX11-LABEL: add_inline_imm_1_0:
1045 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1046 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1
1047 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1048 %y = add <2 x i16> %x, <i16 1, i16 0>
; Intrinsic the kernels call to obtain %tid; it carries attribute group #0.
1052 declare i32 @llvm.amdgcn.workitem.id.x() #0
; #0 is attached to the intrinsic declaration above; #1 is the group referenced
; by the amdgpu_kernel definitions in this file.
1054 attributes #0 = { nounwind readnone }
1055 attributes #1 = { nounwind }