1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7 ; FIXME: Need to handle non-uniform case for function below (load without gep).
8 ; FIXME: On VI, the v_or_b32 that merges the two 16-bit halves should be unnecessary.
; v_test_add_v2i16: add two <2 x i16> values fetched with volatile loads from
; %in0 and %in1, each indexed by the workitem id.
; Expected selection, per the checks below:
;   VI    - v_add_u16 for the low half, an SDWA v_add_u16 for the high half,
;           then v_or_b32 to merge the two results.
;   GFX9+ - a single v_pk_add_u16.
; Note: %gep.out is computed but never used; the store goes through %out.
9 define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
10 ; VI-LABEL: v_test_add_v2i16:
12 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
13 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
14 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
15 ; VI-NEXT: s_waitcnt lgkmcnt(0)
16 ; VI-NEXT: v_mov_b32_e32 v1, s7
17 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
18 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
19 ; VI-NEXT: v_mov_b32_e32 v3, s1
20 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
21 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
22 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
23 ; VI-NEXT: s_waitcnt vmcnt(0)
24 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
25 ; VI-NEXT: s_waitcnt vmcnt(0)
26 ; VI-NEXT: v_mov_b32_e32 v0, s4
27 ; VI-NEXT: v_mov_b32_e32 v1, s5
28 ; VI-NEXT: v_add_u16_e32 v3, v4, v2
29 ; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
30 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
31 ; VI-NEXT: flat_store_dword v[0:1], v2
34 ; GFX9-LABEL: v_test_add_v2i16:
36 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
37 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
38 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
39 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
41 ; GFX9-NEXT: s_waitcnt vmcnt(0)
42 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
43 ; GFX9-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
45 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
46 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
49 ; GFX10-LABEL: v_test_add_v2i16:
51 ; GFX10-NEXT: s_clause 0x1
52 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
53 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
55 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
57 ; GFX10-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
61 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2
62 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
63 ; GFX10-NEXT: s_endpgm
65 ; GFX11-LABEL: v_test_add_v2i16:
67 ; GFX11-NEXT: s_clause 0x1
68 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
69 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
70 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
71 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
72 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
76 ; GFX11-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
78 ; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
80 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
81 ; GFX11-NEXT: s_endpgm
82 %tid = call i32 @llvm.amdgcn.workitem.id.x()
83 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
84 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
85 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
86 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
87 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
88 %add = add <2 x i16> %a, %b
89 store <2 x i16> %add, ptr addrspace(1) %out
; s_test_add_v2i16: add two <2 x i16> values loaded (non-volatile) through the
; addrspace(4) pointers %in0 and %in1; the checks show both loads selected as
; scalar s_load instructions.
; Expected selection, per the checks below:
;   VI       - the halves are split with s_lshr_b32, added with s_add_i32 and
;              repacked with s_and_b32 / s_lshl_b32 / s_or_b32.
;   GFX9     - one operand is first copied to a VGPR, then v_pk_add_u16.
;   GFX10/11 - v_pk_add_u16 takes both SGPR operands directly.
93 define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
94 ; VI-LABEL: s_test_add_v2i16:
96 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
97 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
98 ; VI-NEXT: s_waitcnt lgkmcnt(0)
99 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
100 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
101 ; VI-NEXT: v_mov_b32_e32 v0, s4
102 ; VI-NEXT: v_mov_b32_e32 v1, s5
103 ; VI-NEXT: s_waitcnt lgkmcnt(0)
104 ; VI-NEXT: s_lshr_b32 s1, s2, 16
105 ; VI-NEXT: s_lshr_b32 s3, s0, 16
106 ; VI-NEXT: s_add_i32 s2, s2, s0
107 ; VI-NEXT: s_add_i32 s1, s1, s3
108 ; VI-NEXT: s_and_b32 s0, s2, 0xffff
109 ; VI-NEXT: s_lshl_b32 s1, s1, 16
110 ; VI-NEXT: s_or_b32 s0, s0, s1
111 ; VI-NEXT: v_mov_b32_e32 v2, s0
112 ; VI-NEXT: flat_store_dword v[0:1], v2
115 ; GFX9-LABEL: s_test_add_v2i16:
117 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
118 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
119 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
120 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
121 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
122 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
123 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
124 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
125 ; GFX9-NEXT: v_pk_add_u16 v1, s1, v1
126 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
127 ; GFX9-NEXT: s_endpgm
129 ; GFX10-LABEL: s_test_add_v2i16:
131 ; GFX10-NEXT: s_clause 0x1
132 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
133 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
134 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
135 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
136 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
137 ; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0
138 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX10-NEXT: v_pk_add_u16 v1, s0, s1
140 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
141 ; GFX10-NEXT: s_endpgm
143 ; GFX11-LABEL: s_test_add_v2i16:
145 ; GFX11-NEXT: s_clause 0x1
146 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
147 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
148 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
149 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
151 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
152 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s0
154 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
155 ; GFX11-NEXT: s_nop 0
156 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
157 ; GFX11-NEXT: s_endpgm
158 %a = load <2 x i16>, ptr addrspace(4) %in0
159 %b = load <2 x i16>, ptr addrspace(4) %in1
160 %add = add <2 x i16> %a, %b
161 store <2 x i16> %add, ptr addrspace(1) %out
; s_test_add_self_v2i16: add a <2 x i16> value, loaded through the
; addrspace(4) pointer %in0, to itself.
; Expected selection, per the checks below:
;   VI    - halves are separated (s_lshr_b32 / s_and_b32), each is added to
;           itself with s_add_i32, and the result is repacked with
;           s_lshl_b32 / s_and_b32 / s_or_b32.
;   GFX9+ - v_pk_add_u16 with the same SGPR as both source operands.
165 define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
166 ; VI-LABEL: s_test_add_self_v2i16:
168 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
169 ; VI-NEXT: s_waitcnt lgkmcnt(0)
170 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
171 ; VI-NEXT: v_mov_b32_e32 v0, s0
172 ; VI-NEXT: v_mov_b32_e32 v1, s1
173 ; VI-NEXT: s_waitcnt lgkmcnt(0)
174 ; VI-NEXT: s_lshr_b32 s0, s2, 16
175 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
176 ; VI-NEXT: s_add_i32 s1, s1, s1
177 ; VI-NEXT: s_add_i32 s0, s0, s0
178 ; VI-NEXT: s_lshl_b32 s0, s0, 16
179 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
180 ; VI-NEXT: s_or_b32 s0, s1, s0
181 ; VI-NEXT: v_mov_b32_e32 v2, s0
182 ; VI-NEXT: flat_store_dword v[0:1], v2
185 ; GFX9-LABEL: s_test_add_self_v2i16:
187 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
188 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
189 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
191 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
192 ; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
193 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
194 ; GFX9-NEXT: s_endpgm
196 ; GFX10-LABEL: s_test_add_self_v2i16:
198 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
199 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
202 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
204 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
205 ; GFX10-NEXT: s_endpgm
207 ; GFX11-LABEL: s_test_add_self_v2i16:
209 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
210 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
211 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
212 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
213 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
215 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
216 ; GFX11-NEXT: s_nop 0
217 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
218 ; GFX11-NEXT: s_endpgm
219 %a = load <2 x i16>, ptr addrspace(4) %in0
220 %add = add <2 x i16> %a, %a
221 store <2 x i16> %add, ptr addrspace(1) %out
225 ; FIXME: VI should not scalarize arg access.
; s_test_add_v2i16_kernarg: add two <2 x i16> values passed directly as kernel
; arguments. The checks show %out, %a and %b arriving via one 128-bit scalar
; load, with %a in s2 and %b in s3.
; Expected selection, per the checks below:
;   VI       - halves split with s_lshr_b32, added with s_add_i32 and
;              repacked with s_and_b32 / s_lshl_b32 / s_or_b32.
;   GFX9     - s3 is copied to a VGPR before v_pk_add_u16.
;   GFX10/11 - v_pk_add_u16 takes s2 and s3 directly.
226 define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
227 ; VI-LABEL: s_test_add_v2i16_kernarg:
229 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
230 ; VI-NEXT: s_waitcnt lgkmcnt(0)
231 ; VI-NEXT: s_lshr_b32 s4, s2, 16
232 ; VI-NEXT: s_lshr_b32 s5, s3, 16
233 ; VI-NEXT: s_add_i32 s2, s2, s3
234 ; VI-NEXT: s_add_i32 s4, s4, s5
235 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
236 ; VI-NEXT: s_lshl_b32 s3, s4, 16
237 ; VI-NEXT: s_or_b32 s2, s2, s3
238 ; VI-NEXT: v_mov_b32_e32 v0, s0
239 ; VI-NEXT: v_mov_b32_e32 v1, s1
240 ; VI-NEXT: v_mov_b32_e32 v2, s2
241 ; VI-NEXT: flat_store_dword v[0:1], v2
244 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
246 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
247 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
248 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
250 ; GFX9-NEXT: v_pk_add_u16 v1, s2, v1
251 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
252 ; GFX9-NEXT: s_endpgm
254 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
256 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
257 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
258 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
259 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3
260 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
261 ; GFX10-NEXT: s_endpgm
263 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
265 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
266 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
267 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3
269 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
270 ; GFX11-NEXT: s_nop 0
271 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
272 ; GFX11-NEXT: s_endpgm
273 %add = add <2 x i16> %a, %b
274 store <2 x i16> %add, ptr addrspace(1) %out
279 ; FIXME: On VI, eliminate the v_or_b32 by using SDWA.
; v_test_add_v2i16_constant: add the constant vector <123, 456> (0x7b, 0x1c8)
; to a <2 x i16> value fetched with a volatile load indexed by workitem id.
; Expected selection, per the checks below:
;   VI       - v_add_u16 with literal 0x7b for the low half, an SDWA v_add_u16
;              against a VGPR holding 0x1c8 for the high half, then v_or_b32.
;   GFX9     - the packed literal 0x1c8007b is moved into an SGPR and used by
;              v_pk_add_u16.
;   GFX10/11 - v_pk_add_u16 takes the literal 0x1c8007b directly.
; Note: %gep.out is computed but never used; the store goes through %out.
280 define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
281 ; VI-LABEL: v_test_add_v2i16_constant:
283 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
284 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
285 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
286 ; VI-NEXT: s_waitcnt lgkmcnt(0)
287 ; VI-NEXT: v_mov_b32_e32 v1, s3
288 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
289 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
290 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
291 ; VI-NEXT: s_waitcnt vmcnt(0)
292 ; VI-NEXT: v_mov_b32_e32 v0, s0
293 ; VI-NEXT: v_mov_b32_e32 v1, s1
294 ; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
295 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
296 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
297 ; VI-NEXT: flat_store_dword v[0:1], v2
300 ; GFX9-LABEL: v_test_add_v2i16_constant:
302 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
303 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
304 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
305 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
307 ; GFX9-NEXT: s_waitcnt vmcnt(0)
308 ; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
309 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
310 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
311 ; GFX9-NEXT: s_endpgm
313 ; GFX10-LABEL: v_test_add_v2i16_constant:
315 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
316 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
317 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
318 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
319 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
320 ; GFX10-NEXT: s_waitcnt vmcnt(0)
321 ; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
322 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
323 ; GFX10-NEXT: s_endpgm
325 ; GFX11-LABEL: v_test_add_v2i16_constant:
327 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
328 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
331 ; GFX11-NEXT: s_waitcnt vmcnt(0)
332 ; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
333 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
334 ; GFX11-NEXT: s_nop 0
335 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
336 ; GFX11-NEXT: s_endpgm
337 %tid = call i32 @llvm.amdgcn.workitem.id.x()
338 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
339 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
340 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
341 %add = add <2 x i16> %a, <i16 123, i16 456>
342 store <2 x i16> %add, ptr addrspace(1) %out
345 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; v_test_add_v2i16_neg_constant: add the negative constant vector <-845, -991>
; (0xfcb3, 0xfc21 as 16-bit values) to a <2 x i16> value fetched with a
; volatile load indexed by workitem id.
; Expected selection, per the checks below:
;   VI       - v_add_u16 with literal 0xfcb3 for the low half, an SDWA
;              v_add_u16 against a VGPR holding 0xfffffc21 for the high half,
;              then v_or_b32.
;   GFX9     - the packed literal 0xfc21fcb3 is moved into an SGPR and used by
;              v_pk_add_u16.
;   GFX10/11 - v_pk_add_u16 takes the literal 0xfc21fcb3 directly.
; Note: %gep.out is computed but never used; the store goes through %out.
346 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
347 ; VI-LABEL: v_test_add_v2i16_neg_constant:
349 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
350 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
351 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
352 ; VI-NEXT: s_waitcnt lgkmcnt(0)
353 ; VI-NEXT: v_mov_b32_e32 v1, s3
354 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
355 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
356 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
357 ; VI-NEXT: s_waitcnt vmcnt(0)
358 ; VI-NEXT: v_mov_b32_e32 v0, s0
359 ; VI-NEXT: v_mov_b32_e32 v1, s1
360 ; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
361 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
362 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
363 ; VI-NEXT: flat_store_dword v[0:1], v2
366 ; GFX9-LABEL: v_test_add_v2i16_neg_constant:
368 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
369 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
370 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
371 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
372 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
373 ; GFX9-NEXT: s_waitcnt vmcnt(0)
374 ; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
375 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
376 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
377 ; GFX9-NEXT: s_endpgm
379 ; GFX10-LABEL: v_test_add_v2i16_neg_constant:
381 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
382 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
383 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
384 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
386 ; GFX10-NEXT: s_waitcnt vmcnt(0)
387 ; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
388 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
389 ; GFX10-NEXT: s_endpgm
391 ; GFX11-LABEL: v_test_add_v2i16_neg_constant:
393 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
394 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
395 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
396 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
397 ; GFX11-NEXT: s_waitcnt vmcnt(0)
398 ; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
399 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
400 ; GFX11-NEXT: s_nop 0
401 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
402 ; GFX11-NEXT: s_endpgm
403 %tid = call i32 @llvm.amdgcn.workitem.id.x()
404 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
405 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
406 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
407 %add = add <2 x i16> %a, <i16 -845, i16 -991>
408 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_inline_neg1: add the splat constant <-1, -1> to a
; <2 x i16> value fetched with a volatile load indexed by workitem id.
; Expected selection, per the checks below:
;   VI    - v_add_u16 with immediate -1 for the low half, an SDWA v_add_u16
;           against a VGPR holding -1 for the high half, then v_or_b32.
;   GFX9+ - the add of -1 is emitted as v_pk_sub_u16 with immediate 1 and
;           op_sel_hi:[1,0].
; Note: %gep.out is computed but never used; the store goes through %out.
412 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
413 ; VI-LABEL: v_test_add_v2i16_inline_neg1:
415 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
416 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
417 ; VI-NEXT: v_mov_b32_e32 v3, -1
418 ; VI-NEXT: s_waitcnt lgkmcnt(0)
419 ; VI-NEXT: v_mov_b32_e32 v1, s3
420 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
421 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
422 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
423 ; VI-NEXT: s_waitcnt vmcnt(0)
424 ; VI-NEXT: v_mov_b32_e32 v0, s0
425 ; VI-NEXT: v_mov_b32_e32 v1, s1
426 ; VI-NEXT: v_add_u16_e32 v4, -1, v2
427 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
428 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
429 ; VI-NEXT: flat_store_dword v[0:1], v2
432 ; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
434 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
435 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
436 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
437 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
438 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
439 ; GFX9-NEXT: s_waitcnt vmcnt(0)
440 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0]
441 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
442 ; GFX9-NEXT: s_endpgm
444 ; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
446 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
447 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
448 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
449 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
450 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
451 ; GFX10-NEXT: s_waitcnt vmcnt(0)
452 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0]
453 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
454 ; GFX10-NEXT: s_endpgm
456 ; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
458 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
459 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
460 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
461 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
462 ; GFX11-NEXT: s_waitcnt vmcnt(0)
463 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0]
464 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
465 ; GFX11-NEXT: s_nop 0
466 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
467 ; GFX11-NEXT: s_endpgm
468 %tid = call i32 @llvm.amdgcn.workitem.id.x()
469 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
470 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
471 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
472 %add = add <2 x i16> %a, <i16 -1, i16 -1>
473 store <2 x i16> %add, ptr addrspace(1) %out
; v_test_add_v2i16_inline_lo_zero_hi: add <32, 0> to a <2 x i16> value fetched
; with a volatile load indexed by workitem id, i.e. only the low element
; receives a non-zero addend.
; Expected selection, per the checks below:
;   VI    - only the low half is added (v_add_u16 with 32); the untouched high
;           half is kept via v_and_b32 with 0xffff0000 and merged with v_or_b32.
;   GFX9+ - v_pk_add_u16 with the immediate 32 and no op_sel modifiers.
; Note: %gep.out is computed but never used; the store goes through %out.
477 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
478 ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
480 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
481 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
482 ; VI-NEXT: s_waitcnt lgkmcnt(0)
483 ; VI-NEXT: v_mov_b32_e32 v1, s3
484 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
485 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
486 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
487 ; VI-NEXT: s_waitcnt vmcnt(0)
488 ; VI-NEXT: v_mov_b32_e32 v0, s0
489 ; VI-NEXT: v_mov_b32_e32 v1, s1
490 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
491 ; VI-NEXT: v_add_u16_e32 v2, 32, v2
492 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
493 ; VI-NEXT: flat_store_dword v[0:1], v2
496 ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
498 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
499 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
500 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
501 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
503 ; GFX9-NEXT: s_waitcnt vmcnt(0)
504 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
505 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
506 ; GFX9-NEXT: s_endpgm
508 ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
510 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
511 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
512 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
513 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
514 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
515 ; GFX10-NEXT: s_waitcnt vmcnt(0)
516 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
517 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
518 ; GFX10-NEXT: s_endpgm
520 ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
522 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
523 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
524 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
525 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
526 ; GFX11-NEXT: s_waitcnt vmcnt(0)
527 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
528 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
529 ; GFX11-NEXT: s_nop 0
530 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
531 ; GFX11-NEXT: s_endpgm
532 %tid = call i32 @llvm.amdgcn.workitem.id.x()
533 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
534 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
535 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
536 %add = add <2 x i16> %a, <i16 32, i16 0>
537 store <2 x i16> %add, ptr addrspace(1) %out
541 ; The high element (16256 = 0x3f80) makes the packed 32-bit constant 0x3f800000, the bit pattern of fp 1.0.
; v_test_add_v2i16_inline_fp_split: add <0, 16256> (16256 = 0x3f80) to a
; <2 x i16> value fetched with a volatile load indexed by workitem id, i.e.
; only the high element receives a non-zero addend.
; Expected selection, per the checks below:
;   VI       - a single SDWA v_add_u16 on the high half against a VGPR holding
;              0x3f80, merged with the original low word by an SDWA v_or_b32.
;   GFX9     - the packed constant is materialized as `s_mov_b32 s2, 1.0` and
;              fed to v_pk_add_u16.
;   GFX10/11 - v_pk_add_u16 uses the literal 0x3f80 with
;              op_sel:[1,0] op_sel_hi:[0,1].
; Note: %gep.out is computed but never used; the store goes through %out.
542 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
543 ; VI-LABEL: v_test_add_v2i16_inline_fp_split:
545 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
546 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
547 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
548 ; VI-NEXT: s_waitcnt lgkmcnt(0)
549 ; VI-NEXT: v_mov_b32_e32 v1, s3
550 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
551 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
552 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
553 ; VI-NEXT: s_waitcnt vmcnt(0)
554 ; VI-NEXT: v_mov_b32_e32 v0, s0
555 ; VI-NEXT: v_mov_b32_e32 v1, s1
556 ; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
557 ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
558 ; VI-NEXT: flat_store_dword v[0:1], v2
561 ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
563 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
564 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
565 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
566 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
567 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
568 ; GFX9-NEXT: s_waitcnt vmcnt(0)
569 ; GFX9-NEXT: s_mov_b32 s2, 1.0
570 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
571 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
572 ; GFX9-NEXT: s_endpgm
574 ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
576 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
577 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
578 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
579 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
581 ; GFX10-NEXT: s_waitcnt vmcnt(0)
582 ; GFX10-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1]
583 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
584 ; GFX10-NEXT: s_endpgm
586 ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
588 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
589 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
590 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
591 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
592 ; GFX11-NEXT: s_waitcnt vmcnt(0)
593 ; GFX11-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1]
594 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
595 ; GFX11-NEXT: s_nop 0
596 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
597 ; GFX11-NEXT: s_endpgm
598 %tid = call i32 @llvm.amdgcn.workitem.id.x()
599 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
600 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
601 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
602 %add = add <2 x i16> %a, <i16 0, i16 16256>
603 store <2 x i16> %add, ptr addrspace(1) %out
607 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; v_test_add_v2i16_zext_to_v2i32: add two volatile-loaded <2 x i16> values and
; zero-extend the sum to <2 x i32> before storing it (a 64-bit store).
; Expected selection, per the checks below:
;   VI    - v_add_u16 for the low element and an SDWA v_add_u16 with
;           dst_sel:DWORD for the high element; the two results are stored
;           directly as the register pair v[2:3] with no separate masking.
;   GFX9+ - v_pk_add_u16, then the elements are separated with
;           v_lshrrev_b32 by 16 (high) and v_and_b32 with 0xffff (low).
; Note: %gep.out is computed but never used; the store goes through %out.
608 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
609 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
611 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
612 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
613 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
614 ; VI-NEXT: s_waitcnt lgkmcnt(0)
615 ; VI-NEXT: v_mov_b32_e32 v1, s7
616 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
617 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
618 ; VI-NEXT: v_mov_b32_e32 v3, s1
619 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
620 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
621 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
622 ; VI-NEXT: s_waitcnt vmcnt(0)
623 ; VI-NEXT: flat_load_dword v3, v[2:3] glc
624 ; VI-NEXT: s_waitcnt vmcnt(0)
625 ; VI-NEXT: v_mov_b32_e32 v0, s4
626 ; VI-NEXT: v_mov_b32_e32 v1, s5
627 ; VI-NEXT: v_add_u16_e32 v2, v4, v3
628 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
629 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
632 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32:
634 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
635 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
636 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
637 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
638 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
639 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
640 ; GFX9-NEXT: s_waitcnt vmcnt(0)
641 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
642 ; GFX9-NEXT: s_waitcnt vmcnt(0)
643 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
644 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
645 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
646 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
647 ; GFX9-NEXT: s_endpgm
649 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32:
651 ; GFX10-NEXT: s_clause 0x1
652 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
653 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
654 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
655 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
656 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
657 ; GFX10-NEXT: s_waitcnt vmcnt(0)
658 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
659 ; GFX10-NEXT: s_waitcnt vmcnt(0)
660 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
661 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
662 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
663 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
664 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
665 ; GFX10-NEXT: s_endpgm
667 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
669 ; GFX11-NEXT: s_clause 0x1
670 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
671 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
672 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
673 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
674 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
675 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
676 ; GFX11-NEXT: s_waitcnt vmcnt(0)
677 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
678 ; GFX11-NEXT: s_waitcnt vmcnt(0)
679 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
680 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
681 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
682 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
683 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
684 ; GFX11-NEXT: s_nop 0
685 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
686 ; GFX11-NEXT: s_endpgm
687 %tid = call i32 @llvm.amdgcn.workitem.id.x()
688 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
689 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
690 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
691 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
692 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
693 %add = add <2 x i16> %a, %b
694 %ext = zext <2 x i16> %add to <2 x i32>
695 store <2 x i32> %ext, ptr addrspace(1) %out
699 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; v_test_add_v2i16_zext_to_v2i64: add two volatile-loaded <2 x i16> values and
; zero-extend the sum to <2 x i64> before storing it (a 128-bit store).
; Expected selection, per the checks below:
;   VI    - v_add_u16 for the low element and an SDWA v_add_u16 with
;           dst_sel:DWORD for the high element; v1 and v3 hold 0 and supply
;           the upper dword of each i64.
;   GFX9+ - v_pk_add_u16, then the high element is extracted with
;           v_alignbit_b32 (0, sum, 16) and the low element with v_and_b32
;           0xffff; v1 and v3 hold 0 for the upper dwords.
; Note: %gep.out is computed but never used; the store goes through %out.
700 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
701 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
703 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
704 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
705 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
706 ; VI-NEXT: s_waitcnt lgkmcnt(0)
707 ; VI-NEXT: v_mov_b32_e32 v1, s7
708 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
709 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
710 ; VI-NEXT: v_mov_b32_e32 v3, s1
711 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
712 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
713 ; VI-NEXT: flat_load_dword v6, v[0:1] glc
714 ; VI-NEXT: s_waitcnt vmcnt(0)
715 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
716 ; VI-NEXT: s_waitcnt vmcnt(0)
717 ; VI-NEXT: v_mov_b32_e32 v1, 0
718 ; VI-NEXT: v_mov_b32_e32 v4, s4
719 ; VI-NEXT: v_mov_b32_e32 v5, s5
720 ; VI-NEXT: v_mov_b32_e32 v3, v1
721 ; VI-NEXT: v_add_u16_e32 v0, v6, v2
722 ; VI-NEXT: v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
723 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
726 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64:
728 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
729 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
730 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
731 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
732 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
733 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
734 ; GFX9-NEXT: s_waitcnt vmcnt(0)
735 ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc
736 ; GFX9-NEXT: s_waitcnt vmcnt(0)
737 ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3
738 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16
739 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
740 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
741 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5]
742 ; GFX9-NEXT: s_endpgm
744 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64:
746 ; GFX10-NEXT: s_clause 0x1
747 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
748 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
749 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
750 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
751 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
752 ; GFX10-NEXT: s_waitcnt vmcnt(0)
753 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
754 ; GFX10-NEXT: s_waitcnt vmcnt(0)
755 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
756 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
757 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16
758 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
759 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
760 ; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5]
761 ; GFX10-NEXT: s_endpgm
763 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
765 ; GFX11-NEXT: s_clause 0x1
766 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
767 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
768 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
769 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
770 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
771 ; GFX11-NEXT: s_waitcnt vmcnt(0)
772 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
773 ; GFX11-NEXT: s_waitcnt vmcnt(0)
774 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
775 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
776 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
777 ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
778 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
779 ; GFX11-NEXT: global_store_b128 v1, v[0:3], s[4:5]
780 ; GFX11-NEXT: s_nop 0
781 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
782 ; GFX11-NEXT: s_endpgm
783 %tid = call i32 @llvm.amdgcn.workitem.id.x()
784 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
785 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
786 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
787 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
788 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
789 %add = add <2 x i16> %a, %b
790 %ext = zext <2 x i16> %add to <2 x i64>
791 store <2 x i64> %ext, ptr addrspace(1) %out
795 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test kernel: add two <2 x i16> vectors and sign-extend the sum to <2 x i32>.
; Each input pointer is indexed by the workitem id x, the two vectors are read
; with volatile loads, added, sign-extended, and the result is stored via %out.
; In the expectations below, the tonga run uses two 16-bit adds (one of them
; SDWA on the high halves) followed by two v_bfe_i32, while the gfx900,
; gfx1010 and gfx1100 runs use one v_pk_add_u16 followed by an arithmetic
; shift right by 16 (high lane) and a v_bfe_i32 (low lane).
796 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
797 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
799 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
800 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
801 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
802 ; VI-NEXT: s_waitcnt lgkmcnt(0)
803 ; VI-NEXT: v_mov_b32_e32 v1, s7
804 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
805 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
806 ; VI-NEXT: v_mov_b32_e32 v3, s1
807 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
808 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
809 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
810 ; VI-NEXT: s_waitcnt vmcnt(0)
811 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
812 ; VI-NEXT: s_waitcnt vmcnt(0)
813 ; VI-NEXT: v_mov_b32_e32 v0, s4
814 ; VI-NEXT: v_mov_b32_e32 v1, s5
815 ; VI-NEXT: v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
816 ; VI-NEXT: v_add_u16_e32 v2, v4, v2
817 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
818 ; VI-NEXT: v_bfe_i32 v3, v3, 0, 16
819 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
822 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32:
824 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
825 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
826 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
827 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
828 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
829 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
830 ; GFX9-NEXT: s_waitcnt vmcnt(0)
831 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
832 ; GFX9-NEXT: s_waitcnt vmcnt(0)
833 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
834 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
835 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
836 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
837 ; GFX9-NEXT: s_endpgm
839 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32:
841 ; GFX10-NEXT: s_clause 0x1
842 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
843 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
844 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
845 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
846 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
847 ; GFX10-NEXT: s_waitcnt vmcnt(0)
848 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
849 ; GFX10-NEXT: s_waitcnt vmcnt(0)
850 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
851 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
852 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0
853 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
854 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
855 ; GFX10-NEXT: s_endpgm
857 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32:
859 ; GFX11-NEXT: s_clause 0x1
860 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
861 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
862 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
863 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
864 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
865 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
866 ; GFX11-NEXT: s_waitcnt vmcnt(0)
867 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
868 ; GFX11-NEXT: s_waitcnt vmcnt(0)
869 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
870 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
871 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0
872 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
873 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
874 ; GFX11-NEXT: s_nop 0
875 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
876 ; GFX11-NEXT: s_endpgm
877 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; %gep.out is computed here but has no users: the store at the end of the
; body writes through %out, so the output address is the same for every lane.
878 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
; Per-work-item input addresses (element size is <2 x i16>, i.e. 4 bytes).
879 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
880 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; Both input loads are volatile.
881 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
882 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
883 %add = add <2 x i16> %a, %b
; Sign-extend each 16-bit lane of the sum to 32 bits.
884 %ext = sext <2 x i16> %add to <2 x i32>
885 store <2 x i32> %ext, ptr addrspace(1) %out
889 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test kernel: add two <2 x i16> vectors and sign-extend the sum to <2 x i64>.
; Each input pointer is indexed by the workitem id x, the two vectors are read
; with ordinary (non-volatile) loads, added, sign-extended, and the result is
; stored via %out.
; In the expectations below, each 64-bit element is formed from a v_bfe_i32
; (low dword) and an arithmetic shift right by 31 of that value (high dword).
890 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
891 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
893 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
894 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
895 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
896 ; VI-NEXT: s_waitcnt lgkmcnt(0)
897 ; VI-NEXT: v_mov_b32_e32 v1, s7
898 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
899 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
900 ; VI-NEXT: v_mov_b32_e32 v3, s1
901 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
902 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
903 ; VI-NEXT: flat_load_dword v0, v[0:1]
904 ; VI-NEXT: flat_load_dword v1, v[2:3]
905 ; VI-NEXT: v_mov_b32_e32 v4, s4
906 ; VI-NEXT: v_mov_b32_e32 v5, s5
907 ; VI-NEXT: s_waitcnt vmcnt(0)
908 ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
909 ; VI-NEXT: v_add_u16_e32 v0, v0, v1
910 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
911 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
912 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
913 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
914 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
917 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64:
919 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
920 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
921 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
922 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
923 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
924 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
925 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
926 ; GFX9-NEXT: s_waitcnt vmcnt(0)
927 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
928 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
929 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
930 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
931 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
932 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
933 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
934 ; GFX9-NEXT: s_endpgm
936 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64:
938 ; GFX10-NEXT: s_clause 0x1
939 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
940 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
941 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
942 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
943 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
944 ; GFX10-NEXT: s_clause 0x1
945 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
946 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
947 ; GFX10-NEXT: s_waitcnt vmcnt(0)
948 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
949 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
950 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
951 ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16
952 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
953 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
954 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
955 ; GFX10-NEXT: s_endpgm
957 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64:
959 ; GFX11-NEXT: s_clause 0x1
960 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
961 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
962 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
963 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
964 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
965 ; GFX11-NEXT: s_clause 0x1
966 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
967 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
968 ; GFX11-NEXT: s_waitcnt vmcnt(0)
969 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
970 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
971 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
972 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
973 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
974 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
975 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
976 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
977 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
978 ; GFX11-NEXT: s_nop 0
979 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
980 ; GFX11-NEXT: s_endpgm
981 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; %gep.out is computed here but has no users: the store at the end of the
; body writes through %out, so the output address is the same for every lane.
982 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
; Per-work-item input addresses (element size is <2 x i16>, i.e. 4 bytes).
983 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
984 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; These loads are NOT volatile. The expectations above accordingly show the
; two loads back to back without glc and with a single s_waitcnt vmcnt(0)
; after both; changing the volatility would require regenerating the checks.
985 %a = load <2 x i16>, ptr addrspace(1) %gep.in0
986 %b = load <2 x i16>, ptr addrspace(1) %gep.in1
987 %add = add <2 x i16> %a, %b
; Sign-extend each 16-bit lane of the sum to 64 bits.
988 %ext = sext <2 x i16> %add to <2 x i64>
989 store <2 x i64> %ext, ptr addrspace(1) %out
; Declaration of the intrinsic the kernels call to obtain %tid.
993 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Attribute group #0 is attached to the intrinsic declaration above;
; attribute group #1 is attached to the amdgpu_kernel definitions.
995 attributes #0 = { nounwind readnone }
996 attributes #1 = { nounwind }