1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7 ; FIXME: Need to handle non-uniform case for function below (load without gep).
8 ; FIXME: The v_or_b32 emitted on VI should be unnecessary.
; Per-workitem <2 x i16> add: each operand is loaded (volatile) from
; %in0 / %in1 at index workitem.id.x, summed, and the result is stored.
; The tonga checks expect two 16-bit adds (one SDWA) merged with v_or_b32;
; the gfx900, gfx1010 and gfx1100 checks expect a single v_pk_add_u16.
9 define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
10 ; VI-LABEL: v_test_add_v2i16:
12 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
13 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
14 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
15 ; VI-NEXT: s_waitcnt lgkmcnt(0)
16 ; VI-NEXT: v_mov_b32_e32 v1, s3
17 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
18 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
19 ; VI-NEXT: v_mov_b32_e32 v3, s5
20 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
21 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
22 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
23 ; VI-NEXT: s_waitcnt vmcnt(0)
24 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
25 ; VI-NEXT: s_waitcnt vmcnt(0)
26 ; VI-NEXT: v_mov_b32_e32 v0, s0
27 ; VI-NEXT: v_mov_b32_e32 v1, s1
28 ; VI-NEXT: v_add_u16_e32 v3, v4, v2
29 ; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
30 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
31 ; VI-NEXT: flat_store_dword v[0:1], v2
34 ; GFX9-LABEL: v_test_add_v2i16:
36 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
37 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
38 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
39 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
41 ; GFX9-NEXT: s_waitcnt vmcnt(0)
42 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
43 ; GFX9-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
45 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
46 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
49 ; GFX10-LABEL: v_test_add_v2i16:
51 ; GFX10-NEXT: s_clause 0x1
52 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
53 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
55 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
57 ; GFX10-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
61 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2
62 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
63 ; GFX10-NEXT: s_endpgm
65 ; GFX11-LABEL: v_test_add_v2i16:
67 ; GFX11-NEXT: s_clause 0x1
68 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
69 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
70 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
71 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
72 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
73 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
74 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
76 ; GFX11-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
78 ; GFX11-NEXT: s_waitcnt vmcnt(0)
79 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
80 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
81 ; GFX11-NEXT: s_endpgm
82 %tid = call i32 @llvm.amdgcn.workitem.id.x()
83 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
84 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
85 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
86 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
87 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
88 %add = add <2 x i16> %a, %b
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly, so every workitem stores to the same address.
89 store <2 x i16> %add, ptr addrspace(1) %out
; <2 x i16> add with both operands loaded from addrspace(4) pointers that
; are used as passed, with no workitem-id indexing. All four check prefixes
; expect scalar s_load instructions for the operands; tonga then adds the
; halves with scalar ALU ops, while the newer targets use v_pk_add_u16.
93 define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
94 ; VI-LABEL: s_test_add_v2i16:
96 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
97 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
98 ; VI-NEXT: s_waitcnt lgkmcnt(0)
99 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
100 ; VI-NEXT: s_load_dword s3, s[4:5], 0x0
101 ; VI-NEXT: v_mov_b32_e32 v0, s0
102 ; VI-NEXT: v_mov_b32_e32 v1, s1
103 ; VI-NEXT: s_waitcnt lgkmcnt(0)
104 ; VI-NEXT: s_lshr_b32 s0, s2, 16
105 ; VI-NEXT: s_lshr_b32 s1, s3, 16
106 ; VI-NEXT: s_add_i32 s2, s2, s3
107 ; VI-NEXT: s_add_i32 s0, s0, s1
108 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
109 ; VI-NEXT: s_lshl_b32 s0, s0, 16
110 ; VI-NEXT: s_or_b32 s0, s1, s0
111 ; VI-NEXT: v_mov_b32_e32 v2, s0
112 ; VI-NEXT: flat_store_dword v[0:1], v2
115 ; GFX9-LABEL: s_test_add_v2i16:
117 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
118 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
119 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
120 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
121 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
122 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
123 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
124 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
125 ; GFX9-NEXT: v_pk_add_u16 v1, s5, v1
126 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
127 ; GFX9-NEXT: s_endpgm
129 ; GFX10-LABEL: s_test_add_v2i16:
131 ; GFX10-NEXT: s_clause 0x1
132 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
133 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
134 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
135 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
136 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0
137 ; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0
138 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX10-NEXT: v_pk_add_u16 v1, s4, s5
140 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
141 ; GFX10-NEXT: s_endpgm
143 ; GFX11-LABEL: s_test_add_v2i16:
145 ; GFX11-NEXT: s_clause 0x1
146 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
147 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
148 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
149 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
151 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
152 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3
154 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
155 ; GFX11-NEXT: s_endpgm
156 %a = load <2 x i16>, ptr addrspace(4) %in0
157 %b = load <2 x i16>, ptr addrspace(4) %in1
158 %add = add <2 x i16> %a, %b
159 store <2 x i16> %add, ptr addrspace(1) %out
; <2 x i16> value loaded once from the addrspace(4) pointer %in0 and added
; to itself (%a + %a). The gfx900/gfx1010/gfx1100 checks expect the same
; SGPR used for both v_pk_add_u16 source operands.
163 define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
164 ; VI-LABEL: s_test_add_self_v2i16:
166 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
167 ; VI-NEXT: s_waitcnt lgkmcnt(0)
168 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
169 ; VI-NEXT: v_mov_b32_e32 v0, s0
170 ; VI-NEXT: v_mov_b32_e32 v1, s1
171 ; VI-NEXT: s_waitcnt lgkmcnt(0)
172 ; VI-NEXT: s_lshr_b32 s0, s2, 16
173 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
174 ; VI-NEXT: s_add_i32 s1, s1, s1
175 ; VI-NEXT: s_add_i32 s0, s0, s0
176 ; VI-NEXT: s_lshl_b32 s0, s0, 16
177 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
178 ; VI-NEXT: s_or_b32 s0, s1, s0
179 ; VI-NEXT: v_mov_b32_e32 v2, s0
180 ; VI-NEXT: flat_store_dword v[0:1], v2
183 ; GFX9-LABEL: s_test_add_self_v2i16:
185 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
186 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
187 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
188 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
189 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
191 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
192 ; GFX9-NEXT: s_endpgm
194 ; GFX10-LABEL: s_test_add_self_v2i16:
196 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
197 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
198 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
199 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
202 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
203 ; GFX10-NEXT: s_endpgm
205 ; GFX11-LABEL: s_test_add_self_v2i16:
207 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
208 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
209 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
211 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
212 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
213 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
214 ; GFX11-NEXT: s_endpgm
215 %a = load <2 x i16>, ptr addrspace(4) %in0
216 %add = add <2 x i16> %a, %a
217 store <2 x i16> %add, ptr addrspace(1) %out
221 ; FIXME: VI should not scalarize arg access.
; <2 x i16> add whose operands %a and %b are kernel arguments themselves
; rather than values loaded through pointers. Every check prefix expects
; %out, %a and %b to arrive via one 128-bit scalar kernarg load.
222 define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
223 ; VI-LABEL: s_test_add_v2i16_kernarg:
225 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
226 ; VI-NEXT: s_waitcnt lgkmcnt(0)
227 ; VI-NEXT: s_lshr_b32 s4, s2, 16
228 ; VI-NEXT: s_lshr_b32 s5, s3, 16
229 ; VI-NEXT: s_add_i32 s2, s2, s3
230 ; VI-NEXT: s_add_i32 s4, s4, s5
231 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
232 ; VI-NEXT: s_lshl_b32 s3, s4, 16
233 ; VI-NEXT: s_or_b32 s2, s2, s3
234 ; VI-NEXT: v_mov_b32_e32 v0, s0
235 ; VI-NEXT: v_mov_b32_e32 v1, s1
236 ; VI-NEXT: v_mov_b32_e32 v2, s2
237 ; VI-NEXT: flat_store_dword v[0:1], v2
240 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
242 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
243 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
244 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
245 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
246 ; GFX9-NEXT: v_pk_add_u16 v1, s2, v1
247 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
248 ; GFX9-NEXT: s_endpgm
250 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
252 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
253 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
254 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3
256 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
257 ; GFX10-NEXT: s_endpgm
259 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
261 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
262 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
263 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
264 ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3
265 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
266 ; GFX11-NEXT: s_endpgm
267 %add = add <2 x i16> %a, %b
268 store <2 x i16> %add, ptr addrspace(1) %out
272 ; FIXME: Eliminate the v_or_b32 on VI by using SDWA.
; Adds the constant vector <123, 456> to a per-workitem <2 x i16> load.
; Neither element is an inline constant: tonga materialises 0x1c8 (456)
; and uses the literal 0x7b (123), while the packed targets use the 32-bit
; literal 0x1c8007b (456 in the high half, 123 in the low half).
273 define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
274 ; VI-LABEL: v_test_add_v2i16_constant:
276 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
277 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
278 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
279 ; VI-NEXT: s_waitcnt lgkmcnt(0)
280 ; VI-NEXT: v_mov_b32_e32 v1, s3
281 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
282 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
283 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
284 ; VI-NEXT: s_waitcnt vmcnt(0)
285 ; VI-NEXT: v_mov_b32_e32 v0, s0
286 ; VI-NEXT: v_mov_b32_e32 v1, s1
287 ; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
288 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
289 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
290 ; VI-NEXT: flat_store_dword v[0:1], v2
293 ; GFX9-LABEL: v_test_add_v2i16_constant:
295 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
296 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
297 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
298 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
299 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
301 ; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
302 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
303 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
304 ; GFX9-NEXT: s_endpgm
306 ; GFX10-LABEL: v_test_add_v2i16_constant:
308 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
309 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
310 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
311 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
312 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
313 ; GFX10-NEXT: s_waitcnt vmcnt(0)
314 ; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
315 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
316 ; GFX10-NEXT: s_endpgm
318 ; GFX11-LABEL: v_test_add_v2i16_constant:
320 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
321 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
322 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
323 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
324 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
325 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
326 ; GFX11-NEXT: s_waitcnt vmcnt(0)
327 ; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
328 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
329 ; GFX11-NEXT: s_endpgm
330 %tid = call i32 @llvm.amdgcn.workitem.id.x()
331 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
332 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
333 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
334 %add = add <2 x i16> %a, <i16 123, i16 456>
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly.
335 store <2 x i16> %add, ptr addrspace(1) %out
339 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds the negative constant vector <-845, -991> to a per-workitem
; <2 x i16> load. As 16-bit two's-complement values these are 0xfcb3 and
; 0xfc21, so the packed targets use the 32-bit literal 0xfc21fcb3; tonga
; materialises the high element as the sign-extended 0xfffffc21.
340 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
341 ; VI-LABEL: v_test_add_v2i16_neg_constant:
343 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
344 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
345 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
346 ; VI-NEXT: s_waitcnt lgkmcnt(0)
347 ; VI-NEXT: v_mov_b32_e32 v1, s3
348 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
349 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
350 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
351 ; VI-NEXT: s_waitcnt vmcnt(0)
352 ; VI-NEXT: v_mov_b32_e32 v0, s0
353 ; VI-NEXT: v_mov_b32_e32 v1, s1
354 ; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
355 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
356 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
357 ; VI-NEXT: flat_store_dword v[0:1], v2
360 ; GFX9-LABEL: v_test_add_v2i16_neg_constant:
362 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
363 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
364 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
365 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
367 ; GFX9-NEXT: s_waitcnt vmcnt(0)
368 ; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
369 ; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
370 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
371 ; GFX9-NEXT: s_endpgm
373 ; GFX10-LABEL: v_test_add_v2i16_neg_constant:
375 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
376 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
377 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
378 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
380 ; GFX10-NEXT: s_waitcnt vmcnt(0)
381 ; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
382 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
383 ; GFX10-NEXT: s_endpgm
385 ; GFX11-LABEL: v_test_add_v2i16_neg_constant:
387 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
388 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
389 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
390 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
391 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
392 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
393 ; GFX11-NEXT: s_waitcnt vmcnt(0)
394 ; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
395 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
396 ; GFX11-NEXT: s_endpgm
397 %tid = call i32 @llvm.amdgcn.workitem.id.x()
398 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
399 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
400 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
401 %add = add <2 x i16> %a, <i16 -845, i16 -991>
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly.
402 store <2 x i16> %add, ptr addrspace(1) %out
; Adds the splat <-1, -1> to a per-workitem <2 x i16> load. The packed
; targets are expected to encode the constant directly as the operand -1
; of v_pk_add_u16, with no separate literal or register materialisation.
406 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
407 ; VI-LABEL: v_test_add_v2i16_inline_neg1:
409 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
410 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
411 ; VI-NEXT: v_mov_b32_e32 v3, -1
412 ; VI-NEXT: s_waitcnt lgkmcnt(0)
413 ; VI-NEXT: v_mov_b32_e32 v1, s3
414 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
415 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
416 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
417 ; VI-NEXT: s_waitcnt vmcnt(0)
418 ; VI-NEXT: v_mov_b32_e32 v0, s0
419 ; VI-NEXT: v_mov_b32_e32 v1, s1
420 ; VI-NEXT: v_add_u16_e32 v4, -1, v2
421 ; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
422 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
423 ; VI-NEXT: flat_store_dword v[0:1], v2
426 ; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
428 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
429 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
430 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
431 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
432 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
433 ; GFX9-NEXT: s_waitcnt vmcnt(0)
434 ; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
435 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
436 ; GFX9-NEXT: s_endpgm
438 ; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
440 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
441 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
442 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
443 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
445 ; GFX10-NEXT: s_waitcnt vmcnt(0)
446 ; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
447 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
448 ; GFX10-NEXT: s_endpgm
450 ; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
452 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
453 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
454 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
455 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
456 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
457 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
458 ; GFX11-NEXT: s_waitcnt vmcnt(0)
459 ; GFX11-NEXT: v_pk_add_u16 v0, v0, -1
460 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
461 ; GFX11-NEXT: s_endpgm
462 %tid = call i32 @llvm.amdgcn.workitem.id.x()
463 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
464 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
465 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
466 %add = add <2 x i16> %a, <i16 -1, i16 -1>
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly.
467 store <2 x i16> %add, ptr addrspace(1) %out
; Adds <32, 0> to a per-workitem <2 x i16> load: only the low element
; changes. The packed targets are expected to use the operand 32 directly
; in v_pk_add_u16; tonga adds 32 to the low half and ORs it back with the
; untouched high half (masked with 0xffff0000).
471 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
472 ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
474 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
475 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
476 ; VI-NEXT: s_waitcnt lgkmcnt(0)
477 ; VI-NEXT: v_mov_b32_e32 v1, s3
478 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
479 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
480 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
481 ; VI-NEXT: s_waitcnt vmcnt(0)
482 ; VI-NEXT: v_mov_b32_e32 v0, s0
483 ; VI-NEXT: v_mov_b32_e32 v1, s1
484 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
485 ; VI-NEXT: v_add_u16_e32 v2, 32, v2
486 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
487 ; VI-NEXT: flat_store_dword v[0:1], v2
490 ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
492 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
493 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
494 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
495 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
496 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
497 ; GFX9-NEXT: s_waitcnt vmcnt(0)
498 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
499 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
500 ; GFX9-NEXT: s_endpgm
502 ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
504 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
505 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
506 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
507 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
508 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
509 ; GFX10-NEXT: s_waitcnt vmcnt(0)
510 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
511 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
512 ; GFX10-NEXT: s_endpgm
514 ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
516 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
517 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
518 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
519 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
520 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
522 ; GFX11-NEXT: s_waitcnt vmcnt(0)
523 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
524 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
525 ; GFX11-NEXT: s_endpgm
526 %tid = call i32 @llvm.amdgcn.workitem.id.x()
527 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
528 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
529 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
530 %add = add <2 x i16> %a, <i16 32, i16 0>
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly.
531 store <2 x i16> %add, ptr addrspace(1) %out
535 ; The high element (16256 = 0x3f80) is the upper half of the 32-bit fp inline constant 1.0 (0x3f800000).
; Adds <0, 16256> to a per-workitem <2 x i16> load. 16256 is 0x3f80, so
; the packed constant is 0x3f800000; the packed targets are expected to
; encode it as the operand 1.0 of v_pk_add_u16. On tonga only the high
; half needs an add, with 0x3f80 materialised in a VGPR.
536 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
537 ; VI-LABEL: v_test_add_v2i16_inline_fp_split:
539 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
540 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
541 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
542 ; VI-NEXT: s_waitcnt lgkmcnt(0)
543 ; VI-NEXT: v_mov_b32_e32 v1, s3
544 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
545 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
546 ; VI-NEXT: flat_load_dword v2, v[0:1] glc
547 ; VI-NEXT: s_waitcnt vmcnt(0)
548 ; VI-NEXT: v_mov_b32_e32 v0, s0
549 ; VI-NEXT: v_mov_b32_e32 v1, s1
550 ; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
551 ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
552 ; VI-NEXT: flat_store_dword v[0:1], v2
555 ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
557 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
558 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
559 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
560 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
561 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
562 ; GFX9-NEXT: s_waitcnt vmcnt(0)
563 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
564 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
565 ; GFX9-NEXT: s_endpgm
567 ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
569 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
570 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
571 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
572 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
573 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
574 ; GFX10-NEXT: s_waitcnt vmcnt(0)
575 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
576 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
577 ; GFX10-NEXT: s_endpgm
579 ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
581 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
582 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
583 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
584 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
585 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
586 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
587 ; GFX11-NEXT: s_waitcnt vmcnt(0)
588 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0
589 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
590 ; GFX11-NEXT: s_endpgm
591 %tid = call i32 @llvm.amdgcn.workitem.id.x()
592 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
593 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
594 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
595 %add = add <2 x i16> %a, <i16 0, i16 16256>
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly.
596 store <2 x i16> %add, ptr addrspace(1) %out
600 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-workitem <2 x i16> add whose result is zero-extended to <2 x i32>
; before being stored. The packed targets are expected to split the
; v_pk_add_u16 result with a 16-bit right shift (high element) and an AND
; with 0xffff (low element); tonga's two 16-bit adds need no extra masking.
601 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
602 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
604 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
605 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
606 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
607 ; VI-NEXT: s_waitcnt lgkmcnt(0)
608 ; VI-NEXT: v_mov_b32_e32 v1, s3
609 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
610 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
611 ; VI-NEXT: v_mov_b32_e32 v3, s5
612 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
613 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
614 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
615 ; VI-NEXT: s_waitcnt vmcnt(0)
616 ; VI-NEXT: flat_load_dword v3, v[2:3] glc
617 ; VI-NEXT: s_waitcnt vmcnt(0)
618 ; VI-NEXT: v_mov_b32_e32 v0, s0
619 ; VI-NEXT: v_mov_b32_e32 v1, s1
620 ; VI-NEXT: v_add_u16_e32 v2, v4, v3
621 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
622 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
625 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32:
627 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
628 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
629 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
630 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
631 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
632 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
633 ; GFX9-NEXT: s_waitcnt vmcnt(0)
634 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
635 ; GFX9-NEXT: s_waitcnt vmcnt(0)
636 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
637 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
638 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
639 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
640 ; GFX9-NEXT: s_endpgm
642 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32:
644 ; GFX10-NEXT: s_clause 0x1
645 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
646 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
647 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
648 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
649 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
650 ; GFX10-NEXT: s_waitcnt vmcnt(0)
651 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
652 ; GFX10-NEXT: s_waitcnt vmcnt(0)
653 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
654 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
655 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
656 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
657 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
658 ; GFX10-NEXT: s_endpgm
660 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
662 ; GFX11-NEXT: s_clause 0x1
663 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
664 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
665 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
666 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
667 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
668 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
669 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
671 ; GFX11-NEXT: s_waitcnt vmcnt(0)
672 ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
673 ; GFX11-NEXT: s_waitcnt vmcnt(0)
674 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
675 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
676 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
677 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
678 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
679 ; GFX11-NEXT: s_endpgm
680 %tid = call i32 @llvm.amdgcn.workitem.id.x()
681 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
682 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
683 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
684 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
685 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
686 %add = add <2 x i16> %a, %b
687 %ext = zext <2 x i16> %add to <2 x i32>
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly.
688 store <2 x i32> %ext, ptr addrspace(1) %out
692 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-workitem <2 x i16> add whose result is zero-extended to <2 x i64>
; and stored as 128 bits. The upper dword of each 64-bit element is an
; explicit zero register; the packed targets extract the high element
; with v_alignbit_b32 and the low element with an AND with 0xffff.
693 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
694 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
696 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
697 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
698 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
699 ; VI-NEXT: s_waitcnt lgkmcnt(0)
700 ; VI-NEXT: v_mov_b32_e32 v1, s3
701 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
702 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
703 ; VI-NEXT: v_mov_b32_e32 v3, s5
704 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
705 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
706 ; VI-NEXT: flat_load_dword v6, v[0:1] glc
707 ; VI-NEXT: s_waitcnt vmcnt(0)
708 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
709 ; VI-NEXT: s_waitcnt vmcnt(0)
710 ; VI-NEXT: v_mov_b32_e32 v1, 0
711 ; VI-NEXT: v_mov_b32_e32 v4, s0
712 ; VI-NEXT: v_mov_b32_e32 v5, s1
713 ; VI-NEXT: v_mov_b32_e32 v3, v1
714 ; VI-NEXT: v_add_u16_e32 v0, v6, v2
715 ; VI-NEXT: v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
716 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
719 ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64:
721 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
722 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
723 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
724 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
725 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
726 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
728 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
729 ; GFX9-NEXT: s_waitcnt vmcnt(0)
730 ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3
731 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16
732 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
733 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
734 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
735 ; GFX9-NEXT: s_endpgm
737 ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64:
739 ; GFX10-NEXT: s_clause 0x1
740 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
741 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
742 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
743 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
744 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
745 ; GFX10-NEXT: s_waitcnt vmcnt(0)
746 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
747 ; GFX10-NEXT: s_waitcnt vmcnt(0)
748 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
749 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
750 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16
751 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
752 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
753 ; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
754 ; GFX10-NEXT: s_endpgm
756 ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
758 ; GFX11-NEXT: s_clause 0x1
759 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
760 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
761 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
762 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
763 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
764 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
765 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
766 ; GFX11-NEXT: s_waitcnt vmcnt(0)
767 ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
768 ; GFX11-NEXT: s_waitcnt vmcnt(0)
769 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
770 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
771 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
772 ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
773 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
774 ; GFX11-NEXT: global_store_b128 v1, v[0:3], s[0:1]
775 ; GFX11-NEXT: s_endpgm
776 %tid = call i32 @llvm.amdgcn.workitem.id.x()
777 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
778 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
779 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
780 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
781 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
782 %add = add <2 x i16> %a, %b
783 %ext = zext <2 x i16> %add to <2 x i64>
; NOTE: %gep.out is computed above but never used; the store writes to %out
; directly.
784 store <2 x i64> %ext, ptr addrspace(1) %out
788 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel test: adds two <2 x i16> vectors and sign-extends the sum to
; <2 x i32>. The inputs are read with volatile loads from %in0 and %in1,
; indexed by the work-item id, and the extended result is stored through %out.
; Per the assertions below, the VI run performs two separate 16-bit adds and
; sign-extends each with v_bfe_i32, while the GFX9, GFX10 and GFX11 runs use a
; single v_pk_add_u16 followed by v_ashrrev_i32 (high half) and v_bfe_i32
; (low half).
789 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
790 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
792 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
793 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
794 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
795 ; VI-NEXT: s_waitcnt lgkmcnt(0)
796 ; VI-NEXT: v_mov_b32_e32 v1, s3
797 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
798 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
799 ; VI-NEXT: v_mov_b32_e32 v3, s5
800 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
801 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
802 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
803 ; VI-NEXT: s_waitcnt vmcnt(0)
804 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
805 ; VI-NEXT: s_waitcnt vmcnt(0)
806 ; VI-NEXT: v_mov_b32_e32 v0, s0
807 ; VI-NEXT: v_mov_b32_e32 v1, s1
808 ; VI-NEXT: v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
809 ; VI-NEXT: v_add_u16_e32 v2, v4, v2
810 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
811 ; VI-NEXT: v_bfe_i32 v3, v3, 0, 16
812 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
815 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32:
817 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
818 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
819 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
820 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
821 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
822 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
823 ; GFX9-NEXT: s_waitcnt vmcnt(0)
824 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
825 ; GFX9-NEXT: s_waitcnt vmcnt(0)
826 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
827 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
828 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
829 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
830 ; GFX9-NEXT: s_endpgm
832 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32:
834 ; GFX10-NEXT: s_clause 0x1
835 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
836 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
837 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
838 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
839 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
840 ; GFX10-NEXT: s_waitcnt vmcnt(0)
841 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
842 ; GFX10-NEXT: s_waitcnt vmcnt(0)
843 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
844 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
845 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0
846 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
847 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
848 ; GFX10-NEXT: s_endpgm
850 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32:
852 ; GFX11-NEXT: s_clause 0x1
853 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
854 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
855 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
856 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
857 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
858 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
859 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
860 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
861 ; GFX11-NEXT: s_waitcnt vmcnt(0)
862 ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
863 ; GFX11-NEXT: s_waitcnt vmcnt(0)
864 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
865 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
866 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0
867 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
868 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
869 ; GFX11-NEXT: s_endpgm
870 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed here but the store at the end of this
; body writes to %out directly, so this value is unused in the visible code.
871 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
872 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
873 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; Both loads are volatile; in every run's assertions above each load carries
; glc and is followed by its own s_waitcnt vmcnt(0).
874 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
875 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
876 %add = add <2 x i16> %a, %b
; The sign extension of the packed sum is the operation under test.
877 %ext = sext <2 x i16> %add to <2 x i32>
878 store <2 x i32> %ext, ptr addrspace(1) %out
882 ; FIXME: Need to handle the non-uniform case for the function below (load without gep).
; Kernel test: adds two <2 x i16> vectors and sign-extends the sum to
; <2 x i64>. The inputs are read with non-volatile loads from %in0 and %in1,
; indexed by the work-item id, and the extended result is stored through %out.
; Per the assertions below, the two loads are issued back to back and share a
; single s_waitcnt vmcnt(0). Each 16-bit sum is sign-extended with v_bfe_i32,
; and the upper dword of each 64-bit element comes from v_ashrrev_i32 by 31.
; The VI run uses two separate 16-bit adds; the GFX9, GFX10 and GFX11 runs use
; one v_pk_add_u16.
883 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
884 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
886 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
887 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
888 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
889 ; VI-NEXT: s_waitcnt lgkmcnt(0)
890 ; VI-NEXT: v_mov_b32_e32 v1, s3
891 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
892 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
893 ; VI-NEXT: v_mov_b32_e32 v3, s5
894 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
895 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
896 ; VI-NEXT: flat_load_dword v0, v[0:1]
897 ; VI-NEXT: flat_load_dword v1, v[2:3]
898 ; VI-NEXT: v_mov_b32_e32 v4, s0
899 ; VI-NEXT: v_mov_b32_e32 v5, s1
900 ; VI-NEXT: s_waitcnt vmcnt(0)
901 ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
902 ; VI-NEXT: v_add_u16_e32 v0, v0, v1
903 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
904 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
905 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
906 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
907 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
910 ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64:
912 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
913 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
914 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
915 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
916 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
917 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
918 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
919 ; GFX9-NEXT: s_waitcnt vmcnt(0)
920 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
921 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
922 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
923 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
924 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
925 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
926 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
927 ; GFX9-NEXT: s_endpgm
929 ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64:
931 ; GFX10-NEXT: s_clause 0x1
932 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
933 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
934 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
935 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
936 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
937 ; GFX10-NEXT: s_clause 0x1
938 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
939 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
940 ; GFX10-NEXT: s_waitcnt vmcnt(0)
941 ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
942 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
943 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
944 ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16
945 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
946 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
947 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
948 ; GFX10-NEXT: s_endpgm
950 ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64:
952 ; GFX11-NEXT: s_clause 0x1
953 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
954 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
955 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
956 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
957 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
958 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
959 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
960 ; GFX11-NEXT: s_clause 0x1
961 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
962 ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
963 ; GFX11-NEXT: s_waitcnt vmcnt(0)
964 ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
965 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
966 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
967 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
968 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
969 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
970 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
971 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
972 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
973 ; GFX11-NEXT: s_endpgm
974 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed here but the store at the end of this
; body writes to %out directly, so this value is unused in the visible code.
975 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
976 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
977 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; These loads are not volatile.
978 %a = load <2 x i16>, ptr addrspace(1) %gep.in0
979 %b = load <2 x i16>, ptr addrspace(1) %gep.in1
980 %add = add <2 x i16> %a, %b
; The sign extension of the packed sum to 64-bit elements is the operation
; under test.
981 %ext = sext <2 x i16> %add to <2 x i64>
982 store <2 x i64> %ext, ptr addrspace(1) %out
; Checks codegen for adding the constant vector <i16 -1, i16 0> to a
; <2 x i16> argument. Per the assertions below, the VI run masks off the high
; half, adds -1 to the low 16 bits and ORs the two back together, while the
; GFX9, GFX10 and GFX11 runs emit a single v_pk_sub_u16 with the immediate 1.
986 define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
987 ; VI-LABEL: add_inline_imm_neg1_0:
989 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
990 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
991 ; VI-NEXT: v_add_u16_e32 v0, -1, v0
992 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
993 ; VI-NEXT: s_setpc_b64 s[30:31]
995 ; GFX9-LABEL: add_inline_imm_neg1_0:
997 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
998 ; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1
999 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1001 ; GFX10-LABEL: add_inline_imm_neg1_0:
1003 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1
1005 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1007 ; GFX11-LABEL: add_inline_imm_neg1_0:
1009 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010 ; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1
1011 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Only element 0 of the constant operand is non-zero.
1012 %y = add <2 x i16> %x, <i16 -1, i16 0>
; Checks codegen for adding the constant vector <i16 1, i16 0> to a
; <2 x i16> argument. Per the assertions below, the VI run masks off the high
; half, adds 1 to the low 16 bits and ORs the two back together, while the
; GFX9, GFX10 and GFX11 runs emit a single v_pk_add_u16 with the immediate 1.
1016 define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
1017 ; VI-LABEL: add_inline_imm_1_0:
1019 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1020 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
1021 ; VI-NEXT: v_add_u16_e32 v0, 1, v0
1022 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1023 ; VI-NEXT: s_setpc_b64 s[30:31]
1025 ; GFX9-LABEL: add_inline_imm_1_0:
1027 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1
1029 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1031 ; GFX10-LABEL: add_inline_imm_1_0:
1033 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1034 ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1
1035 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1037 ; GFX11-LABEL: add_inline_imm_1_0:
1039 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040 ; GFX11-NEXT: v_pk_add_u16 v0, v0, 1
1041 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Only element 0 of the constant operand is non-zero.
1042 %y = add <2 x i16> %x, <i16 1, i16 0>
; Work-item id intrinsic called by the amdgpu_kernel tests in this file to
; index their input pointers.
1046 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Attribute group #0 is attached to the intrinsic declaration above.
1048 attributes #0 = { nounwind readnone }
; Attribute group #1 is referenced by the amdgpu_kernel definitions.
1049 attributes #1 = { nounwind }